From aa852bbe67a7dc9018afd7d1349184f0284d215c Mon Sep 17 00:00:00 2001
From: CNugteren <web@cedricnugteren.nl>
Date: Sun, 12 Jul 2015 16:57:09 +0200
Subject: Added subfolders for the level1/2/3 routines

---
 src/routines/level1/xaxpy.cc  | 115 +++++++++++++++++++++++++++
 src/routines/level2/xgemv.cc  | 146 ++++++++++++++++++++++++++++++++++
 src/routines/level3/xgemm.cc  | 172 ++++++++++++++++++++++++++++++++++++++++
 src/routines/level3/xhemm.cc  | 130 ++++++++++++++++++++++++++++++
 src/routines/level3/xher2k.cc | 178 ++++++++++++++++++++++++++++++++++++++++++
 src/routines/level3/xherk.cc  | 156 ++++++++++++++++++++++++++++++++++++
 src/routines/level3/xsymm.cc  | 132 +++++++++++++++++++++++++++++++
 src/routines/level3/xsyr2k.cc | 166 +++++++++++++++++++++++++++++++++++++++
 src/routines/level3/xsyrk.cc  | 147 ++++++++++++++++++++++++++++++++++
 src/routines/level3/xtrmm.cc  | 135 ++++++++++++++++++++++++++++++++
 src/routines/xaxpy.cc         | 115 ---------------------------
 src/routines/xgemm.cc         | 172 ----------------------------------------
 src/routines/xgemv.cc         | 146 ----------------------------------
 src/routines/xhemm.cc         | 130 ------------------------------
 src/routines/xher2k.cc        | 178 ------------------------------------------
 src/routines/xherk.cc         | 156 ------------------------------------
 src/routines/xsymm.cc         | 132 -------------------------------
 src/routines/xsyr2k.cc        | 166 ---------------------------------------
 src/routines/xsyrk.cc         | 147 ----------------------------------
 src/routines/xtrmm.cc         | 135 --------------------------------
 20 files changed, 1477 insertions(+), 1477 deletions(-)
 create mode 100644 src/routines/level1/xaxpy.cc
 create mode 100644 src/routines/level2/xgemv.cc
 create mode 100644 src/routines/level3/xgemm.cc
 create mode 100644 src/routines/level3/xhemm.cc
 create mode 100644 src/routines/level3/xher2k.cc
 create mode 100644 src/routines/level3/xherk.cc
 create mode 100644 src/routines/level3/xsymm.cc
 create mode 100644 src/routines/level3/xsyr2k.cc
 create mode 100644 src/routines/level3/xsyrk.cc
 create mode 100644 src/routines/level3/xtrmm.cc
 delete mode 100644 src/routines/xaxpy.cc
 delete mode 100644 src/routines/xgemm.cc
 delete mode 100644 src/routines/xgemv.cc
 delete mode 100644 src/routines/xhemm.cc
 delete mode 100644 src/routines/xher2k.cc
 delete mode 100644 src/routines/xherk.cc
 delete mode 100644 src/routines/xsymm.cc
 delete mode 100644 src/routines/xsyr2k.cc
 delete mode 100644 src/routines/xsyrk.cc
 delete mode 100644 src/routines/xtrmm.cc

(limited to 'src/routines')
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
new file mode 100644
index 00000000..fba36851
--- /dev/null
+++ b/src/routines/level1/xaxpy.cc
@@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xaxpy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level1/xaxpy.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
+template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
+template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Xaxpy"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
+                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (n == 0) { return StatusCode::kInvalidDimension; }
+
+  // Tests the vectors for validity
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines whether or not the fast-version can be used
+  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
+                         (y_offset == 0) && (y_inc == 1) &&
+                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+
+  // If possible, run the fast-version of the kernel
+  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
+
+  // Retrieves the Xaxpy kernel from the compiled binary
+  try {
+    auto& program = GetProgramFromCache();
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(n));
+      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(2, x_buffer());
+      kernel.SetArgument(3, y_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(n));
+      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(2, x_buffer());
+      kernel.SetArgument(3, static_cast<int>(x_offset));
+      kernel.SetArgument(4, static_cast<int>(x_inc));
+      kernel.SetArgument(5, y_buffer());
+      kernel.SetArgument(6, static_cast<int>(y_offset));
+      kernel.SetArgument(7, static_cast<int>(y_inc));
+    }
+
+    // Launches the kernel
+    if (use_fast_kernel) {
+      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
+      auto local = std::vector<size_t>{db_["WGS"]};
+      status = RunKernel(kernel, global, local);
+    }
+    else {
+      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
+      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
+      auto local = std::vector<size_t>{db_["WGS"]};
+      status = RunKernel(kernel, global, local);
+    }
+    if (ErrorIn(status)) { return status; }
+
+    // Waits for all kernels to finish
+    queue_.Finish();
+
+    // Succesfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xaxpy<float>;
+template class Xaxpy<double>;
+template class Xaxpy<float2>;
+template class Xaxpy<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
new file mode 100644
index 00000000..181337b6
--- /dev/null
+++ b/src/routines/level2/xgemv.cc
@@ -0,0 +1,146 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemv class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level2/xgemv.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
+template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
+template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Xgemv"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+                            const T beta,
+                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+
+  // Makes sure all dimensions are larger than zero
+  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrix has an alternative layout (row or column-major).
+  auto a_altlayout = (layout == Layout::kRowMajor);
+  auto a_one = (a_altlayout) ? n : m;
+  auto a_two = (a_altlayout) ? m : n;
+
+  // Swap m and n if the matrix is transposed
+  auto a_transposed = (a_transpose != Transpose::kNo);
+  auto m_real = (a_transposed) ? n : m;
+  auto n_real = (a_transposed) ? m : n;
+
+  // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
+  auto a_rotated = a_transposed ^ a_altlayout;
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  auto a_conjugate = (a_transpose == Transpose::kConjugate);
+
+  // Tests the matrix and the vectors for validity
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines whether or not the fast-version can be used
+  bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
+                         IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
+                         IsMultiple(n, db_["WGS2"]) &&
+                         IsMultiple(a_ld, db_["VW2"]);
+  bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
+                             IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
+                             IsMultiple(n, db_["WGS3"]) &&
+                             IsMultiple(a_ld, db_["VW3"]);
+
+  // If possible, run the fast-version (rotated or non-rotated) of the kernel
+  auto kernel_name = "Xgemv";
+  auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
+  auto global_size = m_ceiled / db_["WPT1"];
+  auto local_size = db_["WGS1"];
+  if (use_fast_kernel) {
+    kernel_name = "XgemvFast";
+    global_size = m_real / db_["WPT2"];
+    local_size = db_["WGS2"];
+  }
+  if (use_fast_kernel_rot) {
+    kernel_name = "XgemvFastRot";
+    global_size = m_real / db_["WPT3"];
+    local_size = db_["WGS3"];
+  }
+
+  // Retrieves the Xgemv kernel from the compiled binary
+  try {
+    auto& program = GetProgramFromCache();
+    auto kernel = Kernel(program, kernel_name);
+
+    // Sets the kernel arguments
+    kernel.SetArgument(0, static_cast<int>(m_real));
+    kernel.SetArgument(1, static_cast<int>(n_real));
+    kernel.SetArgument(2, alpha);
+    kernel.SetArgument(3, beta);
+    kernel.SetArgument(4, static_cast<int>(a_rotated));
+    kernel.SetArgument(5, a_buffer());
+    kernel.SetArgument(6, static_cast<int>(a_offset));
+    kernel.SetArgument(7, static_cast<int>(a_ld));
+    kernel.SetArgument(8, x_buffer());
+    kernel.SetArgument(9, static_cast<int>(x_offset));
+    kernel.SetArgument(10, static_cast<int>(x_inc));
+    kernel.SetArgument(11, y_buffer());
+    kernel.SetArgument(12, static_cast<int>(y_offset));
+    kernel.SetArgument(13, static_cast<int>(y_inc));
+    kernel.SetArgument(14, static_cast<int>(a_conjugate));
+
+    // Launches the kernel
+    auto global = std::vector<size_t>{global_size};
+    auto local = std::vector<size_t>{local_size};
+    status = RunKernel(kernel, global, local);
+    if (ErrorIn(status)) { return status; }
+
+    // Waits for all kernels to finish
+    queue_.Finish();
+
+    // Succesfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgemv<float>;
+template class Xgemv<double>;
+template class Xgemv<float2>;
+template class Xgemv<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
new file mode 100644
index 00000000..f4a9f737
--- /dev/null
+++ b/src/routines/level3/xgemm.cc
@@ -0,0 +1,172 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xgemm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xgemm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
+template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
+template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xgemm<T>::DoGemm(const Layout layout,
+                            const Transpose a_transpose, const Transpose b_transpose,
+                            const size_t m, const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed. Note
+  // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
+  // col-major) to be transformed, so transposing requirements are not the same as whether or not
+  // the matrix is actually transposed in memory.
+  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+  auto a_do_transpose =  a_rotated;
+  auto b_do_transpose = !b_rotated;
+  auto c_do_transpose =  c_rotated;
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  auto a_conjugate = (a_transpose == Transpose::kConjugate);
+  auto b_conjugate = (b_transpose == Transpose::kConjugate);
+
+  // Computes the first and second dimensions of the 3 matrices taking into account whether the
+  // matrices are rotated or not
+  auto a_one = (a_rotated) ? k : m;
+  auto a_two = (a_rotated) ? m : k;
+  auto b_one = (b_rotated) ? n : k;
+  auto b_two = (b_rotated) ? k : n;
+  auto c_one = (c_rotated) ? n : m;
+  auto c_two = (c_rotated) ? m : n;
+
+  // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than K when rotated, or less than M when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N when rotated, or less than M when not-rotated
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of m, n, and k
+  auto m_ceiled = Ceil(m, db_["MWG"]);
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
+    // them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
+                                    a_do_transpose, a_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    b_do_transpose, b_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Only necessary for matrix C if it used both as input and output
+    if (beta != static_cast<T>(0)) {
+      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
+                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+                                      c_do_transpose, false, true, false, false, false, program);
+      if (ErrorIn(status)) { return status; }
+    }
+
+    // Retrieves the Xgemm kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, "Xgemm");
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(m_ceiled));
+      kernel.SetArgument(1, static_cast<int>(n_ceiled));
+      kernel.SetArgument(2, static_cast<int>(k_ceiled));
+      kernel.SetArgument(3, alpha);
+      kernel.SetArgument(4, beta);
+      kernel.SetArgument(5, temp_a());
+      kernel.SetArgument(6, temp_b());
+      kernel.SetArgument(7, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (m_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
+                                      c_one, c_two, c_ld, c_offset, c_buffer,
+                                      c_do_transpose, false, false, false, false, false, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xgemm<float>;
+template class Xgemm<double>;
+template class Xgemm<float2>;
+template class Xgemm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
new file mode 100644
index 00000000..bc257c44
--- /dev/null
+++ b/src/routines/level3/xhemm.cc
@@ -0,0 +1,130 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xhemm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xhemm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
+  // left) or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the squared A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
+
+  // Temporary buffer for a copy of the hermitian matrix
+  try {
+    auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the hermitian-to-squared kernel
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_herm());
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // hermitian-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with either "C := AB+C" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_herm, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+      }
+
+      // ... with "C := BA+C". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_herm, 0, k,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+        }
+      }
+
+      // Return the status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xhemm<float2>;
+template class Xhemm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
new file mode 100644
index 00000000..6d33a0e1
--- /dev/null
+++ b/src/routines/level3/xher2k.cc
@@ -0,0 +1,178 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xher2k class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xher2k.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T, typename U>
+Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T, typename U>
+StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                                const size_t n, const size_t k,
+                                const T alpha,
+                                const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                                const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                                const U beta,
+                                const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
+  // to matrix A (argument: conjugate transpose)
+  auto ab_conjugate = (ab_transpose != Transpose::kNo);
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
+                    (layout == Layout::kRowMajor && !ab_conjugate);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A and B matrices taking the layout into account
+  auto ab_one = (ab_rotated) ? k : n;
+  auto ab_two = (ab_rotated) ? n : k;
+
+  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a1,
+                                    ab_rotated, ab_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a2,
+                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b1,
+                                    ab_rotated, ab_conjugate, true, false, false, false, program);
+    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b2,
+                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      auto complex_beta = T{beta, static_cast<U>(0.0)};
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(4, temp_a1());
+      kernel.SetArgument(5, temp_b2());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
+      auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
+      auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
+      kernel.SetArgument(2, conjugate_alpha);
+      kernel.SetArgument(3, complex_one);
+      kernel.SetArgument(4, temp_b1());
+      kernel.SetArgument(5, temp_a2());
+
+      // Runs the kernel again
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, true, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xher2k<float2,float>;
+template class Xher2k<double2,double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
new file mode 100644
index 00000000..8fae294f
--- /dev/null
+++ b/src/routines/level3/xherk.cc
@@ -0,0 +1,156 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xherk class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xherk.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T, typename U>
+Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T, typename U>
+StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                              const size_t n, const size_t k,
+                              const U alpha,
+                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                              const U beta,
+                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
+  // to matrix A (argument: conjugate transpose)
+  auto a_conjugate = (a_transpose != Transpose::kNo);
+  auto b_conjugate = (a_transpose == Transpose::kNo);
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
+                   (layout == Layout::kRowMajor && !a_conjugate);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A matrix taking the layout into account
+  auto a_one = (a_rotated) ? k : n;
+  auto a_two = (a_rotated) ? n : k;
+
+  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
+    // fill it up until it reaches a certain multiple of size (kernel parameter dependent). It
+    // creates two copies: 
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    a_rotated, a_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    a_rotated, b_conjugate, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+      auto complex_beta = T{beta, static_cast<U>(0.0)};
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, complex_alpha);
+      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_b());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, true, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xherk<float2,float>;
+template class Xherk<double2,double>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc
new file mode 100644
index 00000000..1d17f0eb
--- /dev/null
+++ b/src/routines/level3/xsymm.cc
@@ -0,0 +1,132 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsymm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsymm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
+  // left) or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the squared A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
+
+  // Temporary buffer for a copy of the symmetric matrix
+  try {
+    auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the symmetric-to-squared kernel
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_symm());
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // symmetric-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with either "C := AB+C" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_symm, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+      }
+
+      // ... with "C := BA+C". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_symm, 0, k,
+                        beta,
+                        c_buffer, c_offset, c_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+        }
+      }
+
+      // Return the status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsymm<float>;
+template class Xsymm<double>;
+template class Xsymm<float2>;
+template class Xsymm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
new file mode 100644
index 00000000..d54f2fc1
--- /dev/null
+++ b/src/routines/level3/xsyr2k.cc
@@ -0,0 +1,166 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyr2k class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsyr2k.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                              const size_t n, const size_t k,
+                              const T alpha,
+                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                              const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+                              const T beta,
+                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
+                    (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A and B matrices taking the layout into account
+  auto ab_one = (ab_rotated) ? k : n;
+  auto ab_two = (ab_rotated) ? n : k;
+
+  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
+    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    ab_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
+                                    ab_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_b());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Swaps the arguments for matrices A and B, and sets 'beta' to 1
+      auto one = static_cast<T>(1);
+      kernel.SetArgument(3, one);
+      kernel.SetArgument(4, temp_b());
+      kernel.SetArgument(5, temp_a());
+
+      // Runs the kernel again
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, false, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsyr2k<float>;
+template class Xsyr2k<double>;
+template class Xsyr2k<float2>;
+template class Xsyr2k<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
new file mode 100644
index 00000000..bb952410
--- /dev/null
+++ b/src/routines/level3/xsyrk.cc
@@ -0,0 +1,147 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xsyrk class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xsyrk.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
+template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
+template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
+    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                            const size_t n, const size_t k,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const T beta,
+                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
+
+  // Computes whether or not the matrices are transposed in memory. This is based on their layout
+  // (row or column-major) and whether or not they are requested to be pre-transposed.
+  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  auto c_rotated = (layout == Layout::kRowMajor);
+
+  // Computes the first and second dimensions of the A matrix taking the layout into account
+  auto a_one = (a_rotated) ? k : n;
+  auto a_two = (a_rotated) ? n : k;
+
+  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
+  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
+  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
+  // space. Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
+  //    matrix C cannot be less than N
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Calculates the ceiled versions of n and k
+  auto n_ceiled = Ceil(n, db_["NWG"]);
+  auto k_ceiled = Ceil(k, db_["KWG"]);
+
+  // Decides which kernel to run: the upper-triangular or lower-triangular version
+  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
+
+  // Allocates space on the device for padded and/or transposed input and output matrices.
+  try {
+    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
+    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+
+    // Loads the program from the database
+    auto& program = GetProgramFromCache();
+
+    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
+    // fill it up until it reaches a certain multiple of size (kernel parameter dependent).
+    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
+                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
+                                    a_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
+    // modify the other triangle.
+    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
+                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                    c_rotated, false, true, false, false, false, program);
+    if (ErrorIn(status)) { return status; }
+
+    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
+    try {
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the kernel arguments
+      kernel.SetArgument(0, static_cast<int>(n_ceiled));
+      kernel.SetArgument(1, static_cast<int>(k_ceiled));
+      kernel.SetArgument(2, alpha);
+      kernel.SetArgument(3, beta);
+      kernel.SetArgument(4, temp_a());
+      kernel.SetArgument(5, temp_a());
+      kernel.SetArgument(6, temp_c());
+
+      // Computes the global and local thread sizes
+      auto global = std::vector<size_t>{
+        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
+        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+      };
+      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+
+      // Launches the kernel
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the post-processing kernel
+      auto upper = (triangle == Triangle::kUpper);
+      auto lower = (triangle == Triangle::kLower);
+      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
+                                      n, n, c_ld, c_offset, c_buffer,
+                                      c_rotated, false, false, upper, lower, false, program);
+      if (ErrorIn(status)) { return status; }
+
+      // Successfully finished the computation
+      return StatusCode::kSuccess;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xsyrk<float>;
+template class Xsyrk<double>;
+template class Xsyrk<float2>;
+template class Xsyrk<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
new file mode 100644
index 00000000..52f272e3
--- /dev/null
+++ b/src/routines/level3/xtrmm.cc
@@ -0,0 +1,135 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xtrmm class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/level3/xtrmm.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
+    Xgemm<T>(queue, event) {
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t m, const size_t n,
+                            const T alpha,
+                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Computes the k dimension. This is based on whether or not matrix is A (on the left)
+  // or B (on the right) in the Xgemm routine.
+  auto k = (side == Side::kLeft) ? m : n;
+
+  // Checks for validity of the triangular A matrix
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
+  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
+
+  // Determines whether or not the triangular matrix is unit-diagonal
+  auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
+
+  // Temporary buffer for a copy of the triangular matrix
+  try {
+    auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+
+    // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
+    // routine afterwards
+    try {
+      auto& program = GetProgramFromCache();
+      auto kernel = Kernel(program, kernel_name);
+
+      // Sets the arguments for the triangular-to-squared kernel
+      kernel.SetArgument(0, static_cast<int>(k));
+      kernel.SetArgument(1, static_cast<int>(a_ld));
+      kernel.SetArgument(2, static_cast<int>(a_offset));
+      kernel.SetArgument(3, a_buffer());
+      kernel.SetArgument(4, static_cast<int>(k));
+      kernel.SetArgument(5, static_cast<int>(k));
+      kernel.SetArgument(6, static_cast<int>(0));
+      kernel.SetArgument(7, temp_triangular());
+      kernel.SetArgument(8, static_cast<int>(unit_diagonal));
+
+      // Uses the common padding kernel's thread configuration. This is allowed, since the
+      // triangular-to-squared kernel uses the same parameters.
+      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
+                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
+      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
+      status = RunKernel(kernel, global, local);
+      if (ErrorIn(status)) { return status; }
+
+      // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
+      if (side == Side::kLeft) {
+        status = DoGemm(layout, a_transpose, Transpose::kNo,
+                        m, n, k,
+                        alpha,
+                        temp_triangular, 0, k,
+                        b_buffer, b_offset, b_ld,
+                        static_cast<T>(0.0),
+                        b_buffer, b_offset, b_ld);
+      }
+
+      // ... with "B := alpha*B*A". Note that A and B are now reversed.
+      else {
+        status = DoGemm(layout, Transpose::kNo, a_transpose,
+                        m, n, k,
+                        alpha,
+                        b_buffer, b_offset, b_ld,
+                        temp_triangular, 0, k,
+                        static_cast<T>(0.0),
+                        b_buffer, b_offset, b_ld);
+
+        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
+        switch(status) {
+          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
+          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
+          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
+          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
+          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
+          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
+        }
+      }
+
+      // Return the status of the Xgemm routine
+      return status;
+    } catch (...) { return StatusCode::kInvalidKernel; }
+  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xtrmm<float>;
+template class Xtrmm<double>;
+template class Xtrmm<float2>;
+template class Xtrmm<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/xaxpy.cc b/src/routines/xaxpy.cc
deleted file mode 100644
index b68458da..00000000
--- a/src/routines/xaxpy.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xaxpy class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xaxpy.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
-template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
-template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xaxpy"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
-                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
-
-  // Makes sure all dimensions are larger than zero
-  if (n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines whether or not the fast-version can be used
-  bool use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
-                         (y_offset == 0) && (y_inc == 1) &&
-                         IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
-
-  // If possible, run the fast-version of the kernel
-  auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
-
-  // Retrieves the Xaxpy kernel from the compiled binary
-  try {
-    auto& program = GetProgramFromCache();
-    auto kernel = Kernel(program, kernel_name);
-
-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, y_buffer());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
-      kernel.SetArgument(2, x_buffer());
-      kernel.SetArgument(3, static_cast<int>(x_offset));
-      kernel.SetArgument(4, static_cast<int>(x_inc));
-      kernel.SetArgument(5, y_buffer());
-      kernel.SetArgument(6, static_cast<int>(y_offset));
-      kernel.SetArgument(7, static_cast<int>(y_inc));
-    }
-
-    // Launches the kernel
-    if (use_fast_kernel) {
-      auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local);
-    }
-    else {
-      auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
-      auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
-      auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local);
-    }
-    if (ErrorIn(status)) { return status; }
-
-    // Waits for all kernels to finish
-    queue_.Finish();
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xaxpy<float>;
-template class Xaxpy<double>;
-template class Xaxpy<float2>;
-template class Xaxpy<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc
deleted file mode 100644
index c8674282..00000000
--- a/src/routines/xgemm.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xgemm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xgemm<T>::DoGemm(const Layout layout,
-                            const Transpose a_transpose, const Transpose b_transpose,
-                            const size_t m, const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed. Note
-  // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
-  // col-major) to be transformed, so transposing requirements are not the same as whether or not
-  // the matrix is actually transposed in memory.
-  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
-  auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
-  auto c_rotated = (layout == Layout::kRowMajor);
-  auto a_do_transpose =  a_rotated;
-  auto b_do_transpose = !b_rotated;
-  auto c_do_transpose =  c_rotated;
-
-  // In case of complex data-types, the transpose can also become a conjugate transpose
-  auto a_conjugate = (a_transpose == Transpose::kConjugate);
-  auto b_conjugate = (b_transpose == Transpose::kConjugate);
-
-  // Computes the first and second dimensions of the 3 matrices taking into account whether the
-  // matrices are rotated or not
-  auto a_one = (a_rotated) ? k : m;
-  auto a_two = (a_rotated) ? m : k;
-  auto b_one = (b_rotated) ? n : k;
-  auto b_two = (b_rotated) ? k : n;
-  auto c_one = (c_rotated) ? n : m;
-  auto c_two = (c_rotated) ? m : n;
-
-  // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than K when rotated, or less than M when not-rotated
-  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N when rotated, or less than M when not-rotated
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of m, n, and k
-  auto m_ceiled = Ceil(m, db_["MWG"]);
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
-    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernels. This transposes the matrices, but also pads zeros to fill
-    // them up until they reach a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
-                                    a_do_transpose, a_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    b_do_transpose, b_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Only necessary for matrix C if it used both as input and output
-    if (beta != static_cast<T>(0)) {
-      status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
-                                      m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_do_transpose, false, true, false, false, false, program);
-      if (ErrorIn(status)) { return status; }
-    }
-
-    // Retrieves the Xgemm kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, "Xgemm");
-
-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(m_ceiled));
-      kernel.SetArgument(1, static_cast<int>(n_ceiled));
-      kernel.SetArgument(2, static_cast<int>(k_ceiled));
-      kernel.SetArgument(3, alpha);
-      kernel.SetArgument(4, beta);
-      kernel.SetArgument(5, temp_a());
-      kernel.SetArgument(6, temp_b());
-      kernel.SetArgument(7, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (m_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_one, c_two, c_ld, c_offset, c_buffer,
-                                      c_do_transpose, false, false, false, false, false, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xgemm<float>;
-template class Xgemm<double>;
-template class Xgemm<float2>;
-template class Xgemm<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xgemv.cc b/src/routines/xgemv.cc
deleted file mode 100644
index 1868dec4..00000000
--- a/src/routines/xgemv.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xgemv class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xgemv.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Xgemv"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
-                            const T beta,
-                            const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
-
-  // Makes sure all dimensions are larger than zero
-  if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrix has an alternative layout (row or column-major).
-  auto a_altlayout = (layout == Layout::kRowMajor);
-  auto a_one = (a_altlayout) ? n : m;
-  auto a_two = (a_altlayout) ? m : n;
-
-  // Swap m and n if the matrix is transposed
-  auto a_transposed = (a_transpose != Transpose::kNo);
-  auto m_real = (a_transposed) ? n : m;
-  auto n_real = (a_transposed) ? m : n;
-
-  // Determines whether the kernel needs to perform rotated access ('^' is the XOR operator)
-  auto a_rotated = a_transposed ^ a_altlayout;
-
-  // In case of complex data-types, the transpose can also become a conjugate transpose
-  auto a_conjugate = (a_transpose == Transpose::kConjugate);
-
-  // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines whether or not the fast-version can be used
-  bool use_fast_kernel = (a_offset == 0) && (a_rotated == 0) && (a_conjugate == 0) &&
-                         IsMultiple(m, db_["WGS2"]*db_["WPT2"]) &&
-                         IsMultiple(n, db_["WGS2"]) &&
-                         IsMultiple(a_ld, db_["VW2"]);
-  bool use_fast_kernel_rot = (a_offset == 0) && (a_rotated == 1) && (a_conjugate == 0) &&
-                             IsMultiple(m, db_["WGS3"]*db_["WPT3"]) &&
-                             IsMultiple(n, db_["WGS3"]) &&
-                             IsMultiple(a_ld, db_["VW3"]);
-
-  // If possible, run the fast-version (rotated or non-rotated) of the kernel
-  auto kernel_name = "Xgemv";
-  auto m_ceiled = Ceil(m_real, db_["WGS1"]*db_["WPT1"]);
-  auto global_size = m_ceiled / db_["WPT1"];
-  auto local_size = db_["WGS1"];
-  if (use_fast_kernel) {
-    kernel_name = "XgemvFast";
-    global_size = m_real / db_["WPT2"];
-    local_size = db_["WGS2"];
-  }
-  if (use_fast_kernel_rot) {
-    kernel_name = "XgemvFastRot";
-    global_size = m_real / db_["WPT3"];
-    local_size = db_["WGS3"];
-  }
-
-  // Retrieves the Xgemv kernel from the compiled binary
-  try {
-    auto& program = GetProgramFromCache();
-    auto kernel = Kernel(program, kernel_name);
-
-    // Sets the kernel arguments
-    kernel.SetArgument(0, static_cast<int>(m_real));
-    kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, alpha);
-    kernel.SetArgument(3, beta);
-    kernel.SetArgument(4, static_cast<int>(a_rotated));
-    kernel.SetArgument(5, a_buffer());
-    kernel.SetArgument(6, static_cast<int>(a_offset));
-    kernel.SetArgument(7, static_cast<int>(a_ld));
-    kernel.SetArgument(8, x_buffer());
-    kernel.SetArgument(9, static_cast<int>(x_offset));
-    kernel.SetArgument(10, static_cast<int>(x_inc));
-    kernel.SetArgument(11, y_buffer());
-    kernel.SetArgument(12, static_cast<int>(y_offset));
-    kernel.SetArgument(13, static_cast<int>(y_inc));
-    kernel.SetArgument(14, static_cast<int>(a_conjugate));
-
-    // Launches the kernel
-    auto global = std::vector<size_t>{global_size};
-    auto local = std::vector<size_t>{local_size};
-    status = RunKernel(kernel, global, local);
-    if (ErrorIn(status)) { return status; }
-
-    // Waits for all kernels to finish
-    queue_.Finish();
-
-    // Succesfully finished the computation
-    return StatusCode::kSuccess;
-  } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xgemv<float>;
-template class Xgemv<double>;
-template class Xgemv<float2>;
-template class Xgemv<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xhemm.cc b/src/routines/xhemm.cc
deleted file mode 100644
index 73f769ed..00000000
--- a/src/routines/xhemm.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xhemm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xhemm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
-    Xgemm<T>(queue, event) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes the k dimension. This is based on whether or not the hermitian matrix is A (on the
-  // left) or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the hermitian matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "HermUpperToSquared" : "HermLowerToSquared";
-
-  // Temporary buffer for a copy of the hermitian matrix
-  try {
-    auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
-
-    // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the hermitian-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_herm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // hermitian-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemm code with either "C := AB+C" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_herm, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-      }
-
-      // ... with "C := BA+C". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_herm, 0, k,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xhemm<float2>;
-template class Xhemm<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xher2k.cc b/src/routines/xher2k.cc
deleted file mode 100644
index b19b743b..00000000
--- a/src/routines/xher2k.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xher2k class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xher2k.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T, typename U>
-Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T, typename U>
-StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                                const size_t n, const size_t k,
-                                const T alpha,
-                                const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                                const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                                const U beta,
-                                const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
-  // to matrix A (argument: conjugate transpose)
-  auto ab_conjugate = (ab_transpose != Transpose::kNo);
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto ab_rotated = (layout == Layout::kColMajor && ab_conjugate) ||
-                    (layout == Layout::kRowMajor && !ab_conjugate);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A and B matrices taking the layout into account
-  auto ab_one = (ab_rotated) ? k : n;
-  auto ab_two = (ab_rotated) ? n : k;
-
-  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b1 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_a2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b2 = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
-    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a1,
-                                    ab_rotated, ab_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a2,
-                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b1,
-                                    ab_rotated, ab_conjugate, true, false, false, false, program);
-    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b2,
-                                    ab_rotated, !ab_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      auto complex_beta = T{beta, static_cast<U>(0.0)};
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, complex_beta);
-      kernel.SetArgument(4, temp_a1());
-      kernel.SetArgument(5, temp_b2());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
-      auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
-      auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
-      kernel.SetArgument(2, conjugate_alpha);
-      kernel.SetArgument(3, complex_one);
-      kernel.SetArgument(4, temp_b1());
-      kernel.SetArgument(5, temp_a2());
-
-      // Runs the kernel again
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, true, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xher2k<float2,float>;
-template class Xher2k<double2,double>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xherk.cc b/src/routines/xherk.cc
deleted file mode 100644
index 6bc9cd6c..00000000
--- a/src/routines/xherk.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xherk class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xherk.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T, typename U>
-Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T, typename U>
-StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                              const size_t n, const size_t k,
-                              const U alpha,
-                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                              const U beta,
-                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Determines whether to apply the conjugate transpose to matrix B (argument: no transpose) or
-  // to matrix A (argument: conjugate transpose)
-  auto a_conjugate = (a_transpose != Transpose::kNo);
-  auto b_conjugate = (a_transpose == Transpose::kNo);
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto a_rotated = (layout == Layout::kColMajor && a_conjugate) ||
-                   (layout == Layout::kRowMajor && !a_conjugate);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A matrix taking the layout into account
-  auto a_one = (a_rotated) ? k : n;
-  auto a_two = (a_rotated) ? n : k;
-
-  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
-    // fill it up until it reaches a certain multiple of size (kernel parameter dependent). It
-    // creates two copies: 
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
-                                    a_rotated, a_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    a_rotated, b_conjugate, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      auto complex_alpha = T{alpha, static_cast<U>(0.0)};
-      auto complex_beta = T{beta, static_cast<U>(0.0)};
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, complex_alpha);
-      kernel.SetArgument(3, complex_beta);
-      kernel.SetArgument(4, temp_a());
-      kernel.SetArgument(5, temp_b());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, true, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xherk<float2,float>;
-template class Xherk<double2,double>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xsymm.cc b/src/routines/xsymm.cc
deleted file mode 100644
index b39eb24d..00000000
--- a/src/routines/xsymm.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsymm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xsymm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
-    Xgemm<T>(queue, event) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes the k dimension. This is based on whether or not the symmetric matrix is A (on the
-  // left) or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
-
-  // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the symmetric matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "SymmUpperToSquared" : "SymmLowerToSquared";
-
-  // Temporary buffer for a copy of the symmetric matrix
-  try {
-    auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
-
-    // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the symmetric-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_symm());
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // symmetric-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemm code with either "C := AB+C" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_symm, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-      }
-
-      // ... with "C := BA+C". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_symm, 0, k,
-                        beta,
-                        c_buffer, c_offset, c_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xsymm<float>;
-template class Xsymm<double>;
-template class Xsymm<float2>;
-template class Xsymm<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xsyr2k.cc b/src/routines/xsyr2k.cc
deleted file mode 100644
index abb8b7eb..00000000
--- a/src/routines/xsyr2k.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyr2k class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xsyr2k.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
-template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
-template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
-                              const size_t n, const size_t k,
-                              const T alpha,
-                              const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                              const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
-                              const T beta,
-                              const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto ab_rotated = (layout == Layout::kColMajor && ab_transpose != Transpose::kNo) ||
-                    (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A and B matrices taking the layout into account
-  auto ab_one = (ab_rotated) ? k : n;
-  auto ab_two = (ab_rotated) ? n : k;
-
-  // Tests the matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix B cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_b = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to
-    // fill them up until they reach a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
-                                    ab_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-    status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    ab_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, beta);
-      kernel.SetArgument(4, temp_a());
-      kernel.SetArgument(5, temp_b());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Swaps the arguments for matrices A and B, and sets 'beta' to 1
-      auto one = static_cast<T>(1);
-      kernel.SetArgument(3, one);
-      kernel.SetArgument(4, temp_b());
-      kernel.SetArgument(5, temp_a());
-
-      // Runs the kernel again
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, false, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xsyr2k<float>;
-template class Xsyr2k<double>;
-template class Xsyr2k<float2>;
-template class Xsyr2k<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc
deleted file mode 100644
index 3efa0598..00000000
--- a/src/routines/xsyrk.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xsyrk class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xsyrk.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
-template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
-template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
-    Routine(queue, event, {"Copy", "Pad", "Transpose", "PadTranspose", "Xgemm"}, precision_) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
-                            const size_t n, const size_t k,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const T beta,
-                            const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
-
-  // Computes whether or not the matrices are transposed in memory. This is based on their layout
-  // (row or column-major) and whether or not they are requested to be pre-transposed.
-  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
-  auto c_rotated = (layout == Layout::kRowMajor);
-
-  // Computes the first and second dimensions of the A matrix taking the layout into account
-  auto a_one = (a_rotated) ? k : n;
-  auto a_two = (a_rotated) ? n : k;
-
-  // Tests the two matrices (A, C) for validity, first from a perspective of the OpenCL buffers and
-  // their sizes, and then from a perspective of parameter values (e.g. n, k). Tests whether the
-  // OpenCL buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage
-  // space. Also tests that the leading dimensions of:
-  //    matrix A cannot be less than N when rotated, or less than K when not-rotated
-  //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Calculates the ceiled versions of n and k
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
-
-  // Decides which kernel to run: the upper-triangular or lower-triangular version
-  auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower";
-
-  // Allocates space on the device for padded and/or transposed input and output matrices.
-  try {
-    auto temp_a = Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
-    auto temp_c = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
-
-    // Loads the program from the database
-    auto& program = GetProgramFromCache();
-
-    // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to
-    // fill it up until it reaches a certain multiple of size (kernel parameter dependent).
-    status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
-                                    n_ceiled, k_ceiled, n_ceiled, 0, temp_a,
-                                    a_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
-    // modify the other triangle.
-    status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer,
-                                    n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                    c_rotated, false, true, false, false, false, program);
-    if (ErrorIn(status)) { return status; }
-
-    // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary
-    try {
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the kernel arguments
-      kernel.SetArgument(0, static_cast<int>(n_ceiled));
-      kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, beta);
-      kernel.SetArgument(4, temp_a());
-      kernel.SetArgument(5, temp_a());
-      kernel.SetArgument(6, temp_c());
-
-      // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
-        (n_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
-      };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
-
-      // Launches the kernel
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the post-processing kernel
-      auto upper = (triangle == Triangle::kUpper);
-      auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c,
-                                      n, n, c_ld, c_offset, c_buffer,
-                                      c_rotated, false, false, upper, lower, false, program);
-      if (ErrorIn(status)) { return status; }
-
-      // Successfully finished the computation
-      return StatusCode::kSuccess;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xsyrk<float>;
-template class Xsyrk<double>;
-template class Xsyrk<float2>;
-template class Xsyrk<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routines/xtrmm.cc b/src/routines/xtrmm.cc
deleted file mode 100644
index 543df844..00000000
--- a/src/routines/xtrmm.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Xtrmm class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include "internal/routines/xtrmm.h"
-
-#include <string>
-#include <vector>
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: forwards to base class constructor
-template <typename T>
-Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
-    Xgemm<T>(queue, event) {
-}
-
-// =================================================================================================
-
-// The main routine
-template <typename T>
-StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
-                            const Transpose a_transpose, const Diagonal diagonal,
-                            const size_t m, const size_t n,
-                            const T alpha,
-                            const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
-                            const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
-
-  // Makes sure all dimensions are larger than zero
-  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
-
-  // Computes the k dimension. This is based on whether or not matrix is A (on the left)
-  // or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
-
-  // Checks for validity of the triangular A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
-  if (ErrorIn(status)) { return status; }
-
-  // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
-  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
-  bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                   (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
-
-  // Determines whether or not the triangular matrix is unit-diagonal
-  auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
-
-  // Temporary buffer for a copy of the triangular matrix
-  try {
-    auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
-
-    // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
-    // routine afterwards
-    try {
-      auto& program = GetProgramFromCache();
-      auto kernel = Kernel(program, kernel_name);
-
-      // Sets the arguments for the triangular-to-squared kernel
-      kernel.SetArgument(0, static_cast<int>(k));
-      kernel.SetArgument(1, static_cast<int>(a_ld));
-      kernel.SetArgument(2, static_cast<int>(a_offset));
-      kernel.SetArgument(3, a_buffer());
-      kernel.SetArgument(4, static_cast<int>(k));
-      kernel.SetArgument(5, static_cast<int>(k));
-      kernel.SetArgument(6, static_cast<int>(0));
-      kernel.SetArgument(7, temp_triangular());
-      kernel.SetArgument(8, static_cast<int>(unit_diagonal));
-
-      // Uses the common padding kernel's thread configuration. This is allowed, since the
-      // triangular-to-squared kernel uses the same parameters.
-      auto global = std::vector<size_t>{Ceil(CeilDiv(k, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                        Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-      auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-      status = RunKernel(kernel, global, local);
-      if (ErrorIn(status)) { return status; }
-
-      // Runs the regular Xgemm code with either "B := alpha*A*B" or ...
-      if (side == Side::kLeft) {
-        status = DoGemm(layout, a_transpose, Transpose::kNo,
-                        m, n, k,
-                        alpha,
-                        temp_triangular, 0, k,
-                        b_buffer, b_offset, b_ld,
-                        static_cast<T>(0.0),
-                        b_buffer, b_offset, b_ld);
-      }
-
-      // ... with "B := alpha*B*A". Note that A and B are now reversed.
-      else {
-        status = DoGemm(layout, Transpose::kNo, a_transpose,
-                        m, n, k,
-                        alpha,
-                        b_buffer, b_offset, b_ld,
-                        temp_triangular, 0, k,
-                        static_cast<T>(0.0),
-                        b_buffer, b_offset, b_ld);
-
-        // A and B are now reversed, so also reverse the error codes returned from the Xgemm routine
-        switch(status) {
-          case StatusCode::kInvalidMatrixA:      status = StatusCode::kInvalidMatrixB; break;
-          case StatusCode::kInvalidMatrixB:      status = StatusCode::kInvalidMatrixA; break;
-          case StatusCode::kInvalidLeadDimA:     status = StatusCode::kInvalidLeadDimB; break;
-          case StatusCode::kInvalidLeadDimB:     status = StatusCode::kInvalidLeadDimA; break;
-          case StatusCode::kInsufficientMemoryA: status = StatusCode::kInsufficientMemoryB; break;
-          case StatusCode::kInsufficientMemoryB: status = StatusCode::kInsufficientMemoryA; break;
-        }
-      }
-
-      // Return the status of the Xgemm routine
-      return status;
-    } catch (...) { return StatusCode::kInvalidKernel; }
-  } catch (...) { return StatusCode::kTempBufferAllocFailure; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Xtrmm<float>;
-template class Xtrmm<double>;
-template class Xtrmm<float2>;
-template class Xtrmm<double2>;
-
-// =================================================================================================
-} // namespace clblast
-- 
cgit v1.2.3