diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.cpp | 45 | ||||
-rw-r--r-- | src/database/database.cpp | 4 | ||||
-rw-r--r-- | src/database/kernels/invert.hpp | 78 | ||||
-rw-r--r-- | src/database/kernels/xtrsv.hpp | 78 | ||||
-rw-r--r-- | src/kernels/common.opencl | 35 | ||||
-rw-r--r-- | src/kernels/level2/xtrsv.opencl | 144 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks.opencl | 427 | ||||
-rw-r--r-- | src/kernels/level3/level3.opencl | 16 | ||||
-rw-r--r-- | src/routines/common.hpp | 40 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cpp | 3 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.cpp | 161 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.hpp | 60 | ||||
-rw-r--r-- | src/routines/level3/xtrsm.cpp | 202 | ||||
-rw-r--r-- | src/routines/level3/xtrsm.hpp | 52 | ||||
-rw-r--r-- | src/routines/levelx/xinvert.cpp | 151 | ||||
-rw-r--r-- | src/routines/levelx/xinvert.hpp | 40 | ||||
-rw-r--r-- | src/utilities/utilities.cpp | 72 | ||||
-rw-r--r-- | src/utilities/utilities.hpp | 8 |
18 files changed, 1592 insertions, 24 deletions
diff --git a/src/clblast.cpp b/src/clblast.cpp index 35f3f552..20ce1ba4 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -45,6 +45,7 @@ #include "routines/level2/xtrmv.hpp" #include "routines/level2/xtbmv.hpp" #include "routines/level2/xtpmv.hpp" +#include "routines/level2/xtrsv.hpp" #include "routines/level2/xger.hpp" #include "routines/level2/xgeru.hpp" #include "routines/level2/xgerc.hpp" @@ -66,6 +67,7 @@ #include "routines/level3/xsyr2k.hpp" #include "routines/level3/xher2k.hpp" #include "routines/level3/xtrmm.hpp" +#include "routines/level3/xtrsm.hpp" // Level-x includes (non-BLAS) #include "routines/levelx/xomatcopy.hpp" @@ -1145,12 +1147,20 @@ template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Tr // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template <typename T> -StatusCode Trsv(const Layout, const Triangle, const Transpose, const Diagonal, - const size_t, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsv<T>(queue_cpp, event); + routine.DoTrsv(layout, triangle, a_transpose, diagonal, + n, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsv<float>(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, @@ -2067,13 +2077,22 @@ template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triang // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM template <typename T> -StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, - const size_t, const size_t, - const T, - const cl_mem, const size_t, const size_t, - cl_mem, const size_t, const size_t, - cl_command_queue*, cl_event*) { - return StatusCode::kNotImplemented; +StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = Xtrsm<T>(queue_cpp, event); + routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); + return StatusCode::kSuccess; + } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, diff --git a/src/database/database.cpp b/src/database/database.cpp index c1cb9d56..aff6490d 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -20,12 +20,14 @@ #include "database/kernels/xgemv_fast.hpp" #include "database/kernels/xgemv_fast_rot.hpp" #include "database/kernels/xger.hpp" +#include "database/kernels/xtrsv.hpp" #include "database/kernels/xgemm.hpp" #include "database/kernels/xgemm_direct.hpp" #include "database/kernels/copy.hpp" #include "database/kernels/pad.hpp" #include "database/kernels/transpose.hpp" #include "database/kernels/padtranspose.hpp" +#include "database/kernels/invert.hpp" #include "database/kernel_selection.hpp" namespace clblast { @@ -39,12 +41,14 @@ const std::vector<const Database::DatabaseEntry*> Database::database = { &database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble, &database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble, &database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble, + &database::XtrsvHalf, &database::XtrsvSingle, &database::XtrsvDouble, &database::XtrsvComplexSingle, &database::XtrsvComplexDouble, &database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble, &database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble, &database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble, &database::PadHalf, 
&database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble, &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble, &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble, + &database::InvertHalf, &database::InvertSingle, &database::InvertDouble, &database::InvertComplexSingle, &database::InvertComplexDouble, &database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble }; diff --git a/src/database/kernels/invert.hpp b/src/database/kernels/invert.hpp new file mode 100644 index 00000000..2717f182 --- /dev/null +++ b/src/database/kernels/invert.hpp @@ -0,0 +1,78 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. 
+// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// Tuning parameters for the diagonal matrix inversion kernels +// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry InvertHalf = { + "Invert", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertSingle = { + "Invert", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertComplexSingle = { + "Invert", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertDouble = { + "Invert", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry InvertComplexDouble = { + "Invert", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"INTERNAL_BLOCK_SIZE",16} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git 
a/src/database/kernels/xtrsv.hpp b/src/database/kernels/xtrsv.hpp new file mode 100644 index 00000000..0741569e --- /dev/null +++ b/src/database/kernels/xtrsv.hpp @@ -0,0 +1,78 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file populates the database with best-found tuning parameters for the 'Xtrsv' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { +// ================================================================================================= + +const Database::DatabaseEntry XtrsvHalf = { + "Xtrsv", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvSingle = { + "Xtrsv", Precision::kSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvComplexSingle = { + "Xtrsv", Precision::kComplexSingle, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvDouble = { + "Xtrsv", Precision::kDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } 
}, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry XtrsvComplexDouble = { + "Xtrsv", Precision::kComplexDouble, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"TRSV_BLOCK_SIZE",32} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace database +} // namespace clblast diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index c7743f90..0ce4f367 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -160,6 +160,13 @@ R"( #define AbsoluteValue(value) value = fabs(value) #endif +// Negation (component-wise) +#if PRECISION == 3232 || PRECISION == 6464 + #define Negate(value) value.x = -(value.x); value.y = -(value.y) +#else + #define Negate(value) value = -(value) +#endif + // Adds two complex variables #if PRECISION == 3232 || PRECISION == 6464 #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y @@ -167,6 +174,13 @@ R"( #define Add(c, a, b) c = a + b #endif +// Subtracts two complex variables +#if PRECISION == 3232 || PRECISION == 6464 + #define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y +#else + #define Subtract(c, a, b) c = a - b +#endif + // Multiply two complex variables (used in the defines below) #if PRECISION == 3232 || PRECISION == 6464 #define MulReal(a, b) a.x*b.x - a.y*b.y @@ -191,6 +205,27 @@ R"( #endif #endif +// The scalar multiply-subtract function +#if PRECISION == 3232 || PRECISION == 6464 + #define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b) +#else + #define MultiplySubtract(c, a, b) c -= a * b +#endif + +// The scalar division function: real-value only +#if PRECISION == 3232 || PRECISION == 6464 + #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.y +#else + #define DivideReal(c, a, b) c = a / b +#endif + +// The scalar division function: full division +#if PRECISION == 3232 
|| PRECISION == 6464 + #define DivideFull(c, a, b) singlereal num_x = (a.x * b.x) + (a.y * b.y); singlereal num_y = (a.y * b.x) - (a.x * b.y); singlereal denom = (b.x * b.x) + (b.y * b.y); c.x = num_x / denom; c.y = num_y / denom +#else + #define DivideFull(c, a, b) c = a / b +#endif + + // The scalar AXPBY function #if PRECISION == 3232 || PRECISION == 6464 #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d) diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl new file mode 100644 index 00000000..ebea77a3 --- /dev/null +++ b/src/kernels/level2/xtrsv.opencl @@ -0,0 +1,144 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains kernels to perform forward or backward substitution, as used in the TRSV routine +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing.
+R"( + +// ================================================================================================= +#if defined(ROUTINE_TRSV) + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void FillVector(const int n, const int inc, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int tid = get_global_id(0); + if (tid < n) { + dest[tid*inc + offset] = value; + } +} + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. + +#ifndef TRSV_BLOCK_SIZE + #define TRSV_BLOCK_SIZE 32 // The block size for forward or backward substitution +#endif + +// ================================================================================================= + +__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +void trsv_forward(int n, + const __global real *A, const int a_offset, int a_ld, + __global real *b, const int b_offset, int b_inc, + __global real *x, const int x_offset, int x_inc, + const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; + const int tid = get_local_id(0); + + // Pre-loads the data into local memory + if (tid < n) { + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } + if (do_conjugate) { + for (int i = 0; i < n; ++i) { + COMPLEX_CONJUGATE(alm[i][tid]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = 0; i < n; ++i) { + for (int j =
0; j < i; ++j) { + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); + } + if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results + if (tid < n) { + x[tid*x_inc + x_offset] = xlm[tid]; + } +} + +__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) +void trsv_backward(int n, + const __global real *A, const int a_offset, int a_ld, + __global real *b, const int b_offset, int b_inc, + __global real *x, const int x_offset, int x_inc, + const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; + const int tid = get_local_id(0); + + // Pre-loads the data into local memory + if (tid < n) { + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } + if (do_conjugate) { + for (int i = 0; i < n; ++i) { + COMPLEX_CONJUGATE(alm[i][tid]); + } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = n - 1; i >= 0; --i) { + for (int j = i + 1; j < n; ++j) { + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); + } + if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); } + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results + if (tid < n) { + x[tid*x_inc + x_offset] = xlm[tid]; + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl new file mode 100644 index 
00000000..e94b4d30 --- /dev/null +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -0,0 +1,427 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains kernels to invert square diagonal blocks of a matrix. These kernels are based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= +// +// Let A be a block_size*block_size lower triangular matrix, and B its inverse. +// Then the block decomposition +// +// [ A11 0 ] * [ B11 0 ] = [ I 0 ] +// [ A21 A22 ] [ B21 B22 ] [ 0 I ] +// +// yields +// +// A11*B11 = I ==> B11 = A11^{-1}, +// A22*B22 = I ==> B22 = A22^{-1}, +// A21*B11 + A22*B21 = 0 ==> B21 = -A22^{-1}*A21*B11 = -B22*A21*B11. +// +// The InvertDiagonalBlock kernel inverts A11 and A22. +// The TripleMatMul routines multiply: +// part 1: B21 = A21 * B11, +// part 2: B21 = -B22 * B21. +// +// At this level, inner block is current_size=16, with one 4 x 4 work-group per inner block. Each +// submatrix Aij and Bij is current_size x current_size. The submatrix dimension is multiplied by 2 +// at each level, so the next level is current_size*2 = 32. A 'page' is the next bigger block, +// here current_size*2=32, +// [ B11 0 ] +// which contains [ B21 B22 ]. +// Outer blocks are block_size x block_size. +// +// A21 may have < current_size rows, but is guaranteed to have current_size cols since A22 is on
This makes a single check easy to do. +// +// B is stored in workspace that is a full multiple of block_size x block_size; no checks needed. +// +// We split this into part1 & part2 to synchronize all blocks and make sure +// that writes to B12 are observed by all blocks. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_INVERT) + +#define LOCALX 17 // 16 + 1 to avoid bank conflicts +#define LOCALY 16 + +// ================================================================================================= + +// Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix +__kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) +void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld, + __global real* restrict dest, const int outer_block_size, + const int unit_diagonal, const int is_upper) +{ + const int thread_index = get_local_id(0); + const int block_index = get_group_id(0); + + // Sets the offset for this particular block in the source and destination matrices + const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; + const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; + const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block, + (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner 
INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + + // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE + __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; + + // Loads the source lower triangle into local memory. Any values in the upper triangle or + // outside of the matrix are set to zero + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + const bool condition = (is_upper) ? (thread_index <= j && block_index*INTERNAL_BLOCK_SIZE + j < n) : + (thread_index >= j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + if (condition) { + lm[thread_index][j] = src[j*src_ld + thread_index + src_block_offset]; + } + else { + SetToZero(lm[thread_index][j]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Inverts the diagonal + real inverted_diagonal; + SetToOne(inverted_diagonal); + if (unit_diagonal == 0) { + const real diagonal_value = lm[thread_index][thread_index]; + if (!IsZero(diagonal_value)) { // Only for non-singular values and values inside the matrix + DivideReal(inverted_diagonal, inverted_diagonal, diagonal_value); + } + } + lm[thread_index][thread_index] = inverted_diagonal; + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper-triangular + if (is_upper) { + + // Computes the elements 0:j-1 of the j-th column + for (int j = 1; j < INTERNAL_BLOCK_SIZE; ++j) { + if (thread_index < j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = 0; k < j; ++k) { + MultiplyAdd(sum, lm[thread_index][k], lm[k][j]); + } + real diagonal_value = lm[j][j]; + Negate(diagonal_value); + Multiply(lm[thread_index][j], diagonal_value, sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Lower triangular + else { + + // Computes the elements j+1:INTERNAL_BLOCK_SIZE-1 of the j-th column + for (int j = INTERNAL_BLOCK_SIZE - 2; j >= 0; --j) { + if (thread_index > j) { + real sum; + SetToZero(sum); + #pragma unroll + for (int k = j + 1; k < INTERNAL_BLOCK_SIZE; ++k) { + MultiplyAdd(sum, lm[thread_index][k], 
lm[k][j]); + } + Multiply(lm[thread_index][j], -lm[j][j], sum); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + // Writes the result to global memory + #pragma unroll + for (int j = 0; j < INTERNAL_BLOCK_SIZE; ++j) { + dest[j*outer_block_size + thread_index + dest_block_offset] = lm[thread_index][j]; + } +} + +// ================================================================================================= + +// Triple matrix-multiplication kernel: C = A * B +inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, + __global const real* agm, __global const real* bgm, __global real* cgm, + const int lda, const int ldb, const int ldc, + int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int by = get_group_id(1) / num_pages; + const int page = get_group_id(1) % num_pages; + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); + const int iby = by*16; + const int id = lidx + lidy*get_local_size(0); + const int row = page*current_size*2 + current_size + ibx + id; + int col = page*current_size*2 + current_size; + + // Sets the offsets for this specific thread + agm += ibx + id; + bgm += lidx + (iby + lidy)*ldb; + cgm += ibx + id + iby*ldc; + + // Initializes the result registers + real cpm[16]; + #pragma unroll + for (int j = 0; j < 16; ++j) { + SetToZero(cpm[j]); + } + + // Computes NT x 16 block of C, each thread computes one 1 x 16 row + for (int k = 0; k < current_size; k += 16) { + + // Loads a 16 x 16 block of B into local memory using NX x 4 threads + #pragma unroll + for( int i=0; i < 16; i += (size/4) ) { // += get_local_size(0) + #pragma unroll + for( int j=0; j < 16; j += 4 ) { // += get_local_size(1) + blm[(lidx + i) * LOCALX + (lidy + j)] = bgm[k + i + j*ldb]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Upper triangular + if (upper) { + + // Performs 16 x 
16 multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2 || col++ < n) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + // Lower triangular + else { + if (row < n) { + + // Performs 16 x 16 multiply-add operations + #pragma unroll + for (int i = 0; i < 16; ++i) { + #pragma unroll + for (int j = 0; j < 16; ++j) { + MultiplyAdd(cpm[j], agm[(i + k) * lda], blm[i * LOCALX + j]); + } + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Stores NT x 16 results: each thread writes one 16 x 1 row + #pragma unroll + for (int i = 0; i < 16; ++i) { + if (part == 2) { Negate(cpm[i]); } + cgm[0] = cpm[i]; + cgm += ldc; + } +} + +// ================================================================================================= + +// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) +inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, + __global const real* src, const int a_offset, const int lda, + __global real* dest, int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int page = get_group_id(1) % num_pages; + + // Computes the destination block offset: + // - go to the (page / pages_per_block) outer block_size * block_size block + // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that + const int pages_per_block = block_size / (current_size*2); + dest += (page / pages_per_block) * block_size * block_size + + (page % pages_per_block) * (current_size*2*block_size + current_size*2); + + // Using the GEMM notation: C = A*B + __global const real* agm; + __global const real* bgm; + __global real* cgm; + if (upper) { // upper triangular: B12 = A12 * B22 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size*lda; // A12 + bgm = dest + 
current_size*block_size + current_size; // B22 + cgm = dest + current_size*block_size; // B12 + } + else { // lower triangular: B21 = A21 * B11 + agm = src + a_offset + page*current_size*2*lda + page*current_size*2 + current_size; // A21 + bgm = dest; // B11 + cgm = dest + current_size; // B21 + } + + // Runs the generic C = A * B matrix multiplication + const int ldb = block_size; + const int ldc = block_size; + TripleMatMul(size, upper, 1, blm, n, agm, bgm, cgm, lda, ldb, ldc, current_size, num_pages, block_size); +} + +// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) +inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, + __global real* dest, int current_size, int num_pages, const int block_size) { + + // Emulates a 3D grid: NX * (NY * num_pages) + const int page = get_group_id(1) % num_pages; + + // Computes the destination block offset: + // - go to the (page / pages_per_block) outer block_size * block_size block + // - then the (page % pages_per_block) inner (current_size*2) * (current_size*2) page inside that + const int pages_per_block = block_size / (current_size*2); + dest += (page / pages_per_block) * block_size * block_size + + (page % pages_per_block) * (current_size*2*block_size + current_size*2); + + // Using the GEMM notation: C = A*B + __global const real* agm; + __global const real* bgm; + __global real* cgm; + if (upper) { // upper triangular: B12 = -B11 * B12 + agm = dest; // B11 + cgm = dest + current_size*block_size; // B12 + bgm = cgm; // B12, okay to overwrite + } + + else { // lower triangular: B21 = -B22 * B21 + agm = dest + current_size*block_size + current_size; // B22 + cgm = dest + current_size; // B21 + bgm = cgm; // B21, okay to overwrite + } + + // Runs the generic C = A * B matrix multiplication + const int lda = block_size; + const int ldb = block_size; + const int ldc = block_size; + TripleMatMul(size, upper, 2, blm, n, agm, bgm, cgm,
lda, ldb, ldc, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, false, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, false, lm, n, dest, current_size, num_pages, block_size); +} + +// B21 = A21 * B11 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, false, lm, 
n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B21 = -B22 * B21 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, false, lm, n, dest, current_size, num_pages, block_size); +} + +// ================================================================================================= + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(16, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(16, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = A12 * B22 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(32, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(32, true, lm, n, dest, current_size, num_pages, block_size); +} + +// B12 = 
A12 * B22 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, + __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart1(64, true, lm, n, src, a_offset, lda, dest, current_size, num_pages, block_size); +} + +// B12 = -B11 * B12 +__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) +{ + __local real lm[LOCALY * LOCALX]; + TripleMatMulPart2(64, true, lm, n, dest, current_size, num_pages, block_size); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl index bf14ab12..0f5a8607 100644 --- a/src/kernels/level3/level3.opencl +++ b/src/kernels/level3/level3.opencl @@ -74,6 +74,22 @@ R"( #endif // ================================================================================================= +#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM) + +__kernel __attribute__((reqd_work_group_size(8, 8, 1))) +void FillMatrix(const int n, const int ld, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int id_one = get_global_id(0); + const int id_two = get_global_id(1); + if (id_one < ld && id_two < n) { + dest[id_two*ld + id_one + offset] = value; + } +} + +#endif + +// ================================================================================================= // End of the C++11 raw string literal )" diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 
7c211c0d..bdea0086 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -33,6 +33,46 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // ================================================================================================= +// Sets all elements of a matrix to a constant value +template <typename T> +void FillMatrix(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t n, const size_t ld, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillMatrix"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, static_cast<int>(ld)); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector<size_t>{8, 8}; + auto global = std::vector<size_t>{Ceil(ld, 8), Ceil(n, 8)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// Sets all elements of a vector to a constant value +template <typename T> +void FillVector(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t n, const size_t inc, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillVector"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, static_cast<int>(inc)); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector<size_t>{64}; + auto global = std::vector<size_t>{Ceil(n, 64)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// ================================================================================================= + // Copies or transposes 
a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. template <typename T> diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 9e9c2db4..7d2e5f60 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,9 +22,10 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, { + Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" + #include "../../kernels/level2/xtrsv.opencl" }) { } diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp new file mode 100644 index 00000000..d5d009ff --- /dev/null +++ b/src/routines/level2/xtrsv.cpp @@ -0,0 +1,161 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrsv class (see the header for information about the class). 
+// +// ================================================================================================= + +#include "routines/level2/xtrsv.hpp" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +template <typename T> +void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }; + + // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel + const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1; + const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0; + + // The data is either in the upper or lower triangle + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + + // Retrieves the kernel from the compiled binary + const auto kernel_name = (is_upper) ? 
"trsv_backward" : "trsv_forward"; + auto kernel = Kernel(program_, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, a_buffer()); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, static_cast<int>(a_ld)); + kernel.SetArgument(4, b_buffer()); + kernel.SetArgument(5, static_cast<int>(b_offset)); + kernel.SetArgument(6, static_cast<int>(b_inc)); + kernel.SetArgument(7, x_buffer()); + kernel.SetArgument(8, static_cast<int>(x_offset)); + kernel.SetArgument(9, static_cast<int>(x_inc)); + kernel.SetArgument(10, static_cast<int>(is_transposed)); + kernel.SetArgument(11, static_cast<int>(is_unit_diagonal)); + kernel.SetArgument(12, static_cast<int>(do_conjugate)); + + // Launches the kernel + const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]}; + const auto global = std::vector<size_t>{1}; + auto event = Event(); + RunKernel(kernel, queue_, device_, global, local, event.pointer()); + event.WaitForCompletion(); +} + +// ================================================================================================= + +// The main routine +template <typename T> +void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the matrix and vector + TestMatrixA(n, n, a_buffer, a_offset, a_ld); + TestVectorX(n, b_buffer, b_offset, b_inc); + + // Creates a copy of B to avoid overwriting input while computing output + // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels + const auto x_offset = b_offset; + const auto x_inc = b_inc; + const auto x_size = n*x_inc + x_offset; + auto 
x_buffer = Buffer<T>(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector<Event>(); + auto fill_vector_event = Event(); + FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList, + n, x_inc, x_offset, x_buffer, ConstantZero<T>()); + fill_vector_event.WaitForCompletion(); + + // Derives properties based on the arguments + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) || + (layout != Layout::kColMajor && a_transpose != Transpose::kNo)); + + // Loops over the blocks + auto col = n; // the initial column position + for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) { + const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i); + + // Sets the next column position + col = (is_upper) ? col - block_size : i; + + // Sets the offsets for upper or lower triangular + const auto extra_offset_a = (is_transposed) ? + (is_upper ? col + (col+block_size)*a_ld : col) : + (is_upper ? col+block_size + col*a_ld : col*a_ld); + const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i; + const auto gemv_n = (a_transpose == Transpose::kNo) ? 
i : block_size; + DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + + // Runs the triangular substitution for the block size + Substitution(layout, triangle, a_transpose, diagonal, block_size, + a_buffer, a_offset + col + col*a_ld, a_ld, + b_buffer, b_offset + col*b_inc, b_inc, + x_buffer, x_offset + col*x_inc, x_inc); + } + + // Retrieves the results + x_buffer.CopyTo(queue_, x_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsv<half>; +template class Xtrsv<float>; +template class Xtrsv<double>; +template class Xtrsv<float2>; +template class Xtrsv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp new file mode 100644 index 00000000..67e626a1 --- /dev/null +++ b/src/routines/level2/xtrsv.hpp @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular +// forward and backward substitutions on the diagonal parts of the matrix in combination with larger +// GEMV computation on the remainder of the matrix. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSV_H_ +#define CLBLAST_ROUTINES_XTRSV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xtrsv: public Xgemv<T> { + public: + + // Uses the generic matrix-vector routine + using Xgemv<T>::queue_; + using Xgemv<T>::context_; + using Xgemv<T>::device_; + using Xgemv<T>::db_; + using Xgemv<T>::program_; + using Xgemv<T>::DoGemv; + + // Constructor + Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV"); + + // Templated-precision implementation of the routine + void DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + + // Performs forward or backward substitution on a small triangular matrix + void Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSV_H_ +#endif diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp new file mode 100644 index 00000000..3a910261 --- /dev/null +++ b/src/routines/level3/xtrsm.cpp @@ -0,0 +1,202 @@ + +// 
================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= + +#include "routines/level3/xtrsm.hpp" +#include "routines/levelx/xinvert.hpp" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name): + Xgemm<T>(queue, event, name) { +} + + +// ================================================================================================= + +// The main routine +template <typename T> +void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Settings + constexpr auto block_size = size_t{32}; // tuneable + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } + + // Computes the k dimension. 
This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + const auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the triangular A matrix + TestMatrixA(k, k, a_buffer, a_offset, a_ld); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Checks for validity of the input B matrix + const auto b_one = (layout == Layout::kRowMajor) ? n : m; + const auto b_two = (layout == Layout::kRowMajor) ? m : n; + TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + + // Creates a copy of B to avoid overwriting input in GEMM while computing output + const auto b_size = b_ld * (b_two - 1) + b_one + b_offset; + const auto x_one = b_one; + const auto x_size = b_size; + const auto x_ld = b_ld; + const auto x_offset = b_offset; + auto x_buffer = Buffer<T>(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Temporary buffer for the inverse of the A matrix + const auto a_inv_size = Ceil(k, block_size) * block_size; + auto a_inv_buffer = Buffer<T>(context_, a_inv_size); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector<Event>(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, + x_one, x_ld, x_offset, x_buffer, ConstantZero<T>()); + fill_matrix_event.WaitForCompletion(); + + // Inverts the diagonal blocks + auto diagonal_invert_event = Event(); + auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer()); + inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal, + k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer); + diagonal_invert_event.WaitForCompletion(); + + // Lower or 
upper triangular + const bool condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose == Transpose::kNo)); + + // Left side + if (side == Side::kLeft) { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + for (auto i = size_t{0}; i < m; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); + const auto current_block_size = std::min(m - i, block_size); + DoGemm(layout, a_transpose, Transpose::kNo, + current_block_size, n, current_block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero<T>(), + x_buffer, i, x_ld); + if (i + block_size >= m) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld; + DoGemm(layout, a_transpose, Transpose::kNo, + m - i - block_size, n, block_size, ConstantNegOne<T>(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne<T>(), + b_buffer, i + block_size, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size); + const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); + DoGemm(layout, a_transpose, Transpose::kNo, + block_size, n, block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero<T>(), + x_buffer, i, x_ld); + if (i - static_cast<int>(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? 
i * a_ld : i; + DoGemm(layout, a_transpose, Transpose::kNo, + i, n, block_size, ConstantNegOne<T>(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne<T>(), + b_buffer, 0, b_ld); + } + } + } + + // Right side + else { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size); + const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); + DoGemm(layout, Transpose::kNo, a_transpose, + m, block_size, block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), + x_buffer, i * x_ld, x_ld); + if (i - static_cast<int>(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld; + DoGemm(layout, Transpose::kNo, a_transpose, + m, i, block_size, ConstantNegOne<T>(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne<T>(), + b_buffer, 0, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + for (auto i = size_t{0}; i < n; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); + const auto current_block_size = std::min(n - i, block_size); + DoGemm(layout, Transpose::kNo, a_transpose, + m, current_block_size, current_block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), + x_buffer, i * x_ld, x_ld); + if (i + block_size >= n) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? 
i + (block_size + i) * a_ld : (i + block_size) + i * a_ld; + DoGemm(layout, Transpose::kNo, a_transpose, + m, n - i - block_size, block_size, ConstantNegOne<T>(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne<T>(), + b_buffer, (i + block_size) * b_ld, b_ld); + } + } + } + + // Retrieves the results + x_buffer.CopyTo(queue_, b_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsm<half>; +template class Xtrsm<float>; +template class Xtrsm<double>; +template class Xtrsm<float2>; +template class Xtrsm<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp new file mode 100644 index 00000000..b9d5432a --- /dev/null +++ b/src/routines/level3/xtrsm.hpp @@ -0,0 +1,52 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrsm routine. The implementation is based on ??? (TODO). +// Therefore, this class inherits from the Xgemm class. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSM_H_ +#define CLBLAST_ROUTINES_XTRSM_H_ + +#include "routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xtrsm: public Xgemm<T> { + public: + + // Uses methods and variables of the Xgemm routine + using Xgemm<T>::queue_; + using Xgemm<T>::context_; + using Xgemm<T>::device_; + using Xgemm<T>::db_; + using Xgemm<T>::program_; + using Xgemm<T>::DoGemm; + + // Constructor + Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM"); + + // Templated-precision implementation of the routine + void DoTrsm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSM_H_ +#endif diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..696e694a --- /dev/null +++ b/src/routines/levelx/xinvert.cpp @@ -0,0 +1,151 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line.
+// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the common code to perform (partial) matrix inverting. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= + +#include "routines/levelx/xinvert.hpp" + +#include <string> +#include <vector> +#include <assert.h> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, { + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/invert_diagonal_blocks.opencl" + }) { +} + +// ================================================================================================= + +// Inverts diagonal square blocks of a matrix +template <typename T> +void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer<T> &src, const size_t offset, const size_t ld_src, + Buffer<T> &dest) { + + // Makes sure all dimensions are larger than zero and the block size is smaller than n + if ((block_size == 0) || (n == 0) || (block_size > n)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Helper variables + const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]); + assert(internal_block_size == 16); + const auto num_blocks = CeilDiv(n, block_size); + const auto num_internal_blocks = CeilDiv(n, internal_block_size); + const auto unit_diagonal = (diag == 
Diagonal::kUnit) ? true : false; + + // This routine only supports block sizes which are a multiple of the internal block size and + // block sizes up to and including 128 + if ((block_size % internal_block_size != 0) || (block_size > 128)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Checks for validity of the source and destination matrices + TestMatrixA(n, n, src, offset, ld_src); + TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto name_postfix = (is_upper) ? "Upper" : "Lower"; + + // Fills the output buffer with zeros + auto event_wait_list = std::vector<Event>(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list, + num_blocks * block_size, block_size, 0, dest, ConstantZero<T>()); + event_wait_list.push_back(fill_matrix_event); + + // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group + auto kernel = Kernel(program_, "InvertDiagonalBlock"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, static_cast<int>(ld_src)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, static_cast<int>(block_size)); + kernel.SetArgument(6, static_cast<int>(unit_diagonal)); + kernel.SetArgument(7, static_cast<int>(is_upper)); + const auto local = std::vector<size_t>{internal_block_size}; + const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size}; + auto base_kernel_event = Event(); + auto base_kernel_event_pointer = (internal_block_size == 
block_size) ? event_ : base_kernel_event.pointer(); + RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list); + if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); } + + // Builds up block_size x block_size blocks. For example, internal_block_size=16: + // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads; + // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; + // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; + for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) { + assert(current_size == 16 || current_size == 32 || current_size == 64); + + // Emulates a 3D grid: NX * (NY * npages) + const auto npages = CeilDiv(n, current_size*2); + const auto local0 = (current_size <= 32) ? current_size/4 : 16; + const auto local = std::vector<size_t>{local0, 4}; + const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]}; + + // Part 1 + auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, src()); + kernel1.SetArgument(2, static_cast<int>(offset)); + kernel1.SetArgument(3, static_cast<int>(ld_src)); + kernel1.SetArgument(4, dest()); + kernel1.SetArgument(5, static_cast<int>(current_size)); + kernel1.SetArgument(6, static_cast<int>(npages)); + kernel1.SetArgument(7, static_cast<int>(block_size)); + auto kernel1_event = Event(); + RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list); + event_wait_list.push_back(kernel1_event); + + // Part 2 + const bool is_last_kernel = (current_size * 2 >= block_size); + auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + kernel2.SetArgument(0, static_cast<int>(n)); + 
kernel2.SetArgument(1, dest()); + kernel2.SetArgument(2, static_cast<int>(current_size)); + kernel2.SetArgument(3, static_cast<int>(npages)); + kernel2.SetArgument(4, static_cast<int>(block_size)); + auto kernel2_event = Event(); + auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer(); + RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list); + if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); } + + // Exit in case we reach beyond the bounds of the input matrix + if (current_size*2 >= n) { break; } + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xinvert<half>; +template class Xinvert<float>; +template class Xinvert<double>; +template class Xinvert<float2>; +template class Xinvert<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp new file mode 100644 index 00000000..fa0a80e7 --- /dev/null +++ b/src/routines/levelx/xinvert.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the common code to perform (partial) matrix inverting. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XINVERT_H_ +#define CLBLAST_ROUTINES_XINVERT_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +template <typename T> +class Xinvert: public Routine { + public: + + // Constructor + Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT"); + + // Inverts diagonal square blocks of a matrix + void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer<T> &src, const size_t offset, const size_t ld_src, + Buffer<T> &dest); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XINVERT_H_ +#endif diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index 5e445bb9..fd198c0d 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -46,6 +46,30 @@ double2 GetScalar() { return {2.0, 0.5}; } +// Returns a scalar of value 0 +template <typename T> +T ConstantZero() { + return static_cast<T>(0.0); +} +template float ConstantZero<float>(); +template double ConstantZero<double>(); + +// Specialized version of the above for half-precision +template <> +half ConstantZero() { + return FloatToHalf(0.0f); +} + +// Specialized versions of the above for complex data-types +template <> +float2 ConstantZero() { + return {0.0f, 0.0f}; +} +template <> +double2 ConstantZero() { + return {0.0, 0.0}; +} + // Returns a scalar of value 1 template <typename T> T ConstantOne() { @@ -70,6 +94,30 @@ double2 ConstantOne() { return {1.0, 0.0}; } +// Returns a scalar of value -1 +template <typename T> +T ConstantNegOne() { + return static_cast<T>(-1.0); +} +template float ConstantNegOne<float>(); +template double 
ConstantNegOne<double>(); + +// Specialized version of the above for half-precision +template <> +half ConstantNegOne() { + return FloatToHalf(-1.0f); +} + +// Specialized versions of the above for complex data-types +template <> +float2 ConstantNegOne() { + return {-1.0f, 0.0f}; +} +template <> +double2 ConstantNegOne() { + return {-1.0, 0.0}; +} + // ================================================================================================= // Implements the string conversion using std::to_string if possible @@ -79,23 +127,27 @@ std::string ToString(T value) { } template std::string ToString<int>(int value); template std::string ToString<size_t>(size_t value); -template std::string ToString<float>(float value); -template std::string ToString<double>(double value); +template <> +std::string ToString(float value) { + std::ostringstream result; + result << std::fixed << std::setprecision(2) << value; + return result.str(); +} +template <> +std::string ToString(double value) { + std::ostringstream result; + result << std::fixed << std::setprecision(2) << value; + return result.str(); +} // If not possible directly: special cases for complex data-types template <> std::string ToString(float2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; + return ToString(value.real())+"+"+ToString(value.imag())+"i"; } template <> std::string ToString(double2 value) { - std::ostringstream real, imag; - real << std::setprecision(2) << value.real(); - imag << std::setprecision(2) << value.imag(); - return real.str()+"+"+imag.str()+"i"; + return ToString(value.real())+"+"+ToString(value.imag())+"i"; } // If not possible directly: special case for half-precision diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index a1d4e2db..757f1b5e 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -102,10 +102,18 @@ 
constexpr auto kArgNoAbbreviations = "no_abbrv"; template <typename T> T GetScalar(); +// Returns a scalar of value 0 +template <typename T> +T ConstantZero(); + // Returns a scalar of value 1 template <typename T> T ConstantOne(); +// Returns a scalar of value -1 +template <typename T> +T ConstantNegOne(); + // ================================================================================================= // Structure containing all possible arguments for test clients, including their default values |