7 files changed, 192 insertions, 28 deletions
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 9231d725..e94b4d30 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -61,19 +61,6 @@ R"(
 
 // =================================================================================================
 
-__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
-void FillMatrix(const int n, const int ld, const int offset,
-                __global real* restrict dest, const real_arg arg_value) {
-  const real value = GetRealArg(arg_value);
-  const int id_one = get_global_id(0);
-  const int id_two = get_global_id(1);
-  if (id_one < ld && id_two < n) {
-    dest[id_two*ld + id_one + offset] = value;
-  }
-}
-
-// =================================================================================================
-
 // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
 __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
 void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl
index bf14ab12..0f5a8607 100644
--- a/src/kernels/level3/level3.opencl
+++ b/src/kernels/level3/level3.opencl
@@ -74,6 +74,22 @@ R"(
 #endif
 
 // =================================================================================================
+#if defined(ROUTINE_INVERT) || defined(ROUTINE_TRSM)
+
+__kernel __attribute__((reqd_work_group_size(8, 8, 1)))
+void FillMatrix(const int n, const int ld, const int offset,
+                __global real* restrict dest, const real_arg arg_value) {
+  const real value = GetRealArg(arg_value);
+  const int id_one = get_global_id(0);
+  const int id_two = get_global_id(1);
+  if (id_one < ld && id_two < n) {
+    dest[id_two*ld + id_one + offset] = value;
+  }
+}
+
+#endif
+
+// =================================================================================================
 
 // End of the C++11 raw string literal
 )"
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 0ac1a58e..8061b508 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -7,11 +7,15 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xtrsm class (see the header for information about the class).
+// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based
+// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
+// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
+// and Jack Dongarra.
 //
 // =================================================================================================
 
 #include "routines/level3/xtrsm.hpp"
+#include "routines/levelx/xinvert.hpp"
 
 #include <string>
 #include <vector>
@@ -25,6 +29,7 @@ Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
     Xgemm<T>(queue, event, name) {
 }
 
+
 // =================================================================================================
 
 // The main routine
@@ -36,27 +41,153 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
                       const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                       const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
 
+  // Settings
+  constexpr auto block_size = size_t{32}; // tuneable
+
   // Makes sure all dimensions are larger than zero
   if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
 
   // Computes the k dimension. This is based on whether or not matrix is A (on the left)
   // or B (on the right) in the Xgemm routine.
-  auto k = (side == Side::kLeft) ? m : n;
+  const auto k = (side == Side::kLeft) ? m : n;
 
   // Checks for validity of the triangular A matrix
   TestMatrixA(k, k, a_buffer, a_offset, a_ld);
 
-  // Checks for validity of the input/output B matrix
+  // Determines which kernels to run based on the layout (the kernels assume column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
+  const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+  // Checks for validity of the input B matrix
   const auto b_one = (layout == Layout::kRowMajor) ? n : m;
   const auto b_two = (layout == Layout::kRowMajor) ? m : n;
   TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
 
   // Creates a copy of B to avoid overwriting input in GEMM while computing output
-  const auto b_size = (b_ld * (b_two - 1) + b_one + b_offset);
-  auto b_buffer_copy = Buffer<T>(context_, b_size);
-  b_buffer.CopyTo(queue_, b_size, b_buffer_copy);
-
-  // TODO: Implement TRSM computation
+  const auto b_size = b_ld * (b_two - 1) + b_one + b_offset;
+  const auto x_one = b_one;
+  const auto x_size = b_size;
+  const auto x_ld = b_ld;
+  const auto x_offset = b_offset;
+  auto x_buffer = Buffer<T>(context_, x_size);
+  b_buffer.CopyTo(queue_, x_size, x_buffer);
+
+  // Temporary buffer for the inverse of the A matrix
+  const auto a_inv_size = Ceil(k, block_size) * block_size;
+  auto a_inv_buffer = Buffer<T>(context_, a_inv_size);
+
+  // Fills the output buffer with zeros
+  auto eventWaitList = std::vector<Event>();
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSM");
+  auto fill_matrix_event = Event();
+  FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), eventWaitList,
+             x_one, x_ld, x_offset, x_buffer, ConstantZero<T>());
+  fill_matrix_event.WaitForCompletion();
+
+  // Inverts the diagonal blocks
+  auto diagonal_invert_event = Event();
+  auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer());
+  inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal,
+                                      k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
+  diagonal_invert_event.WaitForCompletion();
+
+  // Lower of upper triangular
+  const bool condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
+                          (triangle == Triangle::kLower && a_transpose == Transpose::kNo));
+
+  // Left side
+  if (side == Side::kLeft) {
+
+    // True when (lower triangular) or (upper triangular and transposed)
+    if (condition) {
+      for (auto i = size_t{0}; i < m; i += block_size) {
+        const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
+        const auto current_block_size = std::min(m - i, block_size);
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               current_block_size, n, current_block_size, gemm_alpha,
+               a_inv_buffer, i * block_size, block_size,
+               b_buffer, i, b_ld, ConstantZero<T>(),
+               x_buffer, i, x_ld);
+        if (i + block_size >= m) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               m - i - block_size, n, block_size, ConstantNegOne<T>(),
+               a_buffer, this_a_offset, a_ld,
+               x_buffer, i, x_ld, ConstantOne<T>(),
+               b_buffer, i + block_size, b_ld);
+      }
+    }
+
+    // True when (upper triangular) or (lower triangular and transposed)
+    else {
+      const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size);
+      const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size);
+      for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
+        const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               block_size, n, block_size, gemm_alpha,
+               a_inv_buffer, i * block_size, block_size,
+               b_buffer, i, b_ld, ConstantZero<T>(),
+               x_buffer, i, x_ld);
+        if (i - static_cast<int>(block_size) < 0) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
+        DoGemm(layout, a_transpose, Transpose::kNo,
+               i, n, block_size, ConstantNegOne<T>(),
+               a_buffer, this_a_offset, a_ld,
+               x_buffer, i, x_ld, ConstantOne<T>(),
+               b_buffer, 0, b_ld);
+      }
+    }
+  }
+
+  // Right side
+  else {
+
+    // True when (lower triangular) or (upper triangular and transposed)
+    if (condition) {
+      const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
+      const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
+      for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
+        const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, block_size, block_size, gemm_alpha,
+               b_buffer, i * b_ld, b_ld,
+               a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
+               x_buffer, i * x_ld, x_ld);
+        if (i - static_cast<int>(block_size) < 0) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, i, block_size, ConstantNegOne<T>(),
+               x_buffer, i * x_ld, x_ld,
+               a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
+               b_buffer, 0, b_ld);
+      }
+    }
+
+    // True when (upper triangular) or (lower triangular and transposed)
+    else {
+      for (auto i = size_t{0}; i < n; i += block_size) {
+        const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
+        const auto current_block_size = std::min(n - i, block_size);
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, current_block_size, current_block_size, gemm_alpha,
+               b_buffer, i * b_ld, b_ld,
+               a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
+               x_buffer, i * x_ld, x_ld);
+        if (i + block_size >= n) { break; }
+        const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
+        DoGemm(layout, Transpose::kNo, a_transpose,
+               m, n - i - block_size, block_size, ConstantNegOne<T>(),
+               x_buffer, i * x_ld, x_ld,
+               a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
+               b_buffer, (i + block_size) * b_ld, b_ld);
+      }
+    }
+  }
+
+  // Retrieves the results
+  x_buffer.CopyTo(queue_, b_size, b_buffer);
 }
 
 // =================================================================================================
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 5ffba958..ffee9b7c 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -27,6 +27,7 @@ namespace clblast {
 template <typename T>
 Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
+    #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/invert_diagonal_blocks.opencl"
     }) {
 }
@@ -91,8 +92,9 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
   const auto local = std::vector<size_t>{internal_block_size};
   const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
   auto base_kernel_event = Event();
-  RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list);
-  event_wait_list.push_back(base_kernel_event);
+  auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
+  RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
+  if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
 
   // Builds up block_size x block_size blocks. For example, internal_block_size=16:
   // use   16 x 16  blocks to build  32 x 32  blocks,  1 x (1 x npages) grid,  4 x 4 threads;
@@ -130,8 +132,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
     kernel2.SetArgument(3, static_cast<int>(npages));
     kernel2.SetArgument(4, static_cast<int>(block_size));
     auto kernel2_event = Event();
-    auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
-    RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list);
+    auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
+    RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
     if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
 
     // Exit in case we reach beyond the bounds of the input matrix
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index b8e011c5..663fdffa 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -94,6 +94,30 @@ double2 ConstantOne() {
   return {1.0, 0.0};
 }
 
+// Returns a scalar of value -1
+template <typename T>
+T ConstantNegOne() {
+  return static_cast<T>(-1.0);
+}
+template float ConstantNegOne<float>();
+template double ConstantNegOne<double>();
+
+// Specialized version of the above for half-precision
+template <>
+half ConstantNegOne() {
+  return FloatToHalf(-1.0f);
+}
+
+// Specialized versions of the above for complex data-types
+template <>
+float2 ConstantNegOne() {
+  return {-1.0f, 0.0f};
+}
+template <>
+double2 ConstantNegOne() {
+  return {-1.0, 0.0};
+}
+
 // =================================================================================================
 
 // Implements the string conversion using std::to_string if possible
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 26dee23b..3e408bb7 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -110,6 +110,10 @@ T ConstantZero();
 template <typename T>
 T ConstantOne();
 
+// Returns a scalar of value -1
+template <typename T>
+T ConstantNegOne();
+
 // =================================================================================================
 
 // Structure containing all possible arguments for test clients, including their default values
diff --git a/test/routines/level3/xtrsm.hpp b/test/routines/level3/xtrsm.hpp
index 2480a817..e59c96ff 100644
--- a/test/routines/level3/xtrsm.hpp
+++ b/test/routines/level3/xtrsm.hpp
@@ -48,12 +48,12 @@ class TestXtrsm {
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeA(const Arguments<T> &args) {
-    auto k = (args.side == Side::kLeft) ? args.m : args.n;
+    const auto k = (args.side == Side::kLeft) ? args.m : args.n;
     return k * args.a_ld + args.a_offset;
   }
   static size_t GetSizeB(const Arguments<T> &args) {
-    auto b_rotated = (args.layout == Layout::kRowMajor);
-    auto b_two = (b_rotated) ? args.m : args.n;
+    const auto b_rotated = (args.layout == Layout::kRowMajor);
+    const auto b_two = (b_rotated) ? args.m : args.n;
     return b_two * args.b_ld + args.b_offset;
   }