From 7c73ceb095f22099067b8a880961cdcbee4257ea Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 29 Jan 2017 17:02:00 +0100
Subject: Added first (incomplete) version of TRSV routine

---
 src/kernels/level2/xtrsv.opencl |  94 ++++++++++++++++++++++++++++
 src/routines/common.hpp         |  19 ++++++
 src/routines/level2/xgemv.cpp   |   1 +
 src/routines/level2/xtrsv.cpp   | 131 +++++++++++++++++++++++++++++++++++++---
 src/routines/level2/xtrsv.hpp   |  17 +++++-
 5 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 src/kernels/level2/xtrsv.opencl

diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
new file mode 100644
index 00000000..00f29e47
--- /dev/null
+++ b/src/kernels/level2/xtrsv.opencl
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to perform forward or backward substition, as used in the TRSV routine
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_TRSV)
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
+void FillVector(const int n, const int inc, const int offset,
+                __global real* restrict dest, const real_arg arg_value) {
+  const real value = GetRealArg(arg_value);
+  const int tid = get_global_id(0);
+  if (tid < n) {
+    dest[tid*inc + offset] = value;
+  }
+}
+
+// =================================================================================================
+
+// TODO: Put variable in database
+#define TRSV_BLOCK_SIZE 256
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_forward(int n,
+                  const __global float *A, const int a_offset, int lda,
+                  __global float *b, const int b_offset, int b_inc,
+                  __global float *x, const int x_offset, int x_inc,
+                  const int is_transposed, const int is_unit_diagonal) {
+  __local float sx[TRSV_BLOCK_SIZE];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (int i = 0; i < n; ++i) {
+    real sum = b[i*b_inc + b_offset];
+    for (int j = 0; j < i; ++j) {
+      real a_value;
+      if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
+      else { a_value = A[j + i*lda + a_offset]; }
+      sum -= a_value * sx[j];
+    }
+    sum -= x[i*x_inc + x_offset];
+    if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+    sx[i] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  for (int i = 0; i < n; ++i) {
+    x[i*x_inc + x_offset] = sx[i];
+  }
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_backward(int n,
+                   const __global float *A, const int a_offset, int lda,
+                   __global float *b, const int b_offset, int b_inc,
+                   __global float *x, const int x_offset, int x_inc,
+                   const int is_trans, const int is_unit_diagonal) {
+  __local float sx[TRSV_BLOCK_SIZE];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  for (int i = n - 1; i >= 0; --i) {
+    real sum = b[i*b_inc + b_offset];
+    for (int j = i + 1; j < n; ++j) {
+      real a_value;
+      if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
+      else { a_value = A[j + i*lda + a_offset]; }
+      sum -= a_value * sx[j];
+    }
+    sum -= x[i*x_inc + x_offset];
+    if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+    sx[i] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  for (int i = 0; i < n; ++i) {
+    x[i*x_inc + x_offset] = sx[i];
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 47f98de0..8046c0be 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -52,6 +52,25 @@ void FillMatrix(Queue &queue, const Device &device,
   RunKernel(kernel, queue, device, global, local, event, waitForEvents);
 }
 
+// Sets all elements of a vector to a constant value
+template <typename T>
+void FillVector(Queue &queue, const Device &device,
+                const Program &program, const Database &,
+                EventPointer event, const std::vector<Event> &waitForEvents,
+                const size_t n, const size_t inc, const size_t offset,
+                const Buffer<T> &dest,
+                const T constant_value) {
+  auto kernel = Kernel(program, "FillVector");
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, static_cast<int>(inc));
+  kernel.SetArgument(2, static_cast<int>(offset));
+  kernel.SetArgument(3, dest());
+  kernel.SetArgument(4, GetRealArg(constant_value));
+  auto local = std::vector<size_t>{64};
+  auto global = std::vector<size_t>{Ceil(n, 64)};
+  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+}
+
 // =================================================================================================
 
 // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 7b4c2e8f..26eed5e0 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -25,6 +25,7 @@ Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
     Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
     #include "../../kernels/level2/xgemv.opencl"
     #include "../../kernels/level2/xgemv_fast.opencl"
+    #include "../../kernels/level2/xtrsv.opencl"
     }) {
 }
 
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index d5d5a7ca..93f4174f 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -25,6 +25,59 @@ Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
     Xgemv<T>(queue, event, name) {
 }
 
+// TODO: Temporary variable, put in database instead
+constexpr size_t TRSV_BLOCK_SIZE = 9;
+
+// =================================================================================================
+
+template <typename T>
+void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
+                            const Transpose a_transpose, const Diagonal diagonal,
+                            const size_t n,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
+                            const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+  if (n > TRSV_BLOCK_SIZE) { throw BLASError(StatusCode::kUnexpectedError); };
+
+  // Retrieves the program from the cache
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
+
+  // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
+  const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
+  const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
+                              (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
+
+  // The data is either in the upper or lower triangle
+  auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
+                   (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
+
+  // Retrieves the kernel from the compiled binary
+  const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
+  auto kernel = Kernel(program, kernel_name);
+
+  // Sets the kernel arguments
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, a_buffer());
+  kernel.SetArgument(2, static_cast<int>(a_offset));
+  kernel.SetArgument(3, static_cast<int>(a_ld));
+  kernel.SetArgument(4, b_buffer());
+  kernel.SetArgument(5, static_cast<int>(b_offset));
+  kernel.SetArgument(6, static_cast<int>(b_inc));
+  kernel.SetArgument(7, x_buffer());
+  kernel.SetArgument(8, static_cast<int>(x_offset));
+  kernel.SetArgument(9, static_cast<int>(x_inc));
+  kernel.SetArgument(10, static_cast<int>(is_transposed));
+  kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
+
+  // Launches the kernel
+  const auto local = std::vector<size_t>{1};
+  const auto global = std::vector<size_t>{1};
+  auto event = Event();
+  RunKernel(kernel, queue_, device_, global, local, event.pointer());
+  event.WaitForCompletion();
+}
+
 // =================================================================================================
 
 // The main routine
@@ -33,24 +86,84 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
                       const Transpose a_transpose, const Diagonal diagonal,
                       const size_t n,
                       const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                      const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {
 
   // Makes sure all dimensions are larger than zero
   if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
 
   // Tests the matrix and vector
   TestMatrixA(n, n, a_buffer, a_offset, a_ld);
-  TestVectorX(n, x_buffer, x_offset, x_inc);
+  TestVectorX(n, b_buffer, b_offset, b_inc);
 
-  // Creates a copy of X: a temporary scratch buffer
-  auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
-  x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+  // Retrieves the program from the cache
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
 
-  // The data is either in the upper or lower triangle
-  size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
-                     (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  // Creates a copy of B to avoid overwriting input while computing output
+  // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels
+  const auto x_offset = b_offset;
+  const auto x_inc = b_inc;
+  const auto x_size = n*x_inc + x_offset;
+  auto x_buffer = Buffer<T>(context_, x_size);
+  b_buffer.CopyTo(queue_, x_size, x_buffer);
+
+  // Fills the output buffer with zeros
+  auto eventWaitList = std::vector<Event>();
+  auto fill_vector_event = Event();
+  FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList,
+             n, x_inc, x_offset, x_buffer, ConstantZero<T>());
+  fill_vector_event.WaitForCompletion();
+
+  // TODO: Not working for row-major at the moment
+  const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kRowMajor) ||
+                              (a_transpose != Transpose::kNo && layout != Layout::kRowMajor));
+
+  // Loops over the blocks
+  auto col = n; // the initial column position
+  for (auto i = size_t{0}; i < n; i += TRSV_BLOCK_SIZE) {
+    const auto block_size = std::min(TRSV_BLOCK_SIZE, n - i);
+
+    if (!is_transposed) {
+
+      // Sets the offsets for upper or lower triangular
+      col = (triangle == Triangle::kUpper) ? col - block_size : i;
+      const auto extra_offset_a = (triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col;
+      const auto extra_offset_x = (triangle == Triangle::kUpper) ? (col+block_size)*x_inc : 0;
+      const auto extra_offset_b = col*x_inc;
+
+      // Runs the GEMV routine to compute x' = A * x
+      if (i > 0) {
+        DoGemv(layout, a_transpose, block_size, i, ConstantOne<T>(),
+               a_buffer, a_offset + extra_offset_a, a_ld,
+               x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+               x_buffer, x_offset + extra_offset_b, x_inc );
+      }
+    }
+    else {
+
+      // Sets the offsets for upper or lower triangular
+      col = (triangle == Triangle::kLower) ? col - block_size : i;
+      const auto extra_offset_a = (triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld;
+      const auto extra_offset_x = (triangle == Triangle::kLower) ? (col+block_size)*x_inc : 0;
+      const auto extra_offset_b = col*x_inc;
+
+      // Runs the GEMV routine to compute x' = A * x
+      if (i > 0) {
+        DoGemv(layout, a_transpose, i, block_size, ConstantOne<T>(),
+               a_buffer, a_offset + extra_offset_a, a_ld,
+               x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+               x_buffer, x_offset + extra_offset_b, x_inc );
+      }
+    }
+
+    // Runs the triangular substitution for the block size
+    Substitution(layout, triangle, a_transpose, diagonal, block_size,
+                 a_buffer, a_offset + col + col*a_ld, a_ld,
+                 b_buffer, b_offset + col*b_inc, b_inc,
+                 x_buffer, x_offset + col*x_inc, x_inc);
+  }
 
-  // TODO: Implement the routine
+  // Retrieves the results
+  x_buffer.CopyTo(queue_, x_size, b_buffer);
 }
 
 // =================================================================================================
diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp
index 4a73b5eb..dc3f32f0 100644
--- a/src/routines/level2/xtrsv.hpp
+++ b/src/routines/level2/xtrsv.hpp
@@ -7,7 +7,9 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file implements the Xtrsv routine.
+// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular
+// forward and backward substitutions on the diagonal parts of the matrix in combination with larger
+// GEMV computation on the remainder of the matrix.
 //
 // =================================================================================================
 
@@ -25,9 +27,12 @@ class Xtrsv: public Xgemv<T> {
  public:
 
   // Uses the generic matrix-vector routine
+  using Xgemv<T>::routine_name_;
   using Xgemv<T>::queue_;
   using Xgemv<T>::context_;
-  using Xgemv<T>::MatVec;
+  using Xgemv<T>::device_;
+  using Xgemv<T>::db_;
+  using Xgemv<T>::DoGemv;
 
   // Constructor
   Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
@@ -38,6 +43,14 @@ class Xtrsv: public Xgemv<T> {
               const size_t n,
               const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
               const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+
+  // Performs forward or backward substitution on a small triangular matrix
+  void Substitution(const Layout layout, const Triangle triangle,
+                    const Transpose a_transpose, const Diagonal diagonal,
+                    const size_t n,
+                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
+                    const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
 };
 
 // =================================================================================================
-- 
cgit v1.2.3