From 4b3ffd998904f5c848edc5917308f5942fa71da3 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sun, 15 Jan 2017 17:30:00 +0100
Subject: Added a first version of the diagonal block invert routine in
 preparation of TRSM

---
 src/routines/common.hpp         |  21 ++++++
 src/routines/levelx/xinvert.cpp | 152 ++++++++++++++++++++++++++++++++++++++++
 src/routines/levelx/xinvert.hpp |  40 +++++++++++
 3 files changed, 213 insertions(+)
 create mode 100644 src/routines/levelx/xinvert.cpp
 create mode 100644 src/routines/levelx/xinvert.hpp

(limited to 'src/routines')
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 53ca6355..47f98de0 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -33,6 +33,27 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
 
 // =================================================================================================
 
+// Sets all elements of a matrix to a constant value
+template <typename T>
+void FillMatrix(Queue &queue, const Device &device,
+                const Program &program, const Database &,
+                EventPointer event, const std::vector<Event> &waitForEvents,
+                const size_t n, const size_t ld, const size_t offset,
+                const Buffer<T> &dest,
+                const T constant_value) {
+  auto kernel = Kernel(program, "FillMatrix");
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, static_cast<int>(ld));
+  kernel.SetArgument(2, static_cast<int>(offset));
+  kernel.SetArgument(3, dest());
+  kernel.SetArgument(4, GetRealArg(constant_value));
+  auto local = std::vector<size_t>{8, 8};
+  auto global = std::vector<size_t>{Ceil(ld, 8), Ceil(n, 8)};
+  RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+}
+
+// =================================================================================================
+
 // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
 // to write to symmetric and triangular matrices through optional arguments.
 template <typename T>
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
new file mode 100644
index 00000000..5ffba958
--- /dev/null
+++ b/src/routines/levelx/xinvert.cpp
@@ -0,0 +1,152 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the common code to perform (partial) matrix inverting. This code is based
+// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
+// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
+// and Jack Dongarra.
+//
+// =================================================================================================
+
+#include "routines/levelx/xinvert.hpp"
+
+#include <string>
+#include <vector>
+#include <assert.h>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
+    Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
+    #include "../../kernels/level3/invert_diagonal_blocks.opencl"
+    }) {
+}
+
+// =================================================================================================
+
+// Inverts diagonal square blocks of a matrix
+template <typename T>
+void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
+                                            const size_t n, const size_t block_size,
+                                            const Buffer<T> &src, const size_t offset, const size_t ld_src,
+                                            Buffer<T> &dest) {
+
+  // Makes sure all dimensions are larger than zero and the block size is smaller than n
+  if ((block_size == 0) || (n == 0) || (block_size > n)) {
+    throw BLASError(StatusCode::kInvalidDimension);
+  }
+
+  // Helper variables
+  const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]);
+  assert(internal_block_size == 16);
+  const auto num_blocks = CeilDiv(n, block_size);
+  const auto num_internal_blocks = CeilDiv(n, internal_block_size);
+  const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false;
+
+  // This routine only supports block sizes which are a multiple of the internal block size and
+  // block sizes up to and including 128
+  if ((block_size % internal_block_size != 0) || (block_size > 128)) {
+    throw BLASError(StatusCode::kInvalidDimension);
+  }
+
+  // Checks for validity of the source and destination matrices
+  TestMatrixA(n, n, src, offset, ld_src);
+  TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size);
+
+  // Determines which kernels to run based on the layout (the kernels assume column-major as
+  // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
+  const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+                         (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+  const auto name_postfix = (is_upper) ? "Upper" : "Lower";
+
+  // Retrieves the program from the cache
+  auto event_wait_list = std::vector<Event>();
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "INVERT");
+
+  // Fills the output buffer with zeros
+  auto fill_matrix_event = Event();
+  FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list,
+             num_blocks * block_size, block_size, 0, dest, ConstantZero<T>());
+  event_wait_list.push_back(fill_matrix_event);
+
+  // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group
+  auto kernel = Kernel(program, "InvertDiagonalBlock");
+  kernel.SetArgument(0, static_cast<int>(n));
+  kernel.SetArgument(1, src());
+  kernel.SetArgument(2, static_cast<int>(offset));
+  kernel.SetArgument(3, static_cast<int>(ld_src));
+  kernel.SetArgument(4, dest());
+  kernel.SetArgument(5, static_cast<int>(block_size));
+  kernel.SetArgument(6, static_cast<int>(unit_diagonal));
+  kernel.SetArgument(7, static_cast<int>(is_upper));
+  const auto local = std::vector<size_t>{internal_block_size};
+  const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
+  auto base_kernel_event = Event();
+  RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list);
+  event_wait_list.push_back(base_kernel_event);
+
+  // Builds up block_size x block_size blocks. For example, internal_block_size=16:
+  // use   16 x 16  blocks to build  32 x 32  blocks,  1 x (1 x npages) grid,  4 x 4 threads;
+  // then  32 x 32  blocks to build  64 x 64  blocks,  1 x (2 x npages) grid,  8 x 4 threads;
+  // then  64 x 64  blocks to build 128 x 128 blocks,  1 x (4 x npages) grid, 16 x 4 threads;
+  for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) {
+    assert(current_size == 16 || current_size == 32 || current_size == 64);
+
+    // Emulates a 3D grid: NX * (NY * npages)
+    const auto npages = CeilDiv(n, current_size*2);
+    const auto local0 = (current_size <= 32) ? current_size/4 : 16;
+    const auto local = std::vector<size_t>{local0, 4};
+    const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]};
+
+    // Part 1
+    auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix);
+    kernel1.SetArgument(0, static_cast<int>(n));
+    kernel1.SetArgument(1, src());
+    kernel1.SetArgument(2, static_cast<int>(offset));
+    kernel1.SetArgument(3, static_cast<int>(ld_src));
+    kernel1.SetArgument(4, dest());
+    kernel1.SetArgument(5, static_cast<int>(current_size));
+    kernel1.SetArgument(6, static_cast<int>(npages));
+    kernel1.SetArgument(7, static_cast<int>(block_size));
+    auto kernel1_event = Event();
+    RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list);
+    event_wait_list.push_back(kernel1_event);
+
+    // Part 2
+    const bool is_last_kernel = (current_size * 2 >= block_size);
+    auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix);
+    kernel2.SetArgument(0, static_cast<int>(n));
+    kernel2.SetArgument(1, dest());
+    kernel2.SetArgument(2, static_cast<int>(current_size));
+    kernel2.SetArgument(3, static_cast<int>(npages));
+    kernel2.SetArgument(4, static_cast<int>(block_size));
+    auto kernel2_event = Event();
+    auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
+    RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list);
+    if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
+
+    // Exit in case we reach beyond the bounds of the input matrix
+    if (current_size*2 >= n) { break; }
+  }
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xinvert<half>;
+template class Xinvert<float>;
+template class Xinvert<double>;
+template class Xinvert<float2>;
+template class Xinvert<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp
new file mode 100644
index 00000000..fa0a80e7
--- /dev/null
+++ b/src/routines/levelx/xinvert.hpp
@@ -0,0 +1,40 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the common code to perform (partial) matrix inverting.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XINVERT_H_
+#define CLBLAST_ROUTINES_XINVERT_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+template <typename T>
+class Xinvert: public Routine {
+ public:
+
+  // Constructor
+  Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT");
+
+  // Inverts diagonal square blocks of a matrix
+  void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag,
+                                  const size_t n, const size_t block_size,
+                                  const Buffer<T> &src, const size_t offset, const size_t ld_src,
+                                  Buffer<T> &dest);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XINVERT_H_
+#endif
-- 
cgit v1.2.3