diff options
Diffstat (limited to 'src/routines/level3/xtrsm.cpp')
-rw-r--r-- | src/routines/level3/xtrsm.cpp | 202 |
1 files changed, 202 insertions, 0 deletions
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp new file mode 100644 index 00000000..3a910261 --- /dev/null +++ b/src/routines/level3/xtrsm.cpp @@ -0,0 +1,202 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= + +#include "routines/level3/xtrsm.hpp" +#include "routines/levelx/xinvert.hpp" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name): + Xgemm<T>(queue, event, name) { +} + + +// ================================================================================================= + +// The main routine +template <typename T> +void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Settings + constexpr auto block_size = size_t{32}; // tuneable + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } + + // Computes the k dimension. This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + const auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the triangular A matrix + TestMatrixA(k, k, a_buffer, a_offset, a_ld); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Checks for validity of the input B matrix + const auto b_one = (layout == Layout::kRowMajor) ? n : m; + const auto b_two = (layout == Layout::kRowMajor) ? m : n; + TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + + // Creates a copy of B to avoid overwriting input in GEMM while computing output + const auto b_size = b_ld * (b_two - 1) + b_one + b_offset; + const auto x_one = b_one; + const auto x_size = b_size; + const auto x_ld = b_ld; + const auto x_offset = b_offset; + auto x_buffer = Buffer<T>(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Temporary buffer for the inverse of the A matrix + const auto a_inv_size = Ceil(k, block_size) * block_size; + auto a_inv_buffer = Buffer<T>(context_, a_inv_size); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector<Event>(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, + x_one, x_ld, x_offset, x_buffer, ConstantZero<T>()); + fill_matrix_event.WaitForCompletion(); + + // Inverts the diagonal blocks + auto diagonal_invert_event = Event(); + auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer()); + inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal, + k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer); + diagonal_invert_event.WaitForCompletion(); + + // Lower of upper triangular + const bool condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose == Transpose::kNo)); + + // Left side + if (side == Side::kLeft) { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + for (auto i = size_t{0}; i < m; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); + const auto current_block_size = std::min(m - i, block_size); + DoGemm(layout, a_transpose, Transpose::kNo, + current_block_size, n, current_block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero<T>(), + x_buffer, i, x_ld); + if (i + block_size >= m) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld; + DoGemm(layout, a_transpose, Transpose::kNo, + m - i - block_size, n, block_size, ConstantNegOne<T>(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne<T>(), + b_buffer, i + block_size, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size); + const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); + DoGemm(layout, a_transpose, Transpose::kNo, + block_size, n, block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero<T>(), + x_buffer, i, x_ld); + if (i - static_cast<int>(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i; + DoGemm(layout, a_transpose, Transpose::kNo, + i, n, block_size, ConstantNegOne<T>(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne<T>(), + b_buffer, 0, b_ld); + } + } + } + + // Right side + else { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size); + const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); + DoGemm(layout, Transpose::kNo, a_transpose, + m, block_size, block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), + x_buffer, i * x_ld, x_ld); + if (i - static_cast<int>(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld; + DoGemm(layout, Transpose::kNo, a_transpose, + m, i, block_size, ConstantNegOne<T>(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne<T>(), + b_buffer, 0, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + for (auto i = size_t{0}; i < n; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); + const auto current_block_size = std::min(n - i, block_size); + DoGemm(layout, Transpose::kNo, a_transpose, + m, current_block_size, current_block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), + x_buffer, i * x_ld, x_ld); + if (i + block_size >= n) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld; + DoGemm(layout, Transpose::kNo, a_transpose, + m, n - i - block_size, block_size, ConstantNegOne<T>(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne<T>(), + b_buffer, (i + block_size) * b_ld, b_ld); + } + } + } + + // Retrieves the results + x_buffer.CopyTo(queue_, b_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsm<half>; +template class Xtrsm<float>; +template class Xtrsm<double>; +template class Xtrsm<float2>; +template class Xtrsm<double2>; + +// ================================================================================================= +} // namespace clblast |