diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-24 21:08:44 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-24 21:08:44 +0100 |
commit | 2f2a510c38c811b53474dd8cc1a3dfff88053cf0 (patch) | |
tree | e8bede0cf1fa7824c83d9c43bb63991535fa20cb /src/routines/level3/xtrsm.cpp | |
parent | 1e5b5157bcc9ed1630ce731b3ce41ed7712d951d (diff) |
Implemented a simple row-major to col-major problem conversion for TRSM
Diffstat (limited to 'src/routines/level3/xtrsm.cpp')
-rw-r--r-- | src/routines/level3/xtrsm.cpp | 77 |
1 files changed, 52 insertions, 25 deletions
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp index 8fe33b64..42855362 100644 --- a/src/routines/level3/xtrsm.cpp +++ b/src/routines/level3/xtrsm.cpp @@ -10,7 +10,7 @@ // This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based // on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular // Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, -// and Jack Dongarra. +// and Jack Dongarra and the OpenCL implementation in clBLAS. // // ================================================================================================= @@ -29,18 +29,49 @@ Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name): Xgemm<T>(queue, event, name) { } - // ================================================================================================= -// The main routine +// The entry point: transforming into col-major (if needed) and then running the col-major version template <typename T> -void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle triangle, +void Xtrsm<T>::DoTrsm(const Layout layout, Side side, Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, + size_t m, size_t n, const T alpha, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { + // Converts row-major to a col-major problem: + // The idea is that + // B = A*X + // can be computed as + // B' = (A*X)' = X'*A' + // Since changing the order is basically a transpose on each matrix, the formula becomes: + // B = X*A + // So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped + if (layout == Layout::kRowMajor) { + std::swap(m, n); + side = (side == Side::kLeft) ? Side::kRight : Side::kLeft; + triangle = (triangle == Triangle::kLower) ? Triangle::kUpper : Triangle::kLower; + } + + // Runs the col-major version of TRSM + TrsmColMajor(side, triangle, a_transpose, diagonal, + m, n, alpha, + a_buffer, a_offset, a_ld, + b_buffer, b_offset, b_ld); +} + +// ================================================================================================= + +// The main routine +template <typename T> +void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { + // Settings constexpr auto block_size = size_t{32}; // tuneable @@ -55,13 +86,11 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian TestMatrixA(k, k, a_buffer, a_offset, a_ld); // Checks for validity of the input B matrix - const auto b_one = (layout == Layout::kRowMajor) ? n : m; - const auto b_two = (layout == Layout::kRowMajor) ? m : n; - TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + TestMatrixB(m, n, b_buffer, b_offset, b_ld); // Creates a copy of B to avoid overwriting input in GEMM while computing output - const auto b_size = b_ld * (b_two - 1) + b_one + b_offset; - const auto x_one = b_one; + const auto b_size = b_ld * (n - 1) + m + b_offset; + const auto x_one = m; const auto x_size = b_size; const auto x_ld = b_ld; const auto x_offset = b_offset; @@ -82,32 +111,30 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian // Inverts the diagonal blocks auto diagonal_invert_event = Event(); auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer()); - inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal, + inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal, k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer); diagonal_invert_event.WaitForCompletion(); // Derives properties based on the arguments - const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || - (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); - const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) || - (layout != Layout::kColMajor && a_transpose != Transpose::kNo)); + const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose == Transpose::kNo)); // Left side if (side == Side::kLeft) { // True when (lower triangular) or (upper triangular and transposed) - if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) { + if (condition) { for (auto i = size_t{0}; i < m; i += block_size) { const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); const auto current_block_size = std::min(m - i, block_size); - DoGemm(layout, a_transpose, Transpose::kNo, + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, current_block_size, n, current_block_size, gemm_alpha, a_inv_buffer, i * block_size, block_size, b_buffer, i, b_ld, ConstantZero<T>(), x_buffer, i, x_ld); if (i + block_size >= m) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld; - DoGemm(layout, a_transpose, Transpose::kNo, + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, m - i - block_size, n, block_size, ConstantNegOne<T>(), a_buffer, this_a_offset, a_ld, x_buffer, i, x_ld, ConstantOne<T>(), @@ -121,14 +148,14 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size); for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); - DoGemm(layout, a_transpose, Transpose::kNo, + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, current_block_size, n, current_block_size, gemm_alpha, a_inv_buffer, i * block_size, block_size, b_buffer, i, b_ld, ConstantZero<T>(), x_buffer, i, x_ld); if (i - static_cast<int>(block_size) < 0) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i; - DoGemm(layout, a_transpose, Transpose::kNo, + DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo, i, n, block_size, ConstantNegOne<T>(), a_buffer, this_a_offset, a_ld, x_buffer, i, x_ld, ConstantOne<T>(), @@ -141,19 +168,19 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian else { // True when (lower triangular) or (upper triangular and transposed) - if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) { + if (condition) { const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size); const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size); for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); - DoGemm(layout, Transpose::kNo, a_transpose, + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, current_block_size, current_block_size, gemm_alpha, b_buffer, i * b_ld, b_ld, a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), x_buffer, i * x_ld, x_ld); if (i - static_cast<int>(block_size) < 0) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld; - DoGemm(layout, Transpose::kNo, a_transpose, + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, i, current_block_size, ConstantNegOne<T>(), x_buffer, i * x_ld, x_ld, a_buffer, this_a_offset, a_ld, ConstantOne<T>(), @@ -166,14 +193,14 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian for (auto i = size_t{0}; i < n; i += block_size) { const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); const auto current_block_size = std::min(n - i, block_size); - DoGemm(layout, Transpose::kNo, a_transpose, + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, current_block_size, current_block_size, gemm_alpha, b_buffer, i * b_ld, b_ld, a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), x_buffer, i * x_ld, x_ld); if (i + block_size >= n) { break; } const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld; - DoGemm(layout, Transpose::kNo, a_transpose, + DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose, m, n - i - block_size, block_size, ConstantNegOne<T>(), x_buffer, i * x_ld, x_ld, a_buffer, this_a_offset, a_ld, ConstantOne<T>(), |