author     Cedric Nugteren <web@cedricnugteren.nl>    2017-01-18 21:29:59 +0100
committer  Cedric Nugteren <web@cedricnugteren.nl>    2017-01-18 21:29:59 +0100
commit     df9a77d74d87fb8832264e9e9a37336001873151 (patch)
tree       516e113140164daa0d918803dee64b94b685afb6 /src/routines
parent     4b3ffd998904f5c848edc5917308f5942fa71da3 (diff)
Added first version of the TRSM routine based on the diagonal invert kernel
Diffstat (limited to 'src/routines')
-rw-r--r--  src/routines/level3/xtrsm.cpp   | 147
-rw-r--r--  src/routines/levelx/xinvert.cpp |  10
2 files changed, 145 insertions(+), 12 deletions(-)
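For reference, TRSM solves op(A) * X = alpha * B (Side::kLeft) or X * op(A) = alpha * B (Side::kRight) for the matrix X, where A is triangular; the solution overwrites B. Below is a minimal scalar sketch of the left, lower, non-transposed case, independent of CLBlast's API; the function name and the column-major layout assumption are illustrative only:

#include <cstddef>
#include <vector>

// Non-blocked reference for the left/lower/no-transpose TRSM case: solves
// A * X = alpha * B by forward substitution, overwriting B with X. A is an
// m x m lower-triangular matrix and B is m x n, both column-major.
// Hypothetical helper for illustration only, not part of CLBlast.
template <typename T>
void ReferenceTrsmLowerLeft(const size_t m, const size_t n, const T alpha,
                            const std::vector<T> &a, const size_t a_ld,
                            std::vector<T> &b, const size_t b_ld,
                            const bool unit_diagonal) {
  for (size_t j = 0; j < n; ++j) {      // one right-hand-side column at a time
    for (size_t i = 0; i < m; ++i) {    // forward substitution over the rows
      T value = alpha * b[i + j * b_ld];
      for (size_t l = 0; l < i; ++l) {  // b[l + j*b_ld] already holds x_l
        value -= a[i + l * a_ld] * b[l + j * b_ld];
      }
      b[i + j * b_ld] = unit_diagonal ? value : value / a[i + i * a_ld];
    }
  }
}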
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 0ac1a58e..8061b508 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -7,11 +7,15 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file implements the Xtrsm class (see the header for information about the class).
+// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based
+// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
+// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
+// and Jack Dongarra.
//
// =================================================================================================
#include "routines/level3/xtrsm.hpp"
+#include "routines/levelx/xinvert.hpp"
#include <string>
#include <vector>
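The blocked scheme referenced in the new header comment is easiest to see for a lower-triangular matrix split into a 2 x 2 grid of blocks: once the diagonal blocks are inverted, each step becomes a pair of matrix-matrix multiplications rather than a solve:

\[
\begin{pmatrix} A_{11} & 0 \\ A_{21} & A_{22} \end{pmatrix}
\begin{pmatrix} X_1 \\ X_2 \end{pmatrix}
= \alpha \begin{pmatrix} B_1 \\ B_2 \end{pmatrix}
\;\Rightarrow\;
X_1 = A_{11}^{-1}(\alpha B_1), \qquad
X_2 = A_{22}^{-1}(\alpha B_2 - A_{21} X_1).
\]

This is exactly the shape of the loops added below: one GEMM with an inverted diagonal block, then one GEMM to update the remaining rows (or columns) of B.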
@@ -25,6 +29,7 @@ Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
Xgemm<T>(queue, event, name) {
}
+
// =================================================================================================
// The main routine
@@ -36,27 +41,153 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+ // Settings
+ constexpr auto block_size = size_t{32}; // tuneable
+
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
// Computes the k dimension. This is based on whether or not matrix is A (on the left)
// or B (on the right) in the Xgemm routine.
- auto k = (side == Side::kLeft) ? m : n;
+ const auto k = (side == Side::kLeft) ? m : n;
// Checks for validity of the triangular A matrix
TestMatrixA(k, k, a_buffer, a_offset, a_ld);
- // Checks for validity of the input/output B matrix
+ // Determines which kernels to run based on the layout (the kernels assume column-major by
+ // default) and on whether the triangular matrix is upper or lower
+ const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
+ (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+
+ // Checks for validity of the input B matrix
const auto b_one = (layout == Layout::kRowMajor) ? n : m;
const auto b_two = (layout == Layout::kRowMajor) ? m : n;
TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
// Creates a copy of B to avoid overwriting input in GEMM while computing output
- const auto b_size = (b_ld * (b_two - 1) + b_one + b_offset);
- auto b_buffer_copy = Buffer<T>(context_, b_size);
- b_buffer.CopyTo(queue_, b_size, b_buffer_copy);
-
- // TODO: Implement TRSM computation
+ const auto b_size = b_ld * (b_two - 1) + b_one + b_offset;
+ const auto x_one = b_one;
+ const auto x_size = b_size;
+ const auto x_ld = b_ld;
+ const auto x_offset = b_offset;
+ auto x_buffer = Buffer<T>(context_, x_size);
+ b_buffer.CopyTo(queue_, x_size, x_buffer);
+
+ // Temporary buffer for the inverse of the A matrix
+ const auto a_inv_size = Ceil(k, block_size) * block_size;
+ auto a_inv_buffer = Buffer<T>(context_, a_inv_size);
+
+ // Fills the output buffer with zeros
+ auto eventWaitList = std::vector<Event>();
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSM");
+ auto fill_matrix_event = Event();
+ FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), eventWaitList,
+ x_one, x_ld, x_offset, x_buffer, ConstantZero<T>());
+ fill_matrix_event.WaitForCompletion();
+
+ // Inverts the diagonal blocks
+ auto diagonal_invert_event = Event();
+ auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer());
+ inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal,
+ k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
+ diagonal_invert_event.WaitForCompletion();
+
+ // Lower or upper triangular
+ const bool condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
+ (triangle == Triangle::kLower && a_transpose == Transpose::kNo));
+
+ // Left side
+ if (side == Side::kLeft) {
+
+ // True when (lower triangular) or (upper triangular and transposed)
+ if (condition) {
+ for (auto i = size_t{0}; i < m; i += block_size) {
+ const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
+ const auto current_block_size = std::min(m - i, block_size);
+ DoGemm(layout, a_transpose, Transpose::kNo,
+ current_block_size, n, current_block_size, gemm_alpha,
+ a_inv_buffer, i * block_size, block_size,
+ b_buffer, i, b_ld, ConstantZero<T>(),
+ x_buffer, i, x_ld);
+ if (i + block_size >= m) { break; }
+ const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
+ DoGemm(layout, a_transpose, Transpose::kNo,
+ m - i - block_size, n, block_size, ConstantNegOne<T>(),
+ a_buffer, this_a_offset, a_ld,
+ x_buffer, i, x_ld, ConstantOne<T>(),
+ b_buffer, i + block_size, b_ld);
+ }
+ }
+
+ // True when (upper triangular) or (lower triangular and transposed)
+ else {
+ const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size);
+ const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size);
+ for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
+ const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
+ DoGemm(layout, a_transpose, Transpose::kNo,
+ block_size, n, block_size, gemm_alpha,
+ a_inv_buffer, i * block_size, block_size,
+ b_buffer, i, b_ld, ConstantZero<T>(),
+ x_buffer, i, x_ld);
+ if (i - static_cast<int>(block_size) < 0) { break; }
+ const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
+ DoGemm(layout, a_transpose, Transpose::kNo,
+ i, n, block_size, ConstantNegOne<T>(),
+ a_buffer, this_a_offset, a_ld,
+ x_buffer, i, x_ld, ConstantOne<T>(),
+ b_buffer, 0, b_ld);
+ }
+ }
+ }
+
+ // Right side
+ else {
+
+ // True when (lower triangular) or (upper triangular and transposed)
+ if (condition) {
+ const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
+ const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
+ for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
+ const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
+ DoGemm(layout, Transpose::kNo, a_transpose,
+ m, block_size, block_size, gemm_alpha,
+ b_buffer, i * b_ld, b_ld,
+ a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
+ x_buffer, i * x_ld, x_ld);
+ if (i - static_cast<int>(block_size) < 0) { break; }
+ const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
+ DoGemm(layout, Transpose::kNo, a_transpose,
+ m, i, block_size, ConstantNegOne<T>(),
+ x_buffer, i * x_ld, x_ld,
+ a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
+ b_buffer, 0, b_ld);
+ }
+ }
+
+ // True when (upper triangular) or (lower triangular and transposed)
+ else {
+ for (auto i = size_t{0}; i < n; i += block_size) {
+ const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
+ const auto current_block_size = std::min(n - i, block_size);
+ DoGemm(layout, Transpose::kNo, a_transpose,
+ m, current_block_size, current_block_size, gemm_alpha,
+ b_buffer, i * b_ld, b_ld,
+ a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
+ x_buffer, i * x_ld, x_ld);
+ if (i + block_size >= n) { break; }
+ const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
+ DoGemm(layout, Transpose::kNo, a_transpose,
+ m, n - i - block_size, block_size, ConstantNegOne<T>(),
+ x_buffer, i * x_ld, x_ld,
+ a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
+ b_buffer, (i + block_size) * b_ld, b_ld);
+ }
+ }
+ }
+
+ // Retrieves the results
+ x_buffer.CopyTo(queue_, b_size, b_buffer);
}
// =================================================================================================
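A plain-CPU sketch of the left/lower/no-transpose loop above may help in reading the DoGemm calls. The inverted diagonal blocks are packed as consecutive block_size x block_size column-major chunks (the block starting at row i lives at offset i * block_size, hence the buffer of Ceil(k, block_size) * block_size elements). All names here are hypothetical stand-ins, not CLBlast's API. Note also that this sketch folds alpha into the first trailing update's beta so that later block rows see alpha * B as well; the committed loop keeps beta at one, which coincides with this only when alpha equals one:

#include <algorithm>
#include <cstddef>

// Minimal column-major GEMM stand-in for the DoGemm calls in the diff:
// C = alpha * A * B + beta * C, with A m x k, B k x n, C m x n.
// Beta == 0 overwrites C, matching BLAS semantics.
template <typename T>
void PlainGemm(const size_t m, const size_t n, const size_t k, const T alpha,
               const T *a, const size_t a_ld, const T *b, const size_t b_ld,
               const T beta, T *c, const size_t c_ld) {
  for (size_t j = 0; j < n; ++j) {
    for (size_t i = 0; i < m; ++i) {
      T acc = T{0};
      for (size_t l = 0; l < k; ++l) { acc += a[i + l * a_ld] * b[l + j * b_ld]; }
      c[i + j * c_ld] = (beta == T{0}) ? alpha * acc
                                       : alpha * acc + beta * c[i + j * c_ld];
    }
  }
}

// Blocked left/lower/no-transpose TRSM using the pre-inverted diagonal
// blocks 'a_inv' (packed bs x bs chunks). Mirrors the first loop in DoTrsm.
template <typename T>
void BlockedTrsmLowerLeft(const size_t m, const size_t n, const T alpha,
                          const size_t bs, const T *a, const size_t a_ld,
                          const T *a_inv, T *b, const size_t b_ld,
                          T *x, const size_t x_ld) {
  for (size_t i = 0; i < m; i += bs) {
    const auto gemm_alpha = (i == 0) ? alpha : T{1};
    const auto current_block_size = std::min(m - i, bs);
    // X_i = gemm_alpha * inv(A_ii) * B_i
    PlainGemm(current_block_size, n, current_block_size, gemm_alpha,
              a_inv + i * bs, bs, b + i, b_ld, T{0}, x + i, x_ld);
    if (i + bs >= m) { break; }
    // B_{i+bs..m} = -A_{i+bs..m, i..i+bs} * X_i + gemm_beta * B_{i+bs..m},
    // with gemm_beta = alpha on the first pass so alpha reaches every block row
    const auto gemm_beta = (i == 0) ? alpha : T{1};
    PlainGemm(m - i - bs, n, bs, T{-1}, a + (i + bs) + i * a_ld, a_ld,
              x + i, x_ld, gemm_beta, b + i + bs, b_ld);
  }
}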
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 5ffba958..ffee9b7c 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -27,6 +27,7 @@ namespace clblast {
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
+ #include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/invert_diagonal_blocks.opencl"
}) {
}
@@ -91,8 +92,9 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
const auto local = std::vector<size_t>{internal_block_size};
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
auto base_kernel_event = Event();
- RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list);
- event_wait_list.push_back(base_kernel_event);
+ auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
+ RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
+ if (internal_block_size != block_size) { event_wait_list.push_back(base_kernel_event); }
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
// use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads;
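The comment above describes a doubling scheme: starting from internal_block_size x internal_block_size inverted triangles, each pass multiplies pairs of sub-blocks to form blocks twice as large, so the number of passes is log2(block_size / internal_block_size). A small sketch of that pass count (hypothetical helper, sizes taken from the comment):

#include <cstddef>

// Number of doubling passes needed to grow inverted diagonal blocks from
// 'internal_block_size' up to 'block_size' (both assumed powers of two)
constexpr size_t NumDoublingPasses(size_t internal_block_size, size_t block_size) {
  size_t passes = 0;
  for (auto size = internal_block_size; size < block_size; size *= 2) { ++passes; }
  return passes;
}
// Example: NumDoublingPasses(16, 128) == 3 (16 -> 32 -> 64 -> 128)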
@@ -130,8 +132,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
kernel2.SetArgument(3, static_cast<int>(npages));
kernel2.SetArgument(4, static_cast<int>(block_size));
auto kernel2_event = Event();
- auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
- RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list);
+ auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
+ RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
// Exit in case we reach beyond the bounds of the input matrix
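The xinvert.cpp changes are about event ordering: each intermediate kernel launch signals a local Event that is appended to the wait list of the launches that follow, while the final launch signals the routine's own event_ so the caller observes completion of the whole sequence. A self-contained toy model of that chaining pattern follows; the types and names are illustrative, not CLBlast's actual Event/RunKernel API:

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// Toy stand-ins for CLBlast's event plumbing, for illustration only
struct Event { std::string name; };
using EventPointer = Event *;

// Models a kernel launch: it waits on 'wait_list' and signals 'completion'
void RunToyKernel(const std::string &kernel, EventPointer completion,
                  const std::vector<Event> &wait_list) {
  std::printf("%s: waits on %zu event(s), signals '%s'\n",
              kernel.c_str(), wait_list.size(), completion->name.c_str());
}

int main() {
  Event routine_event{"event_"};  // the routine's final completion event
  auto event_wait_list = std::vector<Event>();
  const size_t num_passes = 3;    // e.g. building 16 -> 32 -> 64 blocks
  std::vector<Event> pass_events(num_passes);
  for (size_t pass = 0; pass < num_passes; ++pass) {
    const bool is_last_kernel = (pass == num_passes - 1);
    pass_events[pass] = Event{"pass_" + std::to_string(pass)};
    // The last kernel signals the routine's event; earlier ones chain locally
    const auto event_pointer =
        is_last_kernel ? &routine_event : &pass_events[pass];
    RunToyKernel("TripleMatMul pass " + std::to_string(pass), event_pointer,
                 event_wait_list);
    if (!is_last_kernel) { event_wait_list.push_back(pass_events[pass]); }
  }
  return 0;
}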