summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-02-24 21:08:44 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2017-02-24 21:08:44 +0100
commit2f2a510c38c811b53474dd8cc1a3dfff88053cf0 (patch)
treee8bede0cf1fa7824c83d9c43bb63991535fa20cb /src
parent1e5b5157bcc9ed1630ce731b3ce41ed7712d951d (diff)
Implemented a simple row-major to col-major problem conversion for TRSM
Diffstat (limited to 'src')
-rw-r--r--src/routines/level3/xtrsm.cpp77
-rw-r--r--src/routines/level3/xtrsm.hpp12
2 files changed, 62 insertions, 27 deletions
diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp
index 8fe33b64..42855362 100644
--- a/src/routines/level3/xtrsm.cpp
+++ b/src/routines/level3/xtrsm.cpp
@@ -10,7 +10,7 @@
// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based
// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular
// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek,
-// and Jack Dongarra.
+// and Jack Dongarra and the OpenCL implementation in clBLAS.
//
// =================================================================================================
@@ -29,18 +29,49 @@ Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name):
Xgemm<T>(queue, event, name) {
}
-
// =================================================================================================
-// The main routine
+// The entry point: transforming into col-major (if needed) and then running the col-major version
template <typename T>
-void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle triangle,
+void Xtrsm<T>::DoTrsm(const Layout layout, Side side, Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
+ size_t m, size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+ // Converts row-major to a col-major problem:
+ // The idea is that
+ // B = A*X
+ // can be computed as
+ // B' = (A*X)' = X'*A'
+ // Since changing the order is basically a transpose on each matrix, the formula becomes:
+ // B = X*A
+ // So only the side (left/right) and the triangle (upper/lower) are changed and M/N are swapped
+ if (layout == Layout::kRowMajor) {
+ std::swap(m, n);
+ side = (side == Side::kLeft) ? Side::kRight : Side::kLeft;
+ triangle = (triangle == Triangle::kLower) ? Triangle::kUpper : Triangle::kLower;
+ }
+
+ // Runs the col-major version of TRSM
+ TrsmColMajor(side, triangle, a_transpose, diagonal,
+ m, n, alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld);
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+void Xtrsm<T>::TrsmColMajor(const Side side, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+
// Settings
constexpr auto block_size = size_t{32}; // tuneable
@@ -55,13 +86,11 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
TestMatrixA(k, k, a_buffer, a_offset, a_ld);
// Checks for validity of the input B matrix
- const auto b_one = (layout == Layout::kRowMajor) ? n : m;
- const auto b_two = (layout == Layout::kRowMajor) ? m : n;
- TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
+ TestMatrixB(m, n, b_buffer, b_offset, b_ld);
// Creates a copy of B to avoid overwriting input in GEMM while computing output
- const auto b_size = b_ld * (b_two - 1) + b_one + b_offset;
- const auto x_one = b_one;
+ const auto b_size = b_ld * (n - 1) + m + b_offset;
+ const auto x_one = m;
const auto x_size = b_size;
const auto x_ld = b_ld;
const auto x_offset = b_offset;
@@ -82,32 +111,30 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
// Inverts the diagonal blocks
auto diagonal_invert_event = Event();
auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer());
- inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal,
+ inverter.InvertMatrixDiagonalBlocks(Layout::kColMajor, triangle, diagonal,
k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer);
diagonal_invert_event.WaitForCompletion();
// Derives properties based on the arguments
- const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
- (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
- const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) ||
- (layout != Layout::kColMajor && a_transpose != Transpose::kNo));
+ const auto condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) ||
+ (triangle == Triangle::kLower && a_transpose == Transpose::kNo));
// Left side
if (side == Side::kLeft) {
// True when (lower triangular) or (upper triangular and transposed)
- if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) {
+ if (condition) {
for (auto i = size_t{0}; i < m; i += block_size) {
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
const auto current_block_size = std::min(m - i, block_size);
- DoGemm(layout, a_transpose, Transpose::kNo,
+ DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
current_block_size, n, current_block_size, gemm_alpha,
a_inv_buffer, i * block_size, block_size,
b_buffer, i, b_ld, ConstantZero<T>(),
x_buffer, i, x_ld);
if (i + block_size >= m) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld;
- DoGemm(layout, a_transpose, Transpose::kNo,
+ DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
m - i - block_size, n, block_size, ConstantNegOne<T>(),
a_buffer, this_a_offset, a_ld,
x_buffer, i, x_ld, ConstantOne<T>(),
@@ -121,14 +148,14 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size);
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
- DoGemm(layout, a_transpose, Transpose::kNo,
+ DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
current_block_size, n, current_block_size, gemm_alpha,
a_inv_buffer, i * block_size, block_size,
b_buffer, i, b_ld, ConstantZero<T>(),
x_buffer, i, x_ld);
if (i - static_cast<int>(block_size) < 0) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i;
- DoGemm(layout, a_transpose, Transpose::kNo,
+ DoGemm(Layout::kColMajor, a_transpose, Transpose::kNo,
i, n, block_size, ConstantNegOne<T>(),
a_buffer, this_a_offset, a_ld,
x_buffer, i, x_ld, ConstantOne<T>(),
@@ -141,19 +168,19 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
else {
// True when (lower triangular) or (upper triangular and transposed)
- if ((!is_upper && !is_transposed) || (is_upper && is_transposed)) {
+ if (condition) {
const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size);
const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size);
for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) {
const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>();
- DoGemm(layout, Transpose::kNo, a_transpose,
+ DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, current_block_size, current_block_size, gemm_alpha,
b_buffer, i * b_ld, b_ld,
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
x_buffer, i * x_ld, x_ld);
if (i - static_cast<int>(block_size) < 0) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld;
- DoGemm(layout, Transpose::kNo, a_transpose,
+ DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, i, current_block_size, ConstantNegOne<T>(),
x_buffer, i * x_ld, x_ld,
a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
@@ -166,14 +193,14 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian
for (auto i = size_t{0}; i < n; i += block_size) {
const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>();
const auto current_block_size = std::min(n - i, block_size);
- DoGemm(layout, Transpose::kNo, a_transpose,
+ DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, current_block_size, current_block_size, gemm_alpha,
b_buffer, i * b_ld, b_ld,
a_inv_buffer, i * block_size, block_size, ConstantZero<T>(),
x_buffer, i * x_ld, x_ld);
if (i + block_size >= n) { break; }
const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld;
- DoGemm(layout, Transpose::kNo, a_transpose,
+ DoGemm(Layout::kColMajor, Transpose::kNo, a_transpose,
m, n - i - block_size, block_size, ConstantNegOne<T>(),
x_buffer, i * x_ld, x_ld,
a_buffer, this_a_offset, a_ld, ConstantOne<T>(),
diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp
index b9d5432a..5b42398e 100644
--- a/src/routines/level3/xtrsm.hpp
+++ b/src/routines/level3/xtrsm.hpp
@@ -37,12 +37,20 @@ class Xtrsm: public Xgemm<T> {
Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM");
// Templated-precision implementation of the routine
- void DoTrsm(const Layout layout, const Side side, const Triangle triangle,
+ void DoTrsm(const Layout layout, Side side, Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
+ size_t m, size_t n,
const T alpha,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+
+ // Implementation of the column-major version
+ void TrsmColMajor(const Side side, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
};
// =================================================================================================