diff options
Diffstat (limited to 'src/routines')
-rw-r--r-- | src/routines/common.hpp | 40 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cpp | 3 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.cpp | 161 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.hpp | 60 | ||||
-rw-r--r-- | src/routines/level3/xtrsm.cpp | 202 | ||||
-rw-r--r-- | src/routines/level3/xtrsm.hpp | 52 | ||||
-rw-r--r-- | src/routines/levelx/xinvert.cpp | 151 | ||||
-rw-r--r-- | src/routines/levelx/xinvert.hpp | 40 |
8 files changed, 708 insertions, 1 deletions
diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 7c211c0d..bdea0086 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -33,6 +33,46 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // ================================================================================================= +// Sets all elements of a matrix to a constant value +template <typename T> +void FillMatrix(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t n, const size_t ld, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillMatrix"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, static_cast<int>(ld)); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector<size_t>{8, 8}; + auto global = std::vector<size_t>{Ceil(ld, 8), Ceil(n, 8)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// Sets all elements of a vector to a constant value +template <typename T> +void FillVector(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t n, const size_t inc, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillVector"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, static_cast<int>(inc)); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector<size_t>{64}; + auto global = std::vector<size_t>{Ceil(n, 64)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// ================================================================================================= + // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. template <typename T> diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 9e9c2db4..7d2e5f60 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,9 +22,10 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, { + Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot", "Xtrsv"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" + #include "../../kernels/level2/xtrsv.opencl" }) { } diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp new file mode 100644 index 00000000..d5d009ff --- /dev/null +++ b/src/routines/level2/xtrsv.cpp @@ -0,0 +1,161 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrsv class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/level2/xtrsv.hpp" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name): + Xgemv<T>(queue, event, name) { +} + +// ================================================================================================= + +template <typename T> +void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }; + + // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel + const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1; + const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0; + + // The data is either in the upper or lower triangle + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + + // Retrieves the kernel from the compiled binary + const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward"; + auto kernel = Kernel(program_, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, a_buffer()); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, static_cast<int>(a_ld)); + kernel.SetArgument(4, b_buffer()); + kernel.SetArgument(5, static_cast<int>(b_offset)); + kernel.SetArgument(6, static_cast<int>(b_inc)); + kernel.SetArgument(7, x_buffer()); + kernel.SetArgument(8, static_cast<int>(x_offset)); + kernel.SetArgument(9, static_cast<int>(x_inc)); + kernel.SetArgument(10, static_cast<int>(is_transposed)); + kernel.SetArgument(11, static_cast<int>(is_unit_diagonal)); + kernel.SetArgument(12, static_cast<int>(do_conjugate)); + + // Launches the kernel + const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]}; + const auto global = std::vector<size_t>{1}; + auto event = Event(); + RunKernel(kernel, queue_, device_, global, local, event.pointer()); + event.WaitForCompletion(); +} + +// ================================================================================================= + +// The main routine +template <typename T> +void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) { + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the matrix and vector + TestMatrixA(n, n, a_buffer, a_offset, a_ld); + TestVectorX(n, b_buffer, b_offset, b_inc); + + // Creates a copy of B to avoid overwriting input while computing output + // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels + const auto x_offset = b_offset; + const auto x_inc = b_inc; + const auto x_size = n*x_inc + x_offset; + auto x_buffer = Buffer<T>(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector<Event>(); + auto fill_vector_event = Event(); + FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList, + n, x_inc, x_offset, x_buffer, ConstantZero<T>()); + fill_vector_event.WaitForCompletion(); + + // Derives properties based on the arguments + const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + const auto is_transposed = ((layout == Layout::kColMajor && a_transpose == Transpose::kNo) || + (layout != Layout::kColMajor && a_transpose != Transpose::kNo)); + + // Loops over the blocks + auto col = n; // the initial column position + for (auto i = size_t{0}; i < n; i += db_["TRSV_BLOCK_SIZE"]) { + const auto block_size = std::min(db_["TRSV_BLOCK_SIZE"], n - i); + + // Sets the next column position + col = (is_upper) ? col - block_size : i; + + // Sets the offsets for upper or lower triangular + const auto extra_offset_a = (is_transposed) ? + (is_upper ? col + (col+block_size)*a_ld : col) : + (is_upper ? col+block_size + col*a_ld : col*a_ld); + const auto extra_offset_x = (is_upper) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + const auto gemv_m = (a_transpose == Transpose::kNo) ? block_size : i; + const auto gemv_n = (a_transpose == Transpose::kNo) ? i : block_size; + DoGemv(layout, a_transpose, gemv_m, gemv_n, ConstantOne<T>(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + + // Runs the triangular substitution for the block size + Substitution(layout, triangle, a_transpose, diagonal, block_size, + a_buffer, a_offset + col + col*a_ld, a_ld, + b_buffer, b_offset + col*b_inc, b_inc, + x_buffer, x_offset + col*x_inc, x_inc); + } + + // Retrieves the results + x_buffer.CopyTo(queue_, x_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsv<half>; +template class Xtrsv<float>; +template class Xtrsv<double>; +template class Xtrsv<float2>; +template class Xtrsv<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp new file mode 100644 index 00000000..67e626a1 --- /dev/null +++ b/src/routines/level2/xtrsv.hpp @@ -0,0 +1,60 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular +// forward and backward substitutions on the diagonal parts of the matrix in combination with larger +// GEMV computation on the remainder of the matrix. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSV_H_ +#define CLBLAST_ROUTINES_XTRSV_H_ + +#include "routines/level2/xgemv.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xtrsv: public Xgemv<T> { + public: + + // Uses the generic matrix-vector routine + using Xgemv<T>::queue_; + using Xgemv<T>::context_; + using Xgemv<T>::device_; + using Xgemv<T>::db_; + using Xgemv<T>::program_; + using Xgemv<T>::DoGemv; + + // Constructor + Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV"); + + // Templated-precision implementation of the routine + void DoTrsv(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + + // Performs forward or backward substitution on a small triangular matrix + void Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSV_H_ +#endif diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp new file mode 100644 index 00000000..3a910261 --- /dev/null +++ b/src/routines/level3/xtrsm.cpp @@ -0,0 +1,202 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the triangular matrix solver (A * X = B) TRSM class. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= + +#include "routines/level3/xtrsm.hpp" +#include "routines/levelx/xinvert.hpp" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xtrsm<T>::Xtrsm(Queue &queue, EventPointer event, const std::string &name): + Xgemm<T>(queue, event, name) { +} + + +// ================================================================================================= + +// The main routine +template <typename T> +void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { + + // Settings + constexpr auto block_size = size_t{32}; // tuneable + + // Makes sure all dimensions are larger than zero + if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } + + // Computes the k dimension. This is based on whether or not matrix is A (on the left) + // or B (on the right) in the Xgemm routine. + const auto k = (side == Side::kLeft) ? m : n; + + // Checks for validity of the triangular A matrix + TestMatrixA(k, k, a_buffer, a_offset, a_ld); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + + // Checks for validity of the input B matrix + const auto b_one = (layout == Layout::kRowMajor) ? n : m; + const auto b_two = (layout == Layout::kRowMajor) ? m : n; + TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); + + // Creates a copy of B to avoid overwriting input in GEMM while computing output + const auto b_size = b_ld * (b_two - 1) + b_one + b_offset; + const auto x_one = b_one; + const auto x_size = b_size; + const auto x_ld = b_ld; + const auto x_offset = b_offset; + auto x_buffer = Buffer<T>(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Temporary buffer for the inverse of the A matrix + const auto a_inv_size = Ceil(k, block_size) * block_size; + auto a_inv_buffer = Buffer<T>(context_, a_inv_size); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector<Event>(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, + x_one, x_ld, x_offset, x_buffer, ConstantZero<T>()); + fill_matrix_event.WaitForCompletion(); + + // Inverts the diagonal blocks + auto diagonal_invert_event = Event(); + auto inverter = Xinvert<T>(queue_, diagonal_invert_event.pointer()); + inverter.InvertMatrixDiagonalBlocks(layout, triangle, diagonal, + k, block_size, a_buffer, a_offset, a_ld, a_inv_buffer); + diagonal_invert_event.WaitForCompletion(); + + // Lower of upper triangular + const bool condition = ((triangle == Triangle::kUpper && a_transpose != Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose == Transpose::kNo)); + + // Left side + if (side == Side::kLeft) { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + for (auto i = size_t{0}; i < m; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); + const auto current_block_size = std::min(m - i, block_size); + DoGemm(layout, a_transpose, Transpose::kNo, + current_block_size, n, current_block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero<T>(), + x_buffer, i, x_ld); + if (i + block_size >= m) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? (i + block_size) + i * a_ld : i + (block_size + i) * a_ld; + DoGemm(layout, a_transpose, Transpose::kNo, + m - i - block_size, n, block_size, ConstantNegOne<T>(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne<T>(), + b_buffer, i + block_size, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + const auto current_block_size = (m % block_size == 0) ? block_size : (m % block_size); + const auto i_start = static_cast<int>(m) - static_cast<int>(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); + DoGemm(layout, a_transpose, Transpose::kNo, + block_size, n, block_size, gemm_alpha, + a_inv_buffer, i * block_size, block_size, + b_buffer, i, b_ld, ConstantZero<T>(), + x_buffer, i, x_ld); + if (i - static_cast<int>(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i * a_ld : i; + DoGemm(layout, a_transpose, Transpose::kNo, + i, n, block_size, ConstantNegOne<T>(), + a_buffer, this_a_offset, a_ld, + x_buffer, i, x_ld, ConstantOne<T>(), + b_buffer, 0, b_ld); + } + } + } + + // Right side + else { + + // True when (lower triangular) or (upper triangular and transposed) + if (condition) { + const auto current_block_size = (n % block_size == 0) ? block_size : (n % block_size); + const auto i_start = static_cast<int>(n) - static_cast<int>(current_block_size); + for (auto i = i_start; i >= 0; i -= static_cast<int>(block_size)) { + const auto gemm_alpha = (i == i_start) ? alpha : ConstantOne<T>(); + DoGemm(layout, Transpose::kNo, a_transpose, + m, block_size, block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), + x_buffer, i * x_ld, x_ld); + if (i - static_cast<int>(block_size) < 0) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i : i * a_ld; + DoGemm(layout, Transpose::kNo, a_transpose, + m, i, block_size, ConstantNegOne<T>(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne<T>(), + b_buffer, 0, b_ld); + } + } + + // True when (upper triangular) or (lower triangular and transposed) + else { + for (auto i = size_t{0}; i < n; i += block_size) { + const auto gemm_alpha = (i == 0) ? alpha : ConstantOne<T>(); + const auto current_block_size = std::min(n - i, block_size); + DoGemm(layout, Transpose::kNo, a_transpose, + m, current_block_size, current_block_size, gemm_alpha, + b_buffer, i * b_ld, b_ld, + a_inv_buffer, i * block_size, block_size, ConstantZero<T>(), + x_buffer, i * x_ld, x_ld); + if (i + block_size >= n) { break; } + const auto this_a_offset = (a_transpose == Transpose::kNo) ? i + (block_size + i) * a_ld : (i + block_size) + i * a_ld; + DoGemm(layout, Transpose::kNo, a_transpose, + m, n - i - block_size, block_size, ConstantNegOne<T>(), + x_buffer, i * x_ld, x_ld, + a_buffer, this_a_offset, a_ld, ConstantOne<T>(), + b_buffer, (i + block_size) * b_ld, b_ld); + } + } + } + + // Retrieves the results + x_buffer.CopyTo(queue_, b_size, b_buffer); +} + +// ================================================================================================= + +// Compiles the templated class +template class Xtrsm<half>; +template class Xtrsm<float>; +template class Xtrsm<double>; +template class Xtrsm<float2>; +template class Xtrsm<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp new file mode 100644 index 00000000..b9d5432a --- /dev/null +++ b/src/routines/level3/xtrsm.hpp @@ -0,0 +1,52 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the Xtrsm routine. The implementation is based on ??? (TODO). +// Therefore, this class inherits from the Xgemm class. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XTRSM_H_ +#define CLBLAST_ROUTINES_XTRSM_H_ + +#include "routines/level3/xgemm.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class Xtrsm: public Xgemm<T> { + public: + + // Uses methods and variables the Xgemm routine + using Xgemm<T>::queue_; + using Xgemm<T>::context_; + using Xgemm<T>::device_; + using Xgemm<T>::db_; + using Xgemm<T>::program_; + using Xgemm<T>::DoGemm; + + // Constructor + Xtrsm(Queue &queue, EventPointer event, const std::string &name = "TRSM"); + + // Templated-precision implementation of the routine + void DoTrsm(const Layout layout, const Side side, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XTRSM_H_ +#endif diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..696e694a --- /dev/null +++ b/src/routines/levelx/xinvert.cpp @@ -0,0 +1,151 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the common code to perform (partial) matrix inverting. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= + +#include "routines/levelx/xinvert.hpp" + +#include <string> +#include <vector> +#include <assert.h> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, { + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/invert_diagonal_blocks.opencl" + }) { +} + +// ================================================================================================= + +// Inverts diagonal square blocks of a matrix +template <typename T> +void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer<T> &src, const size_t offset, const size_t ld_src, + Buffer<T> &dest) { + + // Makes sure all dimensions are larger than zero and the block size is smaller than n + if ((block_size == 0) || (n == 0) || (block_size > n)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Helper variables + const auto internal_block_size = static_cast<size_t>(db_["INTERNAL_BLOCK_SIZE"]); + assert(internal_block_size == 16); + const auto num_blocks = CeilDiv(n, block_size); + const auto num_internal_blocks = CeilDiv(n, internal_block_size); + const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false; + + // This routine only supports block sizes which are a multiple of the internal block size and + // block sizes up to and including 128 + if ((block_size % internal_block_size != 0) || (block_size > 128)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Checks for validity of the source and destination matrices + TestMatrixA(n, n, src, offset, ld_src); + TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto name_postfix = (is_upper) ? "Upper" : "Lower"; + + // Fills the output buffer with zeros + auto event_wait_list = std::vector<Event>(); + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list, + num_blocks * block_size, block_size, 0, dest, ConstantZero<T>()); + event_wait_list.push_back(fill_matrix_event); + + // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group + auto kernel = Kernel(program_, "InvertDiagonalBlock"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, static_cast<int>(ld_src)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, static_cast<int>(block_size)); + kernel.SetArgument(6, static_cast<int>(unit_diagonal)); + kernel.SetArgument(7, static_cast<int>(is_upper)); + const auto local = std::vector<size_t>{internal_block_size}; + const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size}; + auto base_kernel_event = Event(); + auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer(); + RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list); + if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); } + + // Builds up block_size x block_size blocks. For example, internal_block_size=16: + // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads; + // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; + // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; + for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) { + assert(current_size == 16 || current_size == 32 || current_size == 64); + + // Emulates a 3D grid: NX * (NY * npages) + const auto npages = CeilDiv(n, current_size*2); + const auto local0 = (current_size <= 32) ? current_size/4 : 16; + const auto local = std::vector<size_t>{local0, 4}; + const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]}; + + // Part 1 + auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + kernel1.SetArgument(0, static_cast<int>(n)); + kernel1.SetArgument(1, src()); + kernel1.SetArgument(2, static_cast<int>(offset)); + kernel1.SetArgument(3, static_cast<int>(ld_src)); + kernel1.SetArgument(4, dest()); + kernel1.SetArgument(5, static_cast<int>(current_size)); + kernel1.SetArgument(6, static_cast<int>(npages)); + kernel1.SetArgument(7, static_cast<int>(block_size)); + auto kernel1_event = Event(); + RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list); + event_wait_list.push_back(kernel1_event); + + // Part 2 + const bool is_last_kernel = (current_size * 2 >= block_size); + auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + kernel2.SetArgument(0, static_cast<int>(n)); + kernel2.SetArgument(1, dest()); + kernel2.SetArgument(2, static_cast<int>(current_size)); + kernel2.SetArgument(3, static_cast<int>(npages)); + kernel2.SetArgument(4, static_cast<int>(block_size)); + auto kernel2_event = Event(); + auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer(); + RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list); + if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); } + + // Exit in case we reach beyond the bounds of the input matrix + if (current_size*2 >= n) { break; } + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xinvert<half>; +template class Xinvert<float>; +template class Xinvert<double>; +template class Xinvert<float2>; +template class Xinvert<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp new file mode 100644 index 00000000..fa0a80e7 --- /dev/null +++ b/src/routines/levelx/xinvert.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the common code to perform (partial) matrix inverting. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XINVERT_H_ +#define CLBLAST_ROUTINES_XINVERT_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +template <typename T> +class Xinvert: public Routine { + public: + + // Constructor + Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT"); + + // Inverts diagonal square blocks of a matrix + void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer<T> &src, const size_t offset, const size_t ld_src, + Buffer<T> &dest); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XINVERT_H_ +#endif |