From 4b3ffd998904f5c848edc5917308f5942fa71da3 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 15 Jan 2017 17:30:00 +0100 Subject: Added a first version of the diagonal block invert routine in preparation of TRSM --- src/routines/common.hpp | 21 ++++++ src/routines/levelx/xinvert.cpp | 152 ++++++++++++++++++++++++++++++++++++++++ src/routines/levelx/xinvert.hpp | 40 +++++++++++ 3 files changed, 213 insertions(+) create mode 100644 src/routines/levelx/xinvert.cpp create mode 100644 src/routines/levelx/xinvert.hpp (limited to 'src/routines') diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 53ca6355..47f98de0 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -33,6 +33,27 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, // ================================================================================================= +// Sets all elements of a matrix to a constant value +template +void FillMatrix(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector &waitForEvents, + const size_t n, const size_t ld, const size_t offset, + const Buffer &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillMatrix"); + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, static_cast(ld)); + kernel.SetArgument(2, static_cast(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector{8, 8}; + auto global = std::vector{Ceil(ld, 8), Ceil(n, 8)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + +// ================================================================================================= + // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. template diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp new file mode 100644 index 00000000..5ffba958 --- /dev/null +++ b/src/routines/levelx/xinvert.cpp @@ -0,0 +1,152 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains all the common code to perform (partial) matrix inverting. This code is based +// on the TRSM implementation in the CUDA version of Magma version 2.2.0 and the poster "Triangular +// Linear System Solver for GPU with CUDA and OpenCL" by Peng Du, Stanimire Tomov, Piotr Luszczek, +// and Jack Dongarra. +// +// ================================================================================================= + +#include "routines/levelx/xinvert.hpp" + +#include +#include +#include + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template +Xinvert::Xinvert(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Invert"}, PrecisionValue(), {}, { + #include "../../kernels/level3/invert_diagonal_blocks.opencl" + }) { +} + +// ================================================================================================= + +// Inverts diagonal square blocks of a matrix +template +void Xinvert::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer &src, const size_t offset, const size_t ld_src, + Buffer &dest) { + + // Makes sure all dimensions are larger than zero and the block size is smaller than n + if ((block_size == 0) || (n == 0) || (block_size > n)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Helper variables + const auto internal_block_size = static_cast(db_["INTERNAL_BLOCK_SIZE"]); + assert(internal_block_size == 16); + const auto num_blocks = CeilDiv(n, block_size); + const auto num_internal_blocks = CeilDiv(n, internal_block_size); + const auto unit_diagonal = (diag == Diagonal::kUnit) ? true : false; + + // This routine only supports block sizes which are a multiple of the internal block size and + // block sizes up to and including 128 + if ((block_size % internal_block_size != 0) || (block_size > 128)) { + throw BLASError(StatusCode::kInvalidDimension); + } + + // Checks for validity of the source and destination matrices + TestMatrixA(n, n, src, offset, ld_src); + TestMatrixB(block_size, num_blocks * block_size, dest, 0, block_size); + + // Determines which kernels to run based on the layout (the kernels assume column-major as + // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix + const bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || + (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + const auto name_postfix = (is_upper) ? "Upper" : "Lower"; + + // Retrieves the program from the cache + auto event_wait_list = std::vector(); + const auto program = GetProgramFromCache(context_, PrecisionValue(), "INVERT"); + + // Fills the output buffer with zeros + auto fill_matrix_event = Event(); + FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list, + num_blocks * block_size, block_size, 0, dest, ConstantZero()); + event_wait_list.push_back(fill_matrix_event); + + // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group + auto kernel = Kernel(program, "InvertDiagonalBlock"); + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, src()); + kernel.SetArgument(2, static_cast(offset)); + kernel.SetArgument(3, static_cast(ld_src)); + kernel.SetArgument(4, dest()); + kernel.SetArgument(5, static_cast(block_size)); + kernel.SetArgument(6, static_cast(unit_diagonal)); + kernel.SetArgument(7, static_cast(is_upper)); + const auto local = std::vector{internal_block_size}; + const auto global = std::vector{num_internal_blocks * internal_block_size}; + auto base_kernel_event = Event(); + RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list); + event_wait_list.push_back(base_kernel_event); + + // Builds up block_size x block_size blocks. For example, internal_block_size=16: + // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads; + // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; + // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; + for (auto current_size = internal_block_size; current_size < block_size; current_size *= 2) { + assert(current_size == 16 || current_size == 32 || current_size == 64); + + // Emulates a 3D grid: NX * (NY * npages) + const auto npages = CeilDiv(n, current_size*2); + const auto local0 = (current_size <= 32) ? current_size/4 : 16; + const auto local = std::vector{local0, 4}; + const auto global = std::vector{(current_size/local[1]), npages*(current_size/16)*local[1]}; + + // Part 1 + auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + kernel1.SetArgument(0, static_cast(n)); + kernel1.SetArgument(1, src()); + kernel1.SetArgument(2, static_cast(offset)); + kernel1.SetArgument(3, static_cast(ld_src)); + kernel1.SetArgument(4, dest()); + kernel1.SetArgument(5, static_cast(current_size)); + kernel1.SetArgument(6, static_cast(npages)); + kernel1.SetArgument(7, static_cast(block_size)); + auto kernel1_event = Event(); + RunKernel(kernel1, queue_, device_, global, local, kernel1_event.pointer(), event_wait_list); + event_wait_list.push_back(kernel1_event); + + // Part 2 + const bool is_last_kernel = (current_size * 2 >= block_size); + auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + kernel2.SetArgument(0, static_cast(n)); + kernel2.SetArgument(1, dest()); + kernel2.SetArgument(2, static_cast(current_size)); + kernel2.SetArgument(3, static_cast(npages)); + kernel2.SetArgument(4, static_cast(block_size)); + auto kernel2_event = Event(); + auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer(); + RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list); + if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); } + + // Exit in case we reach beyond the bounds of the input matrix + if (current_size*2 >= n) { break; } + } +} + +// ================================================================================================= + +// Compiles the templated class +template class Xinvert; +template class Xinvert; +template class Xinvert; +template class Xinvert; +template class Xinvert; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xinvert.hpp b/src/routines/levelx/xinvert.hpp new file mode 100644 index 00000000..fa0a80e7 --- /dev/null +++ b/src/routines/levelx/xinvert.hpp @@ -0,0 +1,40 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains all the common code to perform (partial) matrix inverting. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XINVERT_H_ +#define CLBLAST_ROUTINES_XINVERT_H_ + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +template +class Xinvert: public Routine { + public: + + // Constructor + Xinvert(Queue &queue, EventPointer event, const std::string &name = "INVERT"); + + // Inverts diagonal square blocks of a matrix + void InvertMatrixDiagonalBlocks(const Layout layout, const Triangle triangle, const Diagonal diag, + const size_t n, const size_t block_size, + const Buffer &src, const size_t offset, const size_t ld_src, + Buffer &dest); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XINVERT_H_ +#endif -- cgit v1.2.3