From 7c73ceb095f22099067b8a880961cdcbee4257ea Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 29 Jan 2017 17:02:00 +0100 Subject: Added first (incomplete) version of TRSV routine --- src/kernels/level2/xtrsv.opencl | 94 ++++++++++++++++++++++++++++ src/routines/common.hpp | 19 ++++++ src/routines/level2/xgemv.cpp | 1 + src/routines/level2/xtrsv.cpp | 131 +++++++++++++++++++++++++++++++++++++--- src/routines/level2/xtrsv.hpp | 17 +++++- 5 files changed, 251 insertions(+), 11 deletions(-) create mode 100644 src/kernels/level2/xtrsv.opencl diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl new file mode 100644 index 00000000..00f29e47 --- /dev/null +++ b/src/kernels/level2/xtrsv.opencl @@ -0,0 +1,94 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains kernels to perform forward or backward substition, as used in the TRSV routine +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_TRSV) + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void FillVector(const int n, const int inc, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int tid = get_global_id(0); + if (tid < n) { + dest[tid*inc + offset] = value; + } +} + +// ================================================================================================= + +// TODO: Put variable in database +#define TRSV_BLOCK_SIZE 256 + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void trsv_forward(int n, + const __global float *A, const int a_offset, int lda, + __global float *b, const int b_offset, int b_inc, + __global float *x, const int x_offset, int x_inc, + const int is_transposed, const int is_unit_diagonal) { + __local float sx[TRSV_BLOCK_SIZE]; + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = 0; i < n; ++i) { + real sum = b[i*b_inc + b_offset]; + for (int j = 0; j < i; ++j) { + real a_value; + if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; } + else { a_value = A[j + i*lda + a_offset]; } + sum -= a_value * sx[j]; + } + sum -= x[i*x_inc + x_offset]; + if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; } + sx[i] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + } + for (int i = 0; i < n; ++i) { + x[i*x_inc + x_offset] = sx[i]; + } +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void trsv_backward(int n, + const __global float *A, const int a_offset, int lda, + __global float *b, const int b_offset, int b_inc, + __global float *x, const int x_offset, int x_inc, + const int is_trans, const int is_unit_diagonal) { + __local float sx[TRSV_BLOCK_SIZE]; + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = n - 1; i >= 0; --i) { + real sum = b[i*b_inc + b_offset]; + for (int j = i + 1; j < n; ++j) { + real a_value; + if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; } + else { a_value = A[j + i*lda + a_offset]; } + sum -= a_value * sx[j]; + } + sum -= x[i*x_inc + x_offset]; + if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; } + sx[i] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + } + for (int i = 0; i < n; ++i) { + x[i*x_inc + x_offset] = sx[i]; + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 47f98de0..8046c0be 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -52,6 +52,25 @@ void FillMatrix(Queue &queue, const Device &device, RunKernel(kernel, queue, device, global, local, event, waitForEvents); } +// Sets all elements of a vector to a constant value +template +void FillVector(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector &waitForEvents, + const size_t n, const size_t inc, const size_t offset, + const Buffer &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillVector"); + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, static_cast(inc)); + kernel.SetArgument(2, static_cast(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector{64}; + auto global = std::vector{Ceil(n, 64)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + // ================================================================================================= // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 7b4c2e8f..26eed5e0 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -25,6 +25,7 @@ Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" + #include "../../kernels/level2/xtrsv.opencl" }) { } diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp index d5d5a7ca..93f4174f 100644 --- a/src/routines/level2/xtrsv.cpp +++ b/src/routines/level2/xtrsv.cpp @@ -25,6 +25,59 @@ Xtrsv::Xtrsv(Queue &queue, EventPointer event, const std::string &name): Xgemv(queue, event, name) { } +// TODO: Temporary variable, put in database instead +constexpr size_t TRSV_BLOCK_SIZE = 9; + +// ================================================================================================= + +template +void Xtrsv::Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + + if (n > TRSV_BLOCK_SIZE) { throw BLASError(StatusCode::kUnexpectedError); }; + + // Retrieves the program from the cache + const auto program = GetProgramFromCache(context_, PrecisionValue(), "TRSV"); + + // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel + const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1; + + // The data is either in the upper or lower triangle + auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + + // Retrieves the kernel from the compiled binary + const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward"; + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(n)); + kernel.SetArgument(1, a_buffer()); + kernel.SetArgument(2, static_cast(a_offset)); + kernel.SetArgument(3, static_cast(a_ld)); + kernel.SetArgument(4, b_buffer()); + kernel.SetArgument(5, static_cast(b_offset)); + kernel.SetArgument(6, static_cast(b_inc)); + kernel.SetArgument(7, x_buffer()); + kernel.SetArgument(8, static_cast(x_offset)); + kernel.SetArgument(9, static_cast(x_inc)); + kernel.SetArgument(10, static_cast(is_transposed)); + kernel.SetArgument(11, static_cast(is_unit_diagonal)); + + // Launches the kernel + const auto local = std::vector{1}; + const auto global = std::vector{1}; + auto event = Event(); + RunKernel(kernel, queue_, device_, global, local, event.pointer()); + event.WaitForCompletion(); +} + // ================================================================================================= // The main routine @@ -33,24 +86,84 @@ void Xtrsv::DoTrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc) { + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc) { // Makes sure all dimensions are larger than zero if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the matrix and vector TestMatrixA(n, n, a_buffer, a_offset, a_ld); - TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorX(n, b_buffer, b_offset, b_inc); - // Creates a copy of X: a temporary scratch buffer - auto scratch_buffer = Buffer(context_, n*x_inc + x_offset); - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + // Retrieves the program from the cache + const auto program = GetProgramFromCache(context_, PrecisionValue(), "TRSV"); - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + // Creates a copy of B to avoid overwriting input while computing output + // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels + const auto x_offset = b_offset; + const auto x_inc = b_inc; + const auto x_size = n*x_inc + x_offset; + auto x_buffer = Buffer(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector(); + auto fill_vector_event = Event(); + FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList, + n, x_inc, x_offset, x_buffer, ConstantZero()); + fill_vector_event.WaitForCompletion(); + + // TODO: Not working for row-major at the moment + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kRowMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kRowMajor)); + + // Loops over the blocks + auto col = n; // the initial column position + for (auto i = size_t{0}; i < n; i += TRSV_BLOCK_SIZE) { + const auto block_size = std::min(TRSV_BLOCK_SIZE, n - i); + + if (!is_transposed) { + + // Sets the offsets for upper or lower triangular + col = (triangle == Triangle::kUpper) ? col - block_size : i; + const auto extra_offset_a = (triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col; + const auto extra_offset_x = (triangle == Triangle::kUpper) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + DoGemv(layout, a_transpose, block_size, i, ConstantOne(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + } + else { + + // Sets the offsets for upper or lower triangular + col = (triangle == Triangle::kLower) ? col - block_size : i; + const auto extra_offset_a = (triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld; + const auto extra_offset_x = (triangle == Triangle::kLower) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + DoGemv(layout, a_transpose, i, block_size, ConstantOne(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + } + + // Runs the triangular substitution for the block size + Substitution(layout, triangle, a_transpose, diagonal, block_size, + a_buffer, a_offset + col + col*a_ld, a_ld, + b_buffer, b_offset + col*b_inc, b_inc, + x_buffer, x_offset + col*x_inc, x_inc); + } - // TODO: Implement the routine + // Retrieves the results + x_buffer.CopyTo(queue_, x_size, b_buffer); } // ================================================================================================= diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp index 4a73b5eb..dc3f32f0 100644 --- a/src/routines/level2/xtrsv.hpp +++ b/src/routines/level2/xtrsv.hpp @@ -7,7 +7,9 @@ // Author(s): // Cedric Nugteren // -// This file implements the Xtrsv routine. +// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular +// forward and backward substitutions on the diagonal parts of the matrix in combination with larger +// GEMV computation on the remainder of the matrix. // // ================================================================================================= @@ -25,9 +27,12 @@ class Xtrsv: public Xgemv { public: // Uses the generic matrix-vector routine + using Xgemv::routine_name_; using Xgemv::queue_; using Xgemv::context_; - using Xgemv::MatVec; + using Xgemv::device_; + using Xgemv::db_; + using Xgemv::DoGemv; // Constructor Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV"); @@ -38,6 +43,14 @@ class Xtrsv: public Xgemv { const size_t n, const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer &x_buffer, const size_t x_offset, const size_t x_inc); + + // Performs forward or backward substitution on a small triangular matrix + void Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer &x_buffer, const size_t offset_x, const size_t x_inc); }; // ================================================================================================= -- cgit v1.2.3