diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-01-29 17:02:00 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-01-29 17:02:00 +0100 |
commit | 7c73ceb095f22099067b8a880961cdcbee4257ea (patch) | |
tree | d206310b5d918a7de023776aec98222fc24bbfd4 | |
parent | a5fd2323b6d9ce793f12618951012fcfec257b95 (diff) |
Added first (incomplete) version of TRSV routine
-rw-r--r-- | src/kernels/level2/xtrsv.opencl | 94 | ||||
-rw-r--r-- | src/routines/common.hpp | 19 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cpp | 1 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.cpp | 131 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.hpp | 17 |
5 files changed, 251 insertions, 11 deletions
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl new file mode 100644 index 00000000..00f29e47 --- /dev/null +++ b/src/kernels/level2/xtrsv.opencl @@ -0,0 +1,94 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains kernels to perform forward or backward substition, as used in the TRSV routine +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_TRSV) + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) +void FillVector(const int n, const int inc, const int offset, + __global real* restrict dest, const real_arg arg_value) { + const real value = GetRealArg(arg_value); + const int tid = get_global_id(0); + if (tid < n) { + dest[tid*inc + offset] = value; + } +} + +// ================================================================================================= + +// TODO: Put variable in database +#define TRSV_BLOCK_SIZE 256 + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void trsv_forward(int n, + const __global float *A, const int a_offset, int lda, + __global float *b, const int b_offset, int b_inc, + __global float *x, const int x_offset, int x_inc, + const int is_transposed, const int is_unit_diagonal) { + __local float sx[TRSV_BLOCK_SIZE]; + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = 0; i < n; ++i) { + real sum = b[i*b_inc + b_offset]; + for (int j = 0; j < i; ++j) { + real a_value; + if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; } + else { a_value = A[j + i*lda + a_offset]; } + sum -= a_value * sx[j]; + } + sum -= x[i*x_inc + x_offset]; + if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; } + sx[i] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + } + for (int i = 0; i < n; ++i) { + x[i*x_inc + x_offset] = sx[i]; + } +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void trsv_backward(int n, + const __global float *A, const int a_offset, int lda, + __global float *b, const int b_offset, int b_inc, + __global float *x, const int x_offset, int x_inc, + const int is_trans, const int is_unit_diagonal) { + __local float sx[TRSV_BLOCK_SIZE]; + barrier(CLK_LOCAL_MEM_FENCE); + for (int i = n - 1; i >= 0; --i) { + real sum = b[i*b_inc + b_offset]; + for (int j = i + 1; j < n; ++j) { + real a_value; + if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; } + else { a_value = A[j + i*lda + a_offset]; } + sum -= a_value * sx[j]; + } + sum -= x[i*x_inc + x_offset]; + if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; } + sx[i] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + } + for (int i = 0; i < n; ++i) { + x[i*x_inc + x_offset] = sx[i]; + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 47f98de0..8046c0be 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -52,6 +52,25 @@ void FillMatrix(Queue &queue, const Device &device, RunKernel(kernel, queue, device, global, local, event, waitForEvents); } +// Sets all elements of a vector to a constant value +template <typename T> +void FillVector(Queue &queue, const Device &device, + const Program &program, const Database &, + EventPointer event, const std::vector<Event> &waitForEvents, + const size_t n, const size_t inc, const size_t offset, + const Buffer<T> &dest, + const T constant_value) { + auto kernel = Kernel(program, "FillVector"); + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, static_cast<int>(inc)); + kernel.SetArgument(2, static_cast<int>(offset)); + kernel.SetArgument(3, dest()); + kernel.SetArgument(4, GetRealArg(constant_value)); + auto local = std::vector<size_t>{64}; + auto global = std::vector<size_t>{Ceil(n, 64)}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); +} + // ================================================================================================= // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 7b4c2e8f..26eed5e0 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -25,6 +25,7 @@ Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, { #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" + #include "../../kernels/level2/xtrsv.opencl" }) { } diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp index d5d5a7ca..93f4174f 100644 --- a/src/routines/level2/xtrsv.cpp +++ b/src/routines/level2/xtrsv.cpp @@ -25,6 +25,59 @@ Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name): Xgemv<T>(queue, event, name) { } +// TODO: Temporary variable, put in database instead +constexpr size_t TRSV_BLOCK_SIZE = 9; + +// ================================================================================================= + +template <typename T> +void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + + if (n > TRSV_BLOCK_SIZE) { throw BLASError(StatusCode::kUnexpectedError); }; + + // Retrieves the program from the cache + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV"); + + // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel + const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1; + + // The data is either in the upper or lower triangle + auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || + (triangle == Triangle::kLower && a_transpose != Transpose::kNo)); + + // Retrieves the kernel from the compiled binary + const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward"; + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, a_buffer()); + kernel.SetArgument(2, static_cast<int>(a_offset)); + kernel.SetArgument(3, static_cast<int>(a_ld)); + kernel.SetArgument(4, b_buffer()); + kernel.SetArgument(5, static_cast<int>(b_offset)); + kernel.SetArgument(6, static_cast<int>(b_inc)); + kernel.SetArgument(7, x_buffer()); + kernel.SetArgument(8, static_cast<int>(x_offset)); + kernel.SetArgument(9, static_cast<int>(x_inc)); + kernel.SetArgument(10, static_cast<int>(is_transposed)); + kernel.SetArgument(11, static_cast<int>(is_unit_diagonal)); + + // Launches the kernel + const auto local = std::vector<size_t>{1}; + const auto global = std::vector<size_t>{1}; + auto event = Event(); + RunKernel(kernel, queue_, device_, global, local, event.pointer()); + event.WaitForCompletion(); +} + // ================================================================================================= // The main routine @@ -33,24 +86,84 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) { + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) { // Makes sure all dimensions are larger than zero if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } // Tests the matrix and vector TestMatrixA(n, n, a_buffer, a_offset, a_ld); - TestVectorX(n, x_buffer, x_offset, x_inc); + TestVectorX(n, b_buffer, b_offset, b_inc); - // Creates a copy of X: a temporary scratch buffer - auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset); - x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer); + // Retrieves the program from the cache + const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV"); - // The data is either in the upper or lower triangle - size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || - (triangle == Triangle::kLower && layout == Layout::kRowMajor)); + // Creates a copy of B to avoid overwriting input while computing output + // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels + const auto x_offset = b_offset; + const auto x_inc = b_inc; + const auto x_size = n*x_inc + x_offset; + auto x_buffer = Buffer<T>(context_, x_size); + b_buffer.CopyTo(queue_, x_size, x_buffer); + + // Fills the output buffer with zeros + auto eventWaitList = std::vector<Event>(); + auto fill_vector_event = Event(); + FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList, + n, x_inc, x_offset, x_buffer, ConstantZero<T>()); + fill_vector_event.WaitForCompletion(); + + // TODO: Not working for row-major at the moment + const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kRowMajor) || + (a_transpose != Transpose::kNo && layout != Layout::kRowMajor)); + + // Loops over the blocks + auto col = n; // the initial column position + for (auto i = size_t{0}; i < n; i += TRSV_BLOCK_SIZE) { + const auto block_size = std::min(TRSV_BLOCK_SIZE, n - i); + + if (!is_transposed) { + + // Sets the offsets for upper or lower triangular + col = (triangle == Triangle::kUpper) ? col - block_size : i; + const auto extra_offset_a = (triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col; + const auto extra_offset_x = (triangle == Triangle::kUpper) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + DoGemv(layout, a_transpose, block_size, i, ConstantOne<T>(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + } + else { + + // Sets the offsets for upper or lower triangular + col = (triangle == Triangle::kLower) ? col - block_size : i; + const auto extra_offset_a = (triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld; + const auto extra_offset_x = (triangle == Triangle::kLower) ? (col+block_size)*x_inc : 0; + const auto extra_offset_b = col*x_inc; + + // Runs the GEMV routine to compute x' = A * x + if (i > 0) { + DoGemv(layout, a_transpose, i, block_size, ConstantOne<T>(), + a_buffer, a_offset + extra_offset_a, a_ld, + x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(), + x_buffer, x_offset + extra_offset_b, x_inc ); + } + } + + // Runs the triangular substitution for the block size + Substitution(layout, triangle, a_transpose, diagonal, block_size, + a_buffer, a_offset + col + col*a_ld, a_ld, + b_buffer, b_offset + col*b_inc, b_inc, + x_buffer, x_offset + col*x_inc, x_inc); + } - // TODO: Implement the routine + // Retrieves the results + x_buffer.CopyTo(queue_, x_size, b_buffer); } // ================================================================================================= diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp index 4a73b5eb..dc3f32f0 100644 --- a/src/routines/level2/xtrsv.hpp +++ b/src/routines/level2/xtrsv.hpp @@ -7,7 +7,9 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file implements the Xtrsv routine. +// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular +// forward and backward substitutions on the diagonal parts of the matrix in combination with larger +// GEMV computation on the remainder of the matrix. // // ================================================================================================= @@ -25,9 +27,12 @@ class Xtrsv: public Xgemv<T> { public: // Uses the generic matrix-vector routine + using Xgemv<T>::routine_name_; using Xgemv<T>::queue_; using Xgemv<T>::context_; - using Xgemv<T>::MatVec; + using Xgemv<T>::device_; + using Xgemv<T>::db_; + using Xgemv<T>::DoGemv; // Constructor Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV"); @@ -38,6 +43,14 @@ class Xtrsv: public Xgemv<T> { const size_t n, const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc); + + // Performs forward or backward substitution on a small triangular matrix + void Substitution(const Layout layout, const Triangle triangle, + const Transpose a_transpose, const Diagonal diagonal, + const size_t n, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc, + const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc); }; // ================================================================================================= |