summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-01-29 17:02:00 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2017-01-29 17:02:00 +0100
commit7c73ceb095f22099067b8a880961cdcbee4257ea (patch)
treed206310b5d918a7de023776aec98222fc24bbfd4
parenta5fd2323b6d9ce793f12618951012fcfec257b95 (diff)
Added first (incomplete) version of TRSV routine
-rw-r--r--src/kernels/level2/xtrsv.opencl94
-rw-r--r--src/routines/common.hpp19
-rw-r--r--src/routines/level2/xgemv.cpp1
-rw-r--r--src/routines/level2/xtrsv.cpp131
-rw-r--r--src/routines/level2/xtrsv.hpp17
5 files changed, 251 insertions, 11 deletions
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
new file mode 100644
index 00000000..00f29e47
--- /dev/null
+++ b/src/kernels/level2/xtrsv.opencl
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to perform forward or backward substition, as used in the TRSV routine
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_TRSV)
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
+void FillVector(const int n, const int inc, const int offset,
+ __global real* restrict dest, const real_arg arg_value) {
+ const real value = GetRealArg(arg_value);
+ const int tid = get_global_id(0);
+ if (tid < n) {
+ dest[tid*inc + offset] = value;
+ }
+}
+
+// =================================================================================================
+
+// TODO: Put variable in database
+#define TRSV_BLOCK_SIZE 256
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_forward(int n,
+ const __global float *A, const int a_offset, int lda,
+ __global float *b, const int b_offset, int b_inc,
+ __global float *x, const int x_offset, int x_inc,
+ const int is_transposed, const int is_unit_diagonal) {
+ __local float sx[TRSV_BLOCK_SIZE];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int i = 0; i < n; ++i) {
+ real sum = b[i*b_inc + b_offset];
+ for (int j = 0; j < i; ++j) {
+ real a_value;
+ if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
+ else { a_value = A[j + i*lda + a_offset]; }
+ sum -= a_value * sx[j];
+ }
+ sum -= x[i*x_inc + x_offset];
+ if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+ sx[i] = sum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ for (int i = 0; i < n; ++i) {
+ x[i*x_inc + x_offset] = sx[i];
+ }
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_backward(int n,
+ const __global float *A, const int a_offset, int lda,
+ __global float *b, const int b_offset, int b_inc,
+ __global float *x, const int x_offset, int x_inc,
+ const int is_trans, const int is_unit_diagonal) {
+ __local float sx[TRSV_BLOCK_SIZE];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int i = n - 1; i >= 0; --i) {
+ real sum = b[i*b_inc + b_offset];
+ for (int j = i + 1; j < n; ++j) {
+ real a_value;
+ if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
+ else { a_value = A[j + i*lda + a_offset]; }
+ sum -= a_value * sx[j];
+ }
+ sum -= x[i*x_inc + x_offset];
+ if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+ sx[i] = sum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ for (int i = 0; i < n; ++i) {
+ x[i*x_inc + x_offset] = sx[i];
+ }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 47f98de0..8046c0be 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -52,6 +52,25 @@ void FillMatrix(Queue &queue, const Device &device,
RunKernel(kernel, queue, device, global, local, event, waitForEvents);
}
+// Sets all elements of a vector to a constant value
+template <typename T>
+void FillVector(Queue &queue, const Device &device,
+ const Program &program, const Database &,
+ EventPointer event, const std::vector<Event> &waitForEvents,
+ const size_t n, const size_t inc, const size_t offset,
+ const Buffer<T> &dest,
+ const T constant_value) {
+ auto kernel = Kernel(program, "FillVector");
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, static_cast<int>(inc));
+ kernel.SetArgument(2, static_cast<int>(offset));
+ kernel.SetArgument(3, dest());
+ kernel.SetArgument(4, GetRealArg(constant_value));
+ auto local = std::vector<size_t>{64};
+ auto global = std::vector<size_t>{Ceil(n, 64)};
+ RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+}
+
// =================================================================================================
// Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 7b4c2e8f..26eed5e0 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -25,6 +25,7 @@ Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>(), {}, {
#include "../../kernels/level2/xgemv.opencl"
#include "../../kernels/level2/xgemv_fast.opencl"
+ #include "../../kernels/level2/xtrsv.opencl"
}) {
}
diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp
index d5d5a7ca..93f4174f 100644
--- a/src/routines/level2/xtrsv.cpp
+++ b/src/routines/level2/xtrsv.cpp
@@ -25,6 +25,59 @@ Xtrsv<T>::Xtrsv(Queue &queue, EventPointer event, const std::string &name):
Xgemv<T>(queue, event, name) {
}
+// TODO: Temporary variable, put in database instead
+constexpr size_t TRSV_BLOCK_SIZE = 9;
+
+// =================================================================================================
+
+template <typename T>
+void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+
+ if (n > TRSV_BLOCK_SIZE) { throw BLASError(StatusCode::kUnexpectedError); };
+
+ // Retrieves the program from the cache
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
+
+ // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel
+ const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1;
+ const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) ||
+ (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1;
+
+ // The data is either in the upper or lower triangle
+ auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) ||
+ (triangle == Triangle::kLower && a_transpose != Transpose::kNo));
+
+ // Retrieves the kernel from the compiled binary
+ const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward";
+ auto kernel = Kernel(program, kernel_name);
+
+ // Sets the kernel arguments
+ kernel.SetArgument(0, static_cast<int>(n));
+ kernel.SetArgument(1, a_buffer());
+ kernel.SetArgument(2, static_cast<int>(a_offset));
+ kernel.SetArgument(3, static_cast<int>(a_ld));
+ kernel.SetArgument(4, b_buffer());
+ kernel.SetArgument(5, static_cast<int>(b_offset));
+ kernel.SetArgument(6, static_cast<int>(b_inc));
+ kernel.SetArgument(7, x_buffer());
+ kernel.SetArgument(8, static_cast<int>(x_offset));
+ kernel.SetArgument(9, static_cast<int>(x_inc));
+ kernel.SetArgument(10, static_cast<int>(is_transposed));
+ kernel.SetArgument(11, static_cast<int>(is_unit_diagonal));
+
+ // Launches the kernel
+ const auto local = std::vector<size_t>{1};
+ const auto global = std::vector<size_t>{1};
+ auto event = Event();
+ RunKernel(kernel, queue_, device_, global, local, event.pointer());
+ event.WaitForCompletion();
+}
+
// =================================================================================================
// The main routine
@@ -33,24 +86,84 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle,
const Transpose a_transpose, const Diagonal diagonal,
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc) {
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); }
// Tests the matrix and vector
TestMatrixA(n, n, a_buffer, a_offset, a_ld);
- TestVectorX(n, x_buffer, x_offset, x_inc);
+ TestVectorX(n, b_buffer, b_offset, b_inc);
- // Creates a copy of X: a temporary scratch buffer
- auto scratch_buffer = Buffer<T>(context_, n*x_inc + x_offset);
- x_buffer.CopyTo(queue_, n*x_inc + x_offset, scratch_buffer);
+ // Retrieves the program from the cache
+ const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV");
- // The data is either in the upper or lower triangle
- size_t is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
- (triangle == Triangle::kLower && layout == Layout::kRowMajor));
+ // Creates a copy of B to avoid overwriting input while computing output
+ // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels
+ const auto x_offset = b_offset;
+ const auto x_inc = b_inc;
+ const auto x_size = n*x_inc + x_offset;
+ auto x_buffer = Buffer<T>(context_, x_size);
+ b_buffer.CopyTo(queue_, x_size, x_buffer);
+
+ // Fills the output buffer with zeros
+ auto eventWaitList = std::vector<Event>();
+ auto fill_vector_event = Event();
+ FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList,
+ n, x_inc, x_offset, x_buffer, ConstantZero<T>());
+ fill_vector_event.WaitForCompletion();
+
+ // TODO: Not working for row-major at the moment
+ const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kRowMajor) ||
+ (a_transpose != Transpose::kNo && layout != Layout::kRowMajor));
+
+ // Loops over the blocks
+ auto col = n; // the initial column position
+ for (auto i = size_t{0}; i < n; i += TRSV_BLOCK_SIZE) {
+ const auto block_size = std::min(TRSV_BLOCK_SIZE, n - i);
+
+ if (!is_transposed) {
+
+ // Sets the offsets for upper or lower triangular
+ col = (triangle == Triangle::kUpper) ? col - block_size : i;
+ const auto extra_offset_a = (triangle == Triangle::kUpper) ? col + (col+block_size)*a_ld : col;
+ const auto extra_offset_x = (triangle == Triangle::kUpper) ? (col+block_size)*x_inc : 0;
+ const auto extra_offset_b = col*x_inc;
+
+ // Runs the GEMV routine to compute x' = A * x
+ if (i > 0) {
+ DoGemv(layout, a_transpose, block_size, i, ConstantOne<T>(),
+ a_buffer, a_offset + extra_offset_a, a_ld,
+ x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+ x_buffer, x_offset + extra_offset_b, x_inc );
+ }
+ }
+ else {
+
+ // Sets the offsets for upper or lower triangular
+ col = (triangle == Triangle::kLower) ? col - block_size : i;
+ const auto extra_offset_a = (triangle == Triangle::kLower) ? col+block_size + col*a_ld : col*a_ld;
+ const auto extra_offset_x = (triangle == Triangle::kLower) ? (col+block_size)*x_inc : 0;
+ const auto extra_offset_b = col*x_inc;
+
+ // Runs the GEMV routine to compute x' = A * x
+ if (i > 0) {
+ DoGemv(layout, a_transpose, i, block_size, ConstantOne<T>(),
+ a_buffer, a_offset + extra_offset_a, a_ld,
+ x_buffer, x_offset + extra_offset_x, x_inc, ConstantOne<T>(),
+ x_buffer, x_offset + extra_offset_b, x_inc );
+ }
+ }
+
+ // Runs the triangular substitution for the block size
+ Substitution(layout, triangle, a_transpose, diagonal, block_size,
+ a_buffer, a_offset + col + col*a_ld, a_ld,
+ b_buffer, b_offset + col*b_inc, b_inc,
+ x_buffer, x_offset + col*x_inc, x_inc);
+ }
- // TODO: Implement the routine
+ // Retrieves the results
+ x_buffer.CopyTo(queue_, x_size, b_buffer);
}
// =================================================================================================
diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp
index 4a73b5eb..dc3f32f0 100644
--- a/src/routines/level2/xtrsv.hpp
+++ b/src/routines/level2/xtrsv.hpp
@@ -7,7 +7,9 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file implements the Xtrsv routine.
+// This file implements the Xtrsv routine. It uses a block-algorithm and performs small triangular
+// forward and backward substitutions on the diagonal parts of the matrix in combination with larger
+// GEMV computation on the remainder of the matrix.
//
// =================================================================================================
@@ -25,9 +27,12 @@ class Xtrsv: public Xgemv<T> {
public:
// Uses the generic matrix-vector routine
+ using Xgemv<T>::routine_name_;
using Xgemv<T>::queue_;
using Xgemv<T>::context_;
- using Xgemv<T>::MatVec;
+ using Xgemv<T>::device_;
+ using Xgemv<T>::db_;
+ using Xgemv<T>::DoGemv;
// Constructor
Xtrsv(Queue &queue, EventPointer event, const std::string &name = "TRSV");
@@ -38,6 +43,14 @@ class Xtrsv: public Xgemv<T> {
const size_t n,
const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
+
+ // Performs forward or backward substitution on a small triangular matrix
+ void Substitution(const Layout layout, const Triangle triangle,
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t n,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_inc,
+ const Buffer<T> &x_buffer, const size_t offset_x, const size_t x_inc);
};
// =================================================================================================