summaryrefslogtreecommitdiff
path: root/src/kernels/level2/xtrsv.opencl
diff options
context:
space:
mode:
Diffstat (limited to 'src/kernels/level2/xtrsv.opencl')
-rw-r--r--src/kernels/level2/xtrsv.opencl94
1 files changed, 94 insertions, 0 deletions
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
new file mode 100644
index 00000000..00f29e47
--- /dev/null
+++ b/src/kernels/level2/xtrsv.opencl
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to perform forward or backward substition, as used in the TRSV routine
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_TRSV)
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
+void FillVector(const int n, const int inc, const int offset,
+ __global real* restrict dest, const real_arg arg_value) {
+ const real value = GetRealArg(arg_value);
+ const int tid = get_global_id(0);
+ if (tid < n) {
+ dest[tid*inc + offset] = value;
+ }
+}
+
+// =================================================================================================
+
+// TODO: Put variable in database
+#define TRSV_BLOCK_SIZE 256
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_forward(int n,
+ const __global float *A, const int a_offset, int lda,
+ __global float *b, const int b_offset, int b_inc,
+ __global float *x, const int x_offset, int x_inc,
+ const int is_transposed, const int is_unit_diagonal) {
+ __local float sx[TRSV_BLOCK_SIZE];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int i = 0; i < n; ++i) {
+ real sum = b[i*b_inc + b_offset];
+ for (int j = 0; j < i; ++j) {
+ real a_value;
+ if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
+ else { a_value = A[j + i*lda + a_offset]; }
+ sum -= a_value * sx[j];
+ }
+ sum -= x[i*x_inc + x_offset];
+ if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+ sx[i] = sum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ for (int i = 0; i < n; ++i) {
+ x[i*x_inc + x_offset] = sx[i];
+ }
+}
+
+__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
+void trsv_backward(int n,
+ const __global float *A, const int a_offset, int lda,
+ __global float *b, const int b_offset, int b_inc,
+ __global float *x, const int x_offset, int x_inc,
+ const int is_trans, const int is_unit_diagonal) {
+ __local float sx[TRSV_BLOCK_SIZE];
+ barrier(CLK_LOCAL_MEM_FENCE);
+ for (int i = n - 1; i >= 0; --i) {
+ real sum = b[i*b_inc + b_offset];
+ for (int j = i + 1; j < n; ++j) {
+ real a_value;
+ if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
+ else { a_value = A[j + i*lda + a_offset]; }
+ sum -= a_value * sx[j];
+ }
+ sum -= x[i*x_inc + x_offset];
+ if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
+ sx[i] = sum;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ for (int i = 0; i < n; ++i) {
+ x[i*x_inc + x_offset] = sx[i];
+ }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================