summaryrefslogtreecommitdiff
path: root/src/kernels
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-02-04 22:48:06 +0100
committerCedric Nugteren <web@cedricnugteren.nl>2017-02-04 22:48:06 +0100
commitc209dd7af90d604c8210cc5680b6c7a50b2b995f (patch)
tree4987a7af1eb34efd6bb6cfcdf249bff01e0073ec /src/kernels
parentfec8c1a8069a2307b8d3aba118ebb61b94871996 (diff)
Improved substition kernels a bit; added complex support
Diffstat (limited to 'src/kernels')
-rw-r--r--src/kernels/common.opencl14
-rw-r--r--src/kernels/level2/xtrsv.opencl86
2 files changed, 68 insertions, 32 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index bfa1042d..8e59a5fe 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -176,6 +176,13 @@ R"(
#define Add(c, a, b) c = a + b
#endif
+// Subtracts two complex variables
+#if PRECISION == 3232 || PRECISION == 6464
+ #define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y
+#else
+ #define Subtract(c, a, b) c = a - b
+#endif
+
// Multiply two complex variables (used in the defines below)
#if PRECISION == 3232 || PRECISION == 6464
#define MulReal(a, b) a.x*b.x - a.y*b.y
@@ -200,6 +207,13 @@ R"(
#endif
#endif
+// The scalar multiply-subtract function
+#if PRECISION == 3232 || PRECISION == 6464
+ #define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
+#else
+ #define MultiplySubtract(c, a, b) c -= a * b
+#endif
+
// The scalar division function
#if PRECISION == 3232 || PRECISION == 6464
#define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.x
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index 01bd6ba5..fd5de200 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -41,67 +41,89 @@ void FillVector(const int n, const int inc, const int offset,
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_forward(int n,
- const __global real *A, const int a_offset, int lda,
+ const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
- const int is_transposed, const int is_unit_diagonal) {
- __local real sx[TRSV_BLOCK_SIZE];
+ const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
+ __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
+ __local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
+
+ // Pre-loads the data into local memory
if (tid < n) {
- sx[tid] = b[tid*b_inc + b_offset];
+ Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
+ if (is_transposed == 0) {
+ for (int i = 0; i < n; ++i) {
+ alm[i][tid] = A[i + tid*a_ld + a_offset];
+ if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
+ }
+ }
+ else {
+ for (int i = 0; i < n; ++i) {
+ alm[i][tid] = A[tid + i*a_ld + a_offset];
+ }
+ }
}
barrier(CLK_LOCAL_MEM_FENCE);
- for (int i = 0; i < n; ++i) {
- if (tid == 0) {
- real sum = sx[i];
+
+ // Computes the result (single-threaded for now)
+ if (tid == 0) {
+ for (int i = 0; i < n; ++i) {
for (int j = 0; j < i; ++j) {
- real a_value;
- if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
- else { a_value = A[j + i*lda + a_offset]; }
- sum -= a_value * sx[j];
+ MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
- sum -= x[i*x_inc + x_offset];
- if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
- sx[i] = sum;
+ if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
}
- barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Stores the results
if (tid < n) {
- x[tid*x_inc + x_offset] = sx[tid];
+ x[tid*x_inc + x_offset] = xlm[tid];
}
}
__kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
void trsv_backward(int n,
- const __global real *A, const int a_offset, int lda,
+ const __global real *A, const int a_offset, int a_ld,
__global real *b, const int b_offset, int b_inc,
__global real *x, const int x_offset, int x_inc,
- const int is_trans, const int is_unit_diagonal) {
- __local real sx[TRSV_BLOCK_SIZE];
+ const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
+ __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
+ __local real xlm[TRSV_BLOCK_SIZE];
const int tid = get_local_id(0);
+
+ // Pre-loads the data into local memory
if (tid < n) {
- sx[tid] = b[tid*b_inc + b_offset];
+ Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
+ if (is_transposed == 0) {
+ for (int i = 0; i < n; ++i) {
+ alm[i][tid] = A[i + tid*a_ld + a_offset];
+ if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
+ }
+ }
+ else {
+ for (int i = 0; i < n; ++i) {
+ alm[i][tid] = A[tid + i*a_ld + a_offset];
+ }
+ }
}
barrier(CLK_LOCAL_MEM_FENCE);
- for (int i = n - 1; i >= 0; --i) {
- if (tid == 0) {
- real sum = sx[i];
+
+ // Computes the result (single-threaded for now)
+ if (tid == 0) {
+ for (int i = n - 1; i >= 0; --i) {
for (int j = i + 1; j < n; ++j) {
- real a_value;
- if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
- else { a_value = A[j + i*lda + a_offset]; }
- sum -= a_value * sx[j];
+ MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
}
- sum -= x[i*x_inc + x_offset];
- if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
- sx[i] = sum;
+ if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
}
- barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Stores the results
if (tid < n) {
- x[tid*x_inc + x_offset] = sx[tid];
+ x[tid*x_inc + x_offset] = xlm[tid];
}
}