diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-04 22:48:06 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-04 22:48:06 +0100 |
commit | c209dd7af90d604c8210cc5680b6c7a50b2b995f (patch) | |
tree | 4987a7af1eb34efd6bb6cfcdf249bff01e0073ec | |
parent | fec8c1a8069a2307b8d3aba118ebb61b94871996 (diff) |
Improved substition kernels a bit; added complex support
-rw-r--r-- | src/kernels/common.opencl | 14 | ||||
-rw-r--r-- | src/kernels/level2/xtrsv.opencl | 86 | ||||
-rw-r--r-- | src/routines/level2/xtrsv.cpp | 2 |
3 files changed, 70 insertions, 32 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index bfa1042d..8e59a5fe 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -176,6 +176,13 @@ R"( #define Add(c, a, b) c = a + b #endif +// Subtracts two complex variables +#if PRECISION == 3232 || PRECISION == 6464 + #define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y +#else + #define Subtract(c, a, b) c = a - b +#endif + // Multiply two complex variables (used in the defines below) #if PRECISION == 3232 || PRECISION == 6464 #define MulReal(a, b) a.x*b.x - a.y*b.y @@ -200,6 +207,13 @@ R"( #endif #endif +// The scalar multiply-subtract function +#if PRECISION == 3232 || PRECISION == 6464 + #define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b) +#else + #define MultiplySubtract(c, a, b) c -= a * b +#endif + // The scalar division function #if PRECISION == 3232 || PRECISION == 6464 #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.x diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl index 01bd6ba5..fd5de200 100644 --- a/src/kernels/level2/xtrsv.opencl +++ b/src/kernels/level2/xtrsv.opencl @@ -41,67 +41,89 @@ void FillVector(const int n, const int inc, const int offset, __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) void trsv_forward(int n, - const __global real *A, const int a_offset, int lda, + const __global real *A, const int a_offset, int a_ld, __global real *b, const int b_offset, int b_inc, __global real *x, const int x_offset, int x_inc, - const int is_transposed, const int is_unit_diagonal) { - __local real sx[TRSV_BLOCK_SIZE]; + const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; const int tid = get_local_id(0); + + // Pre-loads the data into local memory if (tid < n) { - sx[tid] = b[tid*b_inc + b_offset]; + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); } + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } } barrier(CLK_LOCAL_MEM_FENCE); - for (int i = 0; i < n; ++i) { - if (tid == 0) { - real sum = sx[i]; + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = 0; i < n; ++i) { for (int j = 0; j < i; ++j) { - real a_value; - if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; } - else { a_value = A[j + i*lda + a_offset]; } - sum -= a_value * sx[j]; + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); } - sum -= x[i*x_inc + x_offset]; - if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; } - sx[i] = sum; + if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); } } - barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results if (tid < n) { - x[tid*x_inc + x_offset] = sx[tid]; + x[tid*x_inc + x_offset] = xlm[tid]; } } __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1))) void trsv_backward(int n, - const __global real *A, const int a_offset, int lda, + const __global real *A, const int a_offset, int a_ld, __global real *b, const int b_offset, int b_inc, __global real *x, const int x_offset, int x_inc, - const int is_trans, const int is_unit_diagonal) { - __local real sx[TRSV_BLOCK_SIZE]; + const int is_transposed, const int is_unit_diagonal, const int do_conjugate) { + __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE]; + __local real xlm[TRSV_BLOCK_SIZE]; const int tid = get_local_id(0); + + // Pre-loads the data into local memory if (tid < n) { - sx[tid] = b[tid*b_inc + b_offset]; + Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]); + if (is_transposed == 0) { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[i + tid*a_ld + a_offset]; + if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); } + } + } + else { + for (int i = 0; i < n; ++i) { + alm[i][tid] = A[tid + i*a_ld + a_offset]; + } + } } barrier(CLK_LOCAL_MEM_FENCE); - for (int i = n - 1; i >= 0; --i) { - if (tid == 0) { - real sum = sx[i]; + + // Computes the result (single-threaded for now) + if (tid == 0) { + for (int i = n - 1; i >= 0; --i) { for (int j = i + 1; j < n; ++j) { - real a_value; - if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; } - else { a_value = A[j + i*lda + a_offset]; } - sum -= a_value * sx[j]; + MultiplySubtract(xlm[i], alm[i][j], xlm[j]); } - sum -= x[i*x_inc + x_offset]; - if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; } - sx[i] = sum; + if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); } } - barrier(CLK_LOCAL_MEM_FENCE); } barrier(CLK_LOCAL_MEM_FENCE); + + // Stores the results if (tid < n) { - x[tid*x_inc + x_offset] = sx[tid]; + x[tid*x_inc + x_offset] = xlm[tid]; } } diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp index 6e651e33..b0e4c5ae 100644 --- a/src/routines/level2/xtrsv.cpp +++ b/src/routines/level2/xtrsv.cpp @@ -44,6 +44,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || (a_transpose != Transpose::kNo && layout != Layout::kColMajor)) ? 0 : 1; + const auto do_conjugate = (a_transpose == Transpose::kConjugate) ? 1 : 0; // The data is either in the upper or lower triangle const auto is_upper = ((triangle == Triangle::kUpper && a_transpose == Transpose::kNo) || @@ -66,6 +67,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, kernel.SetArgument(9, static_cast<int>(x_inc)); kernel.SetArgument(10, static_cast<int>(is_transposed)); kernel.SetArgument(11, static_cast<int>(is_unit_diagonal)); + kernel.SetArgument(12, static_cast<int>(do_conjugate)); // Launches the kernel const auto local = std::vector<size_t>{db_["TRSV_BLOCK_SIZE"]}; |