Improved substition kernels a bit; added complex support

author: Cedric Nugteren <web@cedricnugteren.nl> 2017-02-04 22:48:06 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-02-04 22:48:06 +0100
commit: c209dd7af90d604c8210cc5680b6c7a50b2b995f (patch)
tree: 4987a7af1eb34efd6bb6cfcdf249bff01e0073ec /src/kernels
parent: fec8c1a8069a2307b8d3aba118ebb61b94871996 (diff)
2 files changed, 68 insertions, 32 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index bfa1042d..8e59a5fe 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -176,6 +176,13 @@ R"(
   #define Add(c, a, b) c = a + b
 #endif
 
+// Subtracts two complex variables
+#if PRECISION == 3232 || PRECISION == 6464
+  #define Subtract(c, a, b) c.x = a.x - b.x; c.y = a.y - b.y
+#else
+  #define Subtract(c, a, b) c = a - b
+#endif
+
 // Multiply two complex variables (used in the defines below)
 #if PRECISION == 3232 || PRECISION == 6464
   #define MulReal(a, b) a.x*b.x - a.y*b.y
@@ -200,6 +207,13 @@ R"(
   #endif
 #endif
 
+// The scalar multiply-subtract function
+#if PRECISION == 3232 || PRECISION == 6464
+  #define MultiplySubtract(c, a, b) c.x -= MulReal(a,b); c.y -= MulImag(a,b)
+#else
+  #define MultiplySubtract(c, a, b) c -= a * b
+#endif
+
 // The scalar division function
 #if PRECISION == 3232 || PRECISION == 6464
   #define DivideReal(c, a, b) c.x = a.x / b.x; c.y = a.x
diff --git a/src/kernels/level2/xtrsv.opencl b/src/kernels/level2/xtrsv.opencl
index 01bd6ba5..fd5de200 100644
--- a/src/kernels/level2/xtrsv.opencl
+++ b/src/kernels/level2/xtrsv.opencl
@@ -41,67 +41,89 @@ void FillVector(const int n, const int inc, const int offset,
 
 __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
 void trsv_forward(int n,
-                  const __global real *A, const int a_offset, int lda,
+                  const __global real *A, const int a_offset, int a_ld,
                   __global real *b, const int b_offset, int b_inc,
                   __global real *x, const int x_offset, int x_inc,
-                  const int is_transposed, const int is_unit_diagonal) {
-  __local real sx[TRSV_BLOCK_SIZE];
+                  const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
+  __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
+  __local real xlm[TRSV_BLOCK_SIZE];
   const int tid = get_local_id(0);
+
+  // Pre-loads the data into local memory
   if (tid < n) {
-    sx[tid] = b[tid*b_inc + b_offset];
+    Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
+    if (is_transposed == 0) {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[i + tid*a_ld + a_offset];
+        if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
+      }
+    }
+    else {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[tid + i*a_ld + a_offset];
+      }
+    }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for (int i = 0; i < n; ++i) {
-    if (tid == 0) {
-      real sum = sx[i];
+
+  // Computes the result (single-threaded for now)
+  if (tid == 0) {
+    for (int i = 0; i < n; ++i) {
       for (int j = 0; j < i; ++j) {
-        real a_value;
-        if (is_transposed == 0) { a_value = A[i + j*lda + a_offset]; }
-        else { a_value = A[j + i*lda + a_offset]; }
-        sum -= a_value * sx[j];
+        MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
       }
-      sum -= x[i*x_inc + x_offset];
-      if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
-      sx[i] = sum;
+      if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Stores the results
   if (tid < n) {
-    x[tid*x_inc + x_offset] = sx[tid];
+    x[tid*x_inc + x_offset] = xlm[tid];
   }
 }
 
 __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
 void trsv_backward(int n,
-                   const __global real *A, const int a_offset, int lda,
+                   const __global real *A, const int a_offset, int a_ld,
                    __global real *b, const int b_offset, int b_inc,
                    __global real *x, const int x_offset, int x_inc,
-                   const int is_trans, const int is_unit_diagonal) {
-  __local real sx[TRSV_BLOCK_SIZE];
+                   const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
+  __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
+  __local real xlm[TRSV_BLOCK_SIZE];
   const int tid = get_local_id(0);
+
+  // Pre-loads the data into local memory
   if (tid < n) {
-    sx[tid] = b[tid*b_inc + b_offset];
+    Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
+    if (is_transposed == 0) {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[i + tid*a_ld + a_offset];
+        if (do_conjugate) { COMPLEX_CONJUGATE(alm[i][tid]); }
+      }
+    }
+    else {
+      for (int i = 0; i < n; ++i) {
+        alm[i][tid] = A[tid + i*a_ld + a_offset];
+      }
+    }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
-  for (int i = n - 1; i >= 0; --i) {
-    if (tid == 0) {
-      real sum = sx[i];
+
+  // Computes the result (single-threaded for now)
+  if (tid == 0) {
+    for (int i = n - 1; i >= 0; --i) {
       for (int j = i + 1; j < n; ++j) {
-        real a_value;
-        if (is_trans == 0) { a_value = A[i + j*lda + a_offset]; }
-        else { a_value = A[j + i*lda + a_offset]; }
-        sum -= a_value * sx[j];
+        MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
       }
-      sum -= x[i*x_inc + x_offset];
-      if (is_unit_diagonal == 0) { sum /= A[i + i*lda + a_offset]; }
-      sx[i] = sum;
+      if (is_unit_diagonal == 0) { DivideReal(xlm[i], xlm[i], alm[i][i]); }
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Stores the results
   if (tid < n) {
-    x[tid*x_inc + x_offset] = sx[tid];
+    x[tid*x_inc + x_offset] = xlm[tid];
   }
 }
author	Cedric Nugteren <web@cedricnugteren.nl>	2017-02-04 22:48:06 +0100
committer	Cedric Nugteren <web@cedricnugteren.nl>	2017-02-04 22:48:06 +0100
commit	c209dd7af90d604c8210cc5680b6c7a50b2b995f (patch)
tree	4987a7af1eb34efd6bb6cfcdf249bff01e0073ec /src/kernels
parent	fec8c1a8069a2307b8d3aba118ebb61b94871996 (diff)