summaryrefslogtreecommitdiff
path: root/src/kernels/level2/xtrsv.opencl
blob: 7677377e10422c45a8f6ffbe9c7c90a34bef1982 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains kernels to perform forward or backward substition, as used in the TRSV routine
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// =================================================================================================
#if defined(ROUTINE_TRSV)

__kernel
void FillVector(const int n, const int inc, const int offset,
                __global real* restrict dest, const real_arg arg_value) {
  const real value = GetRealArg(arg_value);
  const int tid = get_global_id(0);
  if (tid < n) {
    dest[tid*inc + offset] = value;
  }
}

// =================================================================================================

// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.

#ifndef TRSV_BLOCK_SIZE
  #define TRSV_BLOCK_SIZE 32    // The block size for forward or backward substition
#endif

// =================================================================================================

#if RELAX_WORKGROUP_SIZE == 1
  __kernel
#else
  __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
#endif
void trsv_forward(int n,
                  const __global real *A, const int a_offset, int a_ld,
                  __global real *b, const int b_offset, int b_inc,
                  __global real *x, const int x_offset, int x_inc,
                  const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
  __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
  __local real xlm[TRSV_BLOCK_SIZE];
  const int tid = get_local_id(0);

  // Pre-loads the data into local memory
  if (tid < n) {
    Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
    if (is_transposed == 0) {
      for (int i = 0; i < n; ++i) {
        alm[i][tid] = A[i + tid*a_ld + a_offset];
      }
    }
    else {
      for (int i = 0; i < n; ++i) {
        alm[i][tid] = A[tid + i*a_ld + a_offset];
      }
    }
    if (do_conjugate) {
      for (int i = 0; i < n; ++i) {
        COMPLEX_CONJUGATE(alm[i][tid]);
      }
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  // Computes the result (single-threaded for now)
  if (tid == 0) {
    for (int i = 0; i < n; ++i) {
      for (int j = 0; j < i; ++j) {
        MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
      }
      if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  // Stores the results
  if (tid < n) {
    x[tid*x_inc + x_offset] = xlm[tid];
  }
}

#if RELAX_WORKGROUP_SIZE == 1
  __kernel
#else
  __kernel __attribute__((reqd_work_group_size(TRSV_BLOCK_SIZE, 1, 1)))
#endif
void trsv_backward(int n,
                   const __global real *A, const int a_offset, int a_ld,
                   __global real *b, const int b_offset, int b_inc,
                   __global real *x, const int x_offset, int x_inc,
                   const int is_transposed, const int is_unit_diagonal, const int do_conjugate) {
  __local real alm[TRSV_BLOCK_SIZE][TRSV_BLOCK_SIZE];
  __local real xlm[TRSV_BLOCK_SIZE];
  const int tid = get_local_id(0);

  // Pre-loads the data into local memory
  if (tid < n) {
    Subtract(xlm[tid], b[tid*b_inc + b_offset], x[tid*x_inc + x_offset]);
    if (is_transposed == 0) {
      for (int i = 0; i < n; ++i) {
        alm[i][tid] = A[i + tid*a_ld + a_offset];
      }
    }
    else {
      for (int i = 0; i < n; ++i) {
        alm[i][tid] = A[tid + i*a_ld + a_offset];
      }
    }
    if (do_conjugate) {
      for (int i = 0; i < n; ++i) {
        COMPLEX_CONJUGATE(alm[i][tid]);
      }
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  // Computes the result (single-threaded for now)
  if (tid == 0) {
    for (int i = n - 1; i >= 0; --i) {
      for (int j = i + 1; j < n; ++j) {
        MultiplySubtract(xlm[i], alm[i][j], xlm[j]);
      }
      if (is_unit_diagonal == 0) { DivideFull(xlm[i], xlm[i], alm[i][i]); }
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  // Stores the results
  if (tid < n) {
    x[tid*x_inc + x_offset] = xlm[tid];
  }
}

#endif
// =================================================================================================

// End of the C++11 raw string literal
)"

// =================================================================================================