summaryrefslogtreecommitdiff
path: root/src/kernels/level1/xamax.opencl
blob: 7cbbd6b5695897938c6085fab9fcc674849d7ed8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file contains the Xamax kernel. It implements index of (absolute) min/max computation using
// reduction kernels. Reduction is split in two parts. In the first (main) kernel the X vector is
// loaded, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
// is executed with a single workgroup only, computing the final result.
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library.
#ifndef WGS1
  #define WGS1 64     // The local work-group size of the main kernel
#endif
#ifndef WGS2
  #define WGS2 64     // The local work-group size of the epilogue kernel
#endif

// =================================================================================================

// The main reduction kernel, performing the loading and the majority of the operation
#if RELAX_WORKGROUP_SIZE == 1
  __kernel
#else
  __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
#endif
void Xamax(const int n,
           const __global real* restrict xgm, const int x_offset, const int x_inc,
           __global singlereal* maxgm, __global unsigned int* imaxgm) {
  __local singlereal maxlm[WGS1];
  __local unsigned int imaxlm[WGS1];
  const int lid = get_local_id(0);
  const int wgid = get_group_id(0);
  const int num_groups = get_num_groups(0);

  // Performs loading and the first steps of the reduction
  #if defined(ROUTINE_MAX) || defined(ROUTINE_MIN) || defined(ROUTINE_AMIN)
    singlereal max = SMALLEST;
  #else
    singlereal max = ZERO;
  #endif
  unsigned int imax = 0;
  int id = wgid*WGS1 + lid;
  while (id < n) {
    const int x_index = id*x_inc + x_offset;
    #if PRECISION == 3232 || PRECISION == 6464
      singlereal x = fabs(xgm[x_index].x) + fabs(xgm[x_index].y);
    #else
      singlereal x = xgm[x_index];
    #endif
    #if defined(ROUTINE_MAX) // non-absolute maximum version
      // nothing special here
    #elif defined(ROUTINE_MIN) // non-absolute minimum version
      x = -x;
    #elif defined(ROUTINE_AMIN) // absolute minimum version
      x = -fabs(x);
    #else
      x = fabs(x);
    #endif
    if (x > max) {
      max = x;
      imax = id;
    }
    id += WGS1*num_groups;
  }
  maxlm[lid] = max;
  imaxlm[lid] = imax;
  barrier(CLK_LOCAL_MEM_FENCE);

  // Performs reduction in local memory
  for (int s=WGS1/2; s>0; s=s>>1) {
    if (lid < s) {
      if (maxlm[lid + s] > maxlm[lid]) {
        maxlm[lid] = maxlm[lid + s];
        imaxlm[lid] = imaxlm[lid + s];
      }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the per-workgroup result
  if (lid == 0) {
    maxgm[wgid] = maxlm[0];
    imaxgm[wgid] = imaxlm[0];
  }
}

// =================================================================================================

// The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
// be launched with a single workgroup only.
#if RELAX_WORKGROUP_SIZE == 1
  __kernel
#else
  __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
#endif
void XamaxEpilogue(const __global singlereal* restrict maxgm,
                   const __global unsigned int* restrict imaxgm,
                   __global unsigned int* imax, const int imax_offset) {
  __local singlereal maxlm[WGS2];
  __local unsigned int imaxlm[WGS2];
  const int lid = get_local_id(0);

  // Performs the first step of the reduction while loading the data
  if (maxgm[lid + WGS2] > maxgm[lid]) {
    maxlm[lid] = maxgm[lid + WGS2];
    imaxlm[lid] = imaxgm[lid + WGS2];
  }
  else {
    maxlm[lid] = maxgm[lid];
    imaxlm[lid] = imaxgm[lid];
  }
  barrier(CLK_LOCAL_MEM_FENCE);

  // Performs reduction in local memory
  for (int s=WGS2/2; s>0; s=s>>1) {
    if (lid < s) {
      if (maxlm[lid + s] > maxlm[lid]) {
        maxlm[lid] = maxlm[lid + s];
        imaxlm[lid] = imaxlm[lid + s];
      }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Stores the final result
  if (lid == 0) {
    imax[imax_offset] = imaxlm[0];
  }
}

// =================================================================================================

// End of the C++11 raw string literal
)"

// =================================================================================================