summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/include/dblock_kgen.h
blob: 88ddd5ee59ddbb7ea22993dd11908126e50e1c93 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/


/*
 * Common generators for functions manipulating
 * with data blocks placed in the global, local,
 * or private memory.
 */

/*
 * TODO: add the unroll option to 'rwMatrBlockGen'
 *       and 'smulMatrBlockGen'
 */

#ifndef DBLOCK_KGEN_H_
#define DBLOCK_KGEN_H_

#include <cltypes.h>
#include <kerngen.h>

/**
 * @internal
 * @defgroup MAJOR_GENS Major common used generators
 */
/*@{*/

/**
 * @internal
 * @brief Data block copying directions
 */
typedef enum DBlockCopyDirection {
    /** Copy from the global to the local memory */
    DBLOCK_GLOBAL_TO_LOCAL,
    /** Copy from the local to the global memory */
    DBLOCK_LOCAL_TO_GLOBAL,
    /** Copy from the global memory to an image */
    DBLOCK_GLOBAL_TO_IMAGE,
    /** Copy from the local memory to an image */
    DBLOCK_LOCAL_TO_IMAGE
} DBlockCopyDirection;

/**
 * @internal
 * @brief Data block copying flags
 */
typedef enum DBlockCopyFlags {
    DBLOCK_COPY_TRANSPOSE = 0x1,        /**< Transpose 2D block */
    /** pack several rows into single image row */
    DBLOCK_COPY_PACKED_IMAGE = 0x2,
    DBLOCK_COPY_CONJUGATE = 0x4,        /**< Conjugate complex elements */
    DBLOCK_COPY_NOT_VECTORIZE = 0x8     /**< Disable vectorized copying */
} DBlockCopyFlags;

/**
 * @internal
 * @brief Generator to copy data blocks between different kinds
 *        of memory
 *
 * @param[out] ctx              Generator context
 * @param[in] dim               Subproblem dimension to generate a function for
 * @param[in] pgran             Data parallelism granularity
 * @param[in] dtype             Data type
 * @param[in] dir               Copying direction
 * @param[in] flags             Copying flags; when an image is used as destination
 *                              block transposing is prohibited
 *
 * If 'dim' is set to NULL a generic version working with subproblem
 * of any dimension is generated. In the case specific work group
 * sizes are ignored, only work group dimension is used.
 *
 * 'x' field of the passed SuproblemDim structure should contain
 *     the block width
 * 'y' should contain the block height
 *
 * Copied blocks can be as well one as two dimensional. For any one
 * dimensional block 'y' field of the dimension structure should be
 * set to 1. If a block is two dimensional, and the local memory is \n
 * the source or destination memory, the block's rows must be aligned
 * to float4 boundary.
 *
 * Rows of the matrix block must be aligned to float4 boundary. \n
 *
 * Generated functions have the following definitions: \n
 *\n
 * Buffer-buffer copying function for optimal blocks: \n
 * @code
 * void
 * funcName(
 *     <Unified pointer type> dst,
 *     <Unified pointer type> src,
 *     size_t startRow,
 *     size_t startCol,
 *     size_t ld)
 * @endcode
 *
 * The unified pointer types can be GPtr if the global memory is used or LPtr
 * is the local memory is used respectively
 * (See the "Data types in kernels" section). Function naming rule is follow: \n
 * (type prefix)copyDBlock['Transp']['Conj']['Nvec'](src mem][dst mem]
 * [block height][block width] \n
 * The 'Nvec' suffix is added if vectorized copying is prohibited.\n
 *\n
 * Buffer-buffer copying function, generic version: \n
 * @code
 * void
 * funcName(
 *     <Unified pointer type> dst,
 *     <Unified pointer type> src,
 *     size_t startRow,
 *     size_t startCol,
 *     size_t nrRows,
 *     size_t nrCols,
 *     size_t dstLD,
 *     size_t srcLD)
 * @endcode
 *
 * Here "dstLD" is destination leading dimension, "srcLD" - source leading
 * dimension. \n
 * Naming rule is the same as for the function above except block sizes. \n
 *\n
 * Function copying optimal blocks from the global memory to an image: \n
 * @code
 * void
 * funcName(
 *     __write_only image2d_t dst,
 *     size_t startX,
 *     size_t startY,
 *     GPtr src,
 *     size_t startRow,
 *     size_t startCol,
 *     size_t ld)
 * @endcode
 * 'start' and 'startY' arguments is start X and Y coordinate in the image to
 * write from. The generic version has the analogous definition, and takes two
 * additional arguments 'nrRows' and 'nrCols' of the size_t type following just
 * fter the 'startCol' argument. \n
 *\n
 * Function copying optimal blocks from the local memory to an image: \n
 * @code
 * void
 * funcName(
 *     __write_only image2d_t dst,
 *     size_t startX,
 *     size_t startY,
 *     LPtr src)
 * @endcode
 * The generic version takes two additional arguments 'nrRows' and 'nrCols' of the
 * size_t type following just after the 'src' argument.
 *
 * @return 0 on success; on error returns negated error code:
 *
 *      - -EINVAL: unsupported data type is passed, or
 *               'DBLOCK_COPY_TRANSPOSE' is set when
 *               an image is used as destination
 *      - -ENOTSUP: unsupported copying direction is passed
 *      - -EOVEFFLOW: code buffer overflowed
 */
int
copyDataBlockGen(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    const PGranularity *pgran,
    DataType dtype,
    DBlockCopyDirection dir,
    DBlockCopyFlags flags);

/*@}*/

/*
 * Zero data block in the local or global memory
 *
 * @ctx: generator context
 * @dim: Subproblem dimension to generate the function for
 * @pgran: data parallelism granularity
 * @memPrefix: type of memory to generate the function for
 *
 * The 'memPrefix' field of the passed BlasKernExtra structure
 * should contain the type of memory the buffer is stored in.
 * It cane take one of the "__local", or the "__global" value.
 *
 * 'x' field of the passed SuproblemDim structure should contain
 * the block width in float4 words. In the case the function takes only
 * a buffer pointer. If the field is set to 'SUBDIM_UNUSED'
 * the function is generated without any loop unrollings. In the case
 * the function takes buffer length as the second argument.
 *
 * If 'unroll' is set, the 'bwidth' field of the structure should
 * contain the maximum width of a block zeroed with loop unrolling.
 * If 'unroll' is set but the 'bwidth' is set to 'SUBDIM_UNUSED',
 * the generator don't apply any restriction to loop unrolling.
 * The parameter is ignored if the 'x' field of the 'dim' is set to
 * 'SUBDIM_UNUSED'.
 *
 * On success returns 0, on error returns negated error code:
 *
 *      -EINVAL: wrong memory prefix is passed
 *      -EOVEFFLOW: code buffer overflowed
 */
int
f4zeroBlockGen(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    const PGranularity *pgran,
    const char *memPrefix);

#endif /* DBLOCK_KGEN_H_ */