summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/blas_kgen.h
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/blas_kgen.h')
-rw-r--r--external/clBLAS/src/library/blas/gens/blas_kgen.h910
1 files changed, 0 insertions, 910 deletions
diff --git a/external/clBLAS/src/library/blas/gens/blas_kgen.h b/external/clBLAS/src/library/blas/gens/blas_kgen.h
deleted file mode 100644
index 6fb1410d..00000000
--- a/external/clBLAS/src/library/blas/gens/blas_kgen.h
+++ /dev/null
@@ -1,910 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * Something specific for BLAS generators
- *
- * NOTE:
- * 1) All the BLAS kernel generators should
- * interpret the fields of the SubproblemDim
- * structure as follows:
- * 'y' - rows of matrix A, i. e. M dimension
- * of matrix C
- * 'x' - columns of matrix B and C
- * 'bwidth' - block width in K dimension
- *
- * 2) When generating copy functions and their calls, one should
- * keep in mind that all the matrix blocks are copied into
- * the local memory such that sequentially accessed elements
- * are located sequentially in memory. In this context,
- * transposing means transposing while copying
- * to/from the local memory, not the way the matrix is stored in
- * the array passed to the kernel.
- */
-
-#ifndef BLAS_KGEN_H_
-#define BLAS_KGEN_H_
-
-#include <clBLAS.h>
-
-#include <cltypes.h>
-#include <kerngen.h>
-#include <mempat.h>
-#include <dblock_kgen.h>
-
-#include <blas_funcs.h>
-#include <matrix_props.h>
-
-#include "tile.h"
-#include "fetch.h"
-
-
-#define BLAS_KGEN_FORMAT 1
-
-#define genInternalLoopEnd(ctx) kgenEndBranch(ctx, NULL)
-
-enum {
- MAX_OPENCL_VECTOR_LENGTH = 16
-};
-
-typedef enum TailFetch {
- FETCH_NO_TAILS = 0,
- FETCH_TAIL_ROW = 0x01,
- FETCH_TAIL_COL = 0x02
-} TailFetch;
-
-/**
- * @internal
- * @brief Blas generator flags
- * @ingroup GEN_SETTINGS
- */
-typedef enum BlasGenFlags {
- BGF_EXPLICIT_INLINE = 0x01,
- BGF_DISTINCT_VECLEN = 0x02,
- // TODO: replace with a flag with inverse semantics
- BGF_WHOLE_A = 0x04,
- /** Leading dimensions are in vectors rather than in elements */
- BGF_LD_IN_VECTORS = 0x08,
- /**
- * Objects in the global memory are accessed through the unified pointers.
- * This feature is deprecated and should not be used in new generators.
- * It is kept for backward compatibility.
- */
- BGF_UPTRS = 0x10
-} BlasGenFlags;
-
-/**
- * @internal
- * @brief Flags showing how problem tails are handled
- * @ingroup TAILS_HANDLING
- */
-typedef enum TailStatus {
- /** Tail of the matrix A is raised */
- TAIL_A_RAISED = 0x01,
- /** Tail of the matrix B is raised */
- TAIL_B_RAISED = 0x02
-} TailStatus;
-
-/**
- * @internal
- * @brief Tiles multiplier flags
- * @ingroup BLAS_MAJOR_SUBGENS
- */
-typedef enum TileMulFlags {
- TILEMUL_NO_FLAGS = 0, /**< No flags */
- TILEMUL_TRA = 0x01, /**< Transposed matrix A */
- TILEMUL_TRB = 0x02, /**< Transposed matrix B */
- TILEMUL_CONJA = 0x04, /**< Conjugated elements of A */
- TILEMUL_CONJB = 0x08, /**< Conjugated elements of B */
- TILEMUL_C_COLUMN_MAJOR = 0x10, /**< Column major block for matrix C */
- TILEMUL_NOT_FETCH_B = 0x20, /**< Do not fetch matrix B block */
- TILEMUL_EXTERN_RDECL = 0x40, /**< External register tiles declaration,
- the generator must not declare them
- itself */
-
- /**
- * Deprecated. Use the respective mode that is a part of the FetchAddr mode.
- * It is kept only for backward compatibility, so as not to break working
- * code, and will be removed soon
- */
- TILEMUL_WRAP_AROUND_TAIL = 0x80, /**< Sizes used for column skew are
- rounded to next vecLen bound */
- /** Use global cyclic along subproblem A coordinate.
- * Deprecated. Don't use it */
- TILEMUL_GLOBAL_CYCLIC_A = 0x100,
- /** Use global cyclic along subproblem B coordinate.
- * Deprecated. Don't use it */
- TILEMUL_GLOBAL_CYCLIC_B = 0x200,
- /* Deprecated. Don't use it */
- TILEMUL_GLOBAL_CYCLIC_K = 0x400, /**< Use global cyclic along K */
- /** Use skew along subproblem A coordinate */
- TILEMUL_SKEW_A = 0x800,
- /** Use skew along subproblem B coordinate. Deprecated */
- TILEMUL_SKEW_B = 0x1000,
- /* Deprecated */
- TILEMUL_SKEW_K = 0x2000, /**< Use skew along K */
- /** Use size of whole matrix for cyclic addressing. Deprecated */
- TILEMUL_GLOBAL_CYCLIC = TILEMUL_GLOBAL_CYCLIC_A |
- TILEMUL_GLOBAL_CYCLIC_B |
- TILEMUL_GLOBAL_CYCLIC_K,
- // Deprecated
- TILEMUL_SKEWS = TILEMUL_SKEW_A | TILEMUL_SKEW_B | TILEMUL_SKEW_K,
- /** Optimize coordinate calculations by storing coordinate values */
- // Deprecated
- TILEMUL_OPTIMIZE_COORD_CALC = 0x4000,
- /** Use bwidth0 stride */
- TILEMUL_BW_STRIDE = 0x8000,
- /** Optimize coordinate calculations by using vectors
- * and pointer increments */
- // Deprecated
- TILEMUL_OPTIMIZE_VEC_COORDS = 0x10000,
- /** Do not increment K */
- TILEMUL_NOT_INC_K = 0x20000,
- /**
- * Use variants with explicit vectorization. Useful on platforms with
- * true SIMD.
- */
- TILEMUL_FORCE_VECTORIZATION = 0x40000
-} TileMulFlags;
-
-
-/**
- * @internal
- * @brief Tiles multiplier core
- * @ingroup BLAS_MAJOR_SUBGENS
- */
-typedef enum TileMulCore {
- /** Use multiplication and addition operations */
- TILEMUL_MULADD,
- /** Use the 'dot' function where possible */
- TILEMUL_DOT,
- /** Use the 'mad' function */
- TILEMUL_MAD
-} TileMulCore;
-
-/**
- * @internal
- * @brief Update result operations
- * @ingroup BLAS_MAJOR_SUBGENS
- */
-typedef enum UpdateResultOp {
- /** Just set the values stored in a target buffer */
- UPRES_SET,
- /** Sum the values stored in a target buffer with the temporary result */
- UPRES_SUM
-} UpdateResultOp;
-
-/**
- * @internal
- * @brief Update result generator flags
- * @ingroup BLAS_MAJOR_SUBGENS
- */
-typedef enum UpdateResultFlags {
- /** Resulting matrix is stored in the column major form */
- UPRES_COLUMN_MAJOR = 0x01,
- /** Generic version, non optimal sizes */
- UPRES_GENERIC = 0x02,
- /** Multiply result by beta */
- UPRES_WITH_BETA = 0x04,
- /** Do not multiply by the alpha scalar */
- UPRES_WITHOUT_ALPHA = 0x08,
- /**
- * Destination is private memory;
- * if not set, destination is in the global one
- */
- UPRES_PRIV_DEST = 0x10,
- /** Use the local memory instead of the global memory */
- UPRES_USE_LDS = 0x20,
- /** Generate the inline version */
- UPRES_INLINE = 0x40,
- /** Disable vectorization at memory access */
- UPRES_NO_VECTORIZATION = 0x80,
- /** For the generic version useful data reside at the tile rows' tail */
- UPRES_TAIL_ROW = 0x100,
- /** For the generic version useful data reside at the tile columns' tail */
- UPRES_TAIL_COL = 0x200,
- /** Generate condition whether coordinates don't exceed problem bounds */
- UPRES_EXCEED_PROBLEM_CONDITION = 0x400,
- /** NOTE(review): undocumented in the original; meaning to be confirmed */
- UPRES_INDEXING_WITH_CONSTANTS = 0x800,
- /** Write result to C instead of B for functions with triangular matrix */
- UPRES_TRIANG_WRITE_C = 0x1000
-} UpdateResultFlags;
-
-typedef struct PrivateArea {
- const char *typeName;
- unsigned int vecLen;
- unsigned int size;
-} PrivateArea;
-
-/**
- * @internal
- * @defgroup GEN_SETTINGS Generator settings
- * @ingroup BLAS_GENERATORS
- */
-/*@{*/
-
-/**
- * @internal
- * @brief Kernel variable and argument names
- */
-typedef struct KernelVarNames {
- const char *A; /**< Matrix A variable name */
- const char *B; /**< Matrix B variable name */
- const char *C; /**< NOTE(review): undocumented; presumably matrix C variable name - confirm */
- const char *LDS; /**< LDS pointer name */
- const char *coordA; /**< Variable for subproblem A coordinate */
- const char *coordB; /**< Variable for subproblem B coordinate */
- const char *k; /**< Variable for incrementable K offset value */
- const char *skewA; /**< Variable for skews along A */
- const char *skewB; /**< Variable for skews along B */
- const char *skewK; /**< Variable for skews along K */
- const char *sizeM; /**< Matrix A size M */
- const char *sizeN; /**< Matrix B size N */
- const char *sizeK; /**< Size K of the matrices */
- const char *lda; /**< Leading dimension of matrix A */
- const char *ldb; /**< Leading dimension of matrix B */
- const char *ldc; /**< Leading dimension of matrix C, in vectors */
- const char *vectCoordA; /**< Vector containing indexes of tile a elements
- in matrix A */
- const char *vectCoordB; /**< Vector containing indexes of tile b elements
- in matrix B */
- const char *startM; /**< NOTE(review): startM/startN/startK are undocumented in the original - confirm meaning */
- const char *startN;
- const char *startK;
- const char *alpha; /**< NOTE(review): alpha/beta are undocumented in the original - confirm meaning */
- const char *beta;
-} KernelVarNames;
-
-/**
- * @internal
- * @brief Blas generator settings
- *
- * This structure is designed to be used with most of subgenerators
- * and generator helpers. It is assumed to be initialized once at the
- * generator beginning and modified as few as possible over the rest of
- * the process.
- */
-typedef struct BlasGenSettings {
- /**
- * Subproblem dimensions:
- *
- * work group dimensions are at index 0
- * work item dimensions are at index 1
- */
- SubproblemDim subdims[2];
- const PGranularity *pgran; /**< Data parallelism granularity */
- const CLBLASKernExtra *kextra; /**< Kernel extra */
- BlasGenFlags flags; /**< Global generator flags */
- KernelVarNames varNames; /**< Kernel variables and argument names */
- Tile tileA;
- Tile tileBX;
- Tile tileCY;
-} BlasGenSettings;
-
-/*@}*/
-
-/**
- * @internal
- * @brief Variable names for the inline version of a function updating result
- * @ingroup BLAS_MAJOR_SUBGENS
- */
-typedef struct UpresVarNames {
- const char *result; /**< Name of an output matrix */
- /** Leading dimension of a matrix stored in the global memory */
- const char *ld;
- const char *startRow; /**< Start row to update from */
- const char *startCol; /**< Start column to update from */
- const char *nrRows; /**< Number of rows */
- const char *nrCols; /**< Number of columns */
- const char *cachedName; /**< Name of LDS cached values */
-} UpresVarNames;
-
-/**
- * @internal
- * @brief Options for matrix tiles multiplication generator
- * @ingroup BLAS_MAJOR_SUBGENS
- */
-typedef struct TileMulOpts {
- CLMemType memA; /**< Type of memory matrix A is located in */
- CLMemType memB; /**< Type of memory matrix B is located in */
- TileMulFlags flags; /**< Flags on objects and computing specifics */
- TileMulCore core; /**< Multiply and add core */
- int (*postFetch)(
- struct KgenContext *ctx,
- MatrixRole mrole,
- void *arg); /**< Tile post fetch callback */
- void *postFetchPriv; /**< Postfetch callback's private data */
- struct FetchContext *fctx; /**< NOTE(review): undocumented in the original; usage to be confirmed */
-} TileMulOpts;
-
-typedef struct ZeroFuncs {
- char names[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN];
-} ZeroFuncs;
-
-/**
- * @internal
- * @brief Private data for fetch postprocessing callback
- * @ingroup TAILS_HANDLING
- */
-typedef struct TilePostFetchPrivate {
- BlasFunctionID funcID;
- const BlasGenSettings *gset;
- const char *regName;
- int fetchNumA;
- int wholeA;
-} TilePostFetchPrivate;
-
-void
-getPrivateAreaInfo(
- const BlasGenSettings *gset,
- BlasFunctionID funcID,
- MatrixRole mrole,
- PrivateArea *area);
-
-void
-declarePrivateArea(
- struct KgenContext *ctx,
- const PrivateArea *area,
- const char *baseName,
- PrivateStorageType storType);
-
-/*
- * Declare separately the real and imaginary part of
- * a complex multiplier.
- *
- * @ctx: generator context
- * @baseName: variable's base name matching an existing variable
- * with non-separated parts
- * @typeName: variable type name
- *
- * Naming rule
- * real part: <baseName>R
- * imaginary part: <baseName>I
- *
- * Returns 0 on success, and -EOVERFLOW on source buffer
- * overflow
- */
-int
-declareComplexMultParts(
- struct KgenContext *ctx,
- const char *baseName,
- const char *typeName);
-
-/**
- * @internal
- * @defgroup CHECK_DECOMP_CACL_GRAN Checking decomposition and calculate
- * parallelism granularity
- * @ingroup BLAS_GENERATORS
- */
-
-/*@{*/
-
-/**
- * @brief Sanity check for decomposition
- *
- * @param[in] subdims Subproblem dimensions. 2 levels.
- * @param[in] minSize Minimum size for any of the dimension
- * components
- * @param[in] maxSize Maximum size which can't be exceeded by
- * any of the dimension components at the tile
- * layer
- * @param[in] maxRegs Maximum registers it's allowed to use
- * @param[in] dtype BLAS data type
- * @param[in] wholeA Is matrix A stored in registers entirely or
- * partially
- *
- * The function rejects only decompositions that are completely invalid or lead
- * to consumption of too many registers or just have component values at the
- * tile layer that are out of the range [\b MinSize, \b MaxSize].
- * Completely invalid decompositions are those which don't allow to divide
- * problem integrally among work items, e. g. zeroed components are wrong,
- * the step components (x, y, bwidth) of the 0-th level not integrally
- * divisible on respective size components (itemX, itemY, bwidth) of the 1-st
- * level are wrong as well. The decomposition is also wrong if the size
- * components are not integrally divisible on the step components and not equal
- * to #SUBDIM_UNUSED.
- *
- * @return true if the decomposition is valid, or false otherwise
- */
-bool
-decompSanityCheck(
- const SubproblemDim *subdims,
- unsigned int minSize,
- unsigned int maxSize,
- unsigned int maxRegs,
- DataType dtype,
- bool wholeA);
-
-/**
- * @brief Calculate granularity in case when a work item is responsible
- * for its own part of solution not overlapping with those of other
- * items
- *
- * @param[out] pgran Location to store calculated granularity
- * @param[in] subdims Subproblem dimensions
- * @param[in] xdim Dimension in the OpenCL work space X component
- * of decomposition is mapped on
- * @param[in] level Function BLAS level. Reserved for future use.
- *
- * If value of \b xdim is -1, then the function assumes that OpenCL work
- * space is single dimensional, and puts the product of granularity against
- * X and Y component to 0-th element of \b wgSize field. If its value is
- * 0 or 1, the function assumes that OpenCL work space is 2D and puts
- * granularity against X component to \b xdim element of \b wgSize field
- * of the granularity descriptor. Granularity against Y component is put to
- * 1 - \b xdim element. Other values are invalid and force an abort in debug
- * build. The function initializes the \b wgDim field properly.
- *
- * NOTE: Currently this function is supported only for level 3 and
- * must not be called for level 2
- */
-void
-calcPgranDedicated(
- PGranularity *pgran,
- const SubproblemDim *subdims,
- int xdim,
- int level);
-
-/**
- * @brief Calculate granularity in case when several items evaluate the same
- * part of solution together
- *
- * @param[out] pgran Location to store calculated granularity
- * @param[in] subdims Subproblem dimensions
- * @param[in] xdim Dimension in the OpenCL work space X component
- * of decomposition is mapped on
- * @param[in] ydim Dimension in the OpenCL work space Y component
- * of decomposition is mapped on
- * @param[in] level Function BLAS level. Reserved for future use
- *
- * If \b xdim and \b ydim values are equal, then the function puts the product
- * of granularity against X and Y component to \b xdim element of \b wgSize
- * field. If not, it puts separated granularity for X and Y in \b xdim and
- * \b ydim element respectively. Both the values must be non negative and less
- * than 3 (since OpenCL workspace cannot have more than 3 dimensions).
- * If one of these parameters is zero, then the other one must be zero as well.
- * If one of these parameters is 2, then the other one must be 1. These
- * restrictions are caused by the need to reflect \b bwidth in granularity
- * in case of multidimensional decomposition. For 2D and 3D decompositions
- * granularity for bwidth is calculated as well, and it is always mapped
- * onto 0-th workspace dimension. If any of these parameters is wrong,
- * it forces an abort in debug build. The function sets the \b wgDim field
- * to the maximum of xdim and ydim plus 1.
- *
- * NOTE: Currently this function is supported only for level 3 and
- * must not be called for level 2
- */
-void
-calcPgranCooperative(
- PGranularity *pgran,
- const SubproblemDim *subdims,
- int xdim,
- int ydim,
- int level);
-
-/*@}*/
-
-/**
- * @internal
- * @defgroup COMMON_MATH_OPERATIONS Constructing useful math expression
- * @ingroup BLAS_GENERATORS
- */
-/*@{*/
-
-/**
- * @brief Sprintf a complex MAD operation
- *
- * Operations:
- * - \f$ dst \leftarrow a * b + c \f$
- * - \f$ dst \leftarrow conj(a) * b + c \f$
- * - \f$ dst \leftarrow a * conj(b) + c \f$
- * - \f$ dst \leftarrow conj(a) * conj(b) + c \f$
- *
- * @param[out] expr String object to hold the target expression
- * @param[in] dst Destination argument
- * @param[in] a The first multiplier
- * @param[in] b The second multiplier
- * @param[in] c Added argument
- * @param[in] isDouble If set, the arguments have double precision
- * @param[in] conjA If set, the argument A should be conjugated
- * @param[in] conjB If set, the argument B should be conjugated
- * @param[in] core Multiplying core
- *
- * The \b c argument can be NULL. In this case it is ignored, and the function
- * produces pure multiplication
- */
-void
-sprintfComplexMulUpdate(
- Kstring *expr,
- const Kstring *dst,
- const Kstring *a,
- const Kstring *b,
- const Kstring *c,
- bool isDouble,
- bool conjA,
- bool conjB,
- TileMulCore core);
-
-void
-sprintfComplexMulUpdate_syr2k_beta0(
- Kstring *expr,
- const Kstring *dst,
- const Kstring *a,
- const Kstring *b,
- const Kstring *c,
- bool isDouble,
- bool conjA,
- bool conjB,
- TileMulCore core);
-
-/**
- * @brief Sprintf expression of fast scalar mad
- *
- * @param[out] expr Output expression
- * @param[in] first First multiplier
- * @param[in] second Second multiplier
- * @param[in] scale Scale of the second argument, i.e. its divisor.
- * Ignored if zero.
- * @param[in] third Added argument. Ignored if NULL.
- *
- * It can use mad24. So, expected result should not exceed 2^24
- */
-void
-sprintfFastScalarMad(
- Kstring *expr,
- const Kstring *first,
- const Kstring *second,
- unsigned int scale,
- const Kstring *third);
-
-/*@}*/
-
-/**
- * @internal
- * @defgroup BLAS_GEN_MISC_FUNCTIONS Miscellaneous functions
- * @ingroup BLAS_GENERATORS
- */
-
-/*@{*/
-
-/**
- * @brief Default function prefix for the data type
- *
- * @param[in] dtype One of the data types supported by the library
- */
-char
-dtypeToBlasPrefix(DataType dtype);
-
-/**
- * @brief Convert kernel extra flags to tilemul flags
- *
- * @param[in] funcID BLAS function ID
- * @param[in] kflags Kernel flags
- */
-TileMulFlags
-kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags);
-
-/**
- * @brief Get vector length elements should be fetched from (stored to)
- * the global memory
- *
- * @param[in] gset Generator settings
- * @param[in] funcID BLAS function ID (deprecated)
- * @param[in] mrole Role of the matrix to get vectorization for
- */
-unsigned int
-getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole);
-
-/**
- * @brief Sprintf chunk (set of components) of an OpenCL vector type
- *
- * @param[out] chunk Buffer to sprintf to
- * @param[in] vecLen Entire vector length
- * @param[in] clen Length of the chunk
- * @param[in] vecOff Starting component offset
- */
-void
-sprintfVecChunk(
- char *chunk,
- unsigned int vecLen,
- unsigned int clen,
- unsigned int vecOff);
-
-/**
- * @brief Generate code containing scaling of leading dimensions on
- * vector size
- *
- * @param[out] ctx Generator context
- * @param[in] gset Generator settings
- *
- * The function first checks whether the scaling is actually needed
- * (presumably it is not when the vector size is 1). If any of the kernel
- * variables for 'lda', 'ldb' or 'ldc' is NULL, the function skips code
- * generation for that dimension.
- * Calling this function has no effect if the @ref BGF_LD_IN_VECTORS generator
- * flag is not set. If some of the leading dimensions are not unique, only
- * one of the instances is scaled. Uniqueness of the dimensions is determined
- * by the values of the respective pointers in @ref KernelVarNames.
- * For example, if the 'lda' and 'ldb' pointers are the same, only 'lda' is scaled.
- */
-void
-genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset);
-
-/*@}*/
-
-/**
- * @internal
- * @brief Generate default post processing logic after tile fetch
- *
- * @param[out] ctx Generator context
- * @param[in] mrole Matrix role
- * @param[out] priv Handler's private data
- *
- * @ingroup TAILS_HANDLING
- */
-int
-defaultTilePostFetch(
- struct KgenContext *ctx,
- MatrixRole mrole,
- void *priv);
-
-void
-getResultGPRsInfo(
- DataType dtype,
- const SubproblemDim *dims,
- unsigned int vecLen,
- unsigned int *nrRegs,
- const char **typeName);
-
-/**
- * @internal
- * @defgroup BLAS_MAJOR_SUBGENS Major subgenerators
- * @ingroup BLAS_GENERATORS
- */
-/*@{*/
-
-/**
- * @internal
- * @brief Tiles fetching and multiplication inlined code generator
- *
- * @param[out] ctx Generator context
- * @param[in] gset Generator settings
- * @param[in] mulOpts TileMul-specific generator settings
- *
- * This function generates code which fetches tiles a and b from global or local
- * memory into private memory, multiplies them storing the result into tile c in
- * private memory and increments coordinate k. The caller is responsible for the
- * loop along K.\n
- * All combinations of tiles a and b orientations are supported. Generated
- * code fetches tiles by vectors which size can be different for tiles a and b.
- * Complex types and conjugated tiles are supported. Global cycling is supported
- * for global memory fetching - this means that if a tile overlaps the matrix,
- * the tail of the tile will be fetched from the beginning instead of accessing
- * memory outside the matrix.\n
- * Second level of subdimensions is used for tiles sizes.\n
- * Generated code will fetch tiles a, b, multiply them and add result to tile c
- * in private memory, then increment k. By default, k is incremented by
- * second level bwidth but it is incremented by first level bwidth if
- * @ref TILEMUL_BW_STRIDE flag is set. It is used if whole work group goes
- * along K loop.\n
- * Each tile can be fetched from global memory or from local memory.
- * If tile is fetched from local memory then leading dimensions for local
- * memory area are taken from first level subdimensions.\n
- * Post-fetch callback generator function can be called after fetching tiles
- * for zeroing tails or setting diagonal elements to one. This function is
- * provided by caller.\n
- * If second level bwidth is not equal to first level bwidth, and
- * @ref TILEMUL_BW_STRIDE flag is not set then TileMul generates
- * loop from zero to first level bwidth with second level bwidth step. The
- * most common case is second level bwidth equal to first level bwidth where
- * single iteration of multiplication is generated.\n
- *
- * If the caller aims for efficient fetching from the global memory and the
- * tilemul logic is generated within a loop, prepareFetchCycle() should be
- * called before generation of the loop.
- *
- * @return 0 on success
- * @return -EOVERFLOW on source buffer overflowing
- * @return -EINVAL if input arguments are invalid
- */
-int
-tileMulGen(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- const TileMulOpts *mulOpts);
-
-/**
- * @internal
- * @brief Tiles pure multiplication code generator
- *
- * @param[out] ctx Generator context
- * @param[in] gset Generator settings
- * @param[in] mulOpts TileMul-specific generator settings
- *
- * This function multiplies two tiles, a and b, storing the result in tile c. No
- * additional operations are performed. It just performs tile multiplication
- * without fetching, post-fetch processing or incrementing coordinates, which can
- * be done by the caller.
- *
- * @return 0 on success
- * @return -EOVERFLOW on source buffer overflowing
- */
-int
-genMulTiles(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- const TileMulOpts *mulOpts);
-
-/**
- * @internal
- * @brief Update result generator
- *
- * @param[out] ctx Generator context
- * @param[in] gset Generator settings
- * @param[in] op Update operation
- * @param[in] flags Update result flags
- * @param[in] funcId BLAS function ID
- * @param[in] uvarNames Variable names for the inline version
- *
- * It generates a function applying an operation to the temporary result
- * stored in the private memory and updating the target result.
- *\n
- * The code can be generated either in the form of a callable function
- * or in the inlined form.
- *\n
- * The list of arguments taken differs depending on the specified flags. In general,
- * these functions are defined as: \n
- * @code
- * void
- * funcName(
- * <input type> C,
- * <output type> *c,
- * <input type> alpha,
- * size_t startRow,
- * size_t startCol,
- * size_t ld
- * [,<input type> beta]
- * [,size_t nrRows]
- * [,size_t nrCols])
- * @endcode
- *
- * @return 0 on success, -EOVERFLOW on source buffer overflow.
- */
-int
-updateResultGen(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- BlasFunctionID funcId,
- UpdateResultOp op,
- UpdateResultFlags flags,
- const UpresVarNames *uvarNames);
-
-/**
- * @internal
- * @brief Produce a code updating a single result element
- *
- * @param[out] ctx Generator context
- * @param[in] dst Destination element expression
- * @param[in] src Source element expression
- * @param[in] gset Generator settings
- * @param[in] op Update operation
- * @param[in] flags Flags showing specifics of the code needed to be
- * generated
- *
- * @return 0 on success, -EOVERFLOW if the source buffer is exceeded.
- */
-int
-genUpdateResultSingle(
- struct KgenContext *ctx,
- const char *dst,
- const char *src,
- const BlasGenSettings *gset,
- UpdateResultOp op,
- UpdateResultFlags flags);
-
-/*@}*/
-
-TailFetch
-checkForTailFetches(
- BlasFunctionID funcID,
- const SubproblemDim *dim,
- const CLBLASKernExtra *kextra,
- MatrixRole mrole,
- bool distVect,
- bool lowerTails);
-
-bool
-isNeedZeroTileTail(
- BlasFunctionID funcID,
- const SubproblemDim *dim,
- const CLBLASKernExtra *kextra,
- MatrixRole mrole,
- bool distVect);
-
-/**
- * @internal
- * @brief Generate tail coordinates adjustment if needed
- *
- * @param[out] ctx Generator context
- * @param[in] funcID BLAS function ID
- * @param[in] gset Generator settings
- * @param[out] *error Location to store error.
- * Ignored if NULL.
- *
- * Adjust coordinates if work is distributed over matrix rows so that
- * a tile does not exceed the matrix bound. Cyclic addressing is not
- * applicable for that since skew over rows can be used for performance goals.
- *
- * If it's needed, issues an expression like
- *
- * if (coord.y + dy > M) {
- * coord.y -= dy - M % dy;
- * }
- *
- * Returns a status showing whether the tails have actually been adjusted.
- * If \b ctx is NULL the function doesn't try to generate code, but just
- * returns the actual tail handling status
- *
- * @ingroup TAILS_HANDLING
- */
-TailStatus
-checkGenAdjustTailCoords(
- struct KgenContext *ctx,
- BlasFunctionID funcID,
- const BlasGenSettings *gset,
- int *error);
-
-/**
- * @internal
- * @brief Generate restoring original coordinates if needed
- *
- * @param[out] ctx Generator context
- * @param[in] gset Generator settings
- * @param[in] status Tails handling status
- *
- * Restoring the coordinates is needed to be able to write the result back to
- * the correct location.
- *
- * If it's needed, issues an expression like
- *
- * if (coord.y + dy == M) {
- * coord.y += dy - M % dy;
- * }
- *
- * @ingroup TAILS_HANDLING
- */
-int
-checkGenRestoreTailCoords(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- TailStatus status);
-
-/**
- * @internal
- * @brief Convert tail handling status to the respective flags
- * of the update result generator
- *
- * @param[in] status Status of the handling to convert to
- * the update result flags
- *
- * @ingroup TAILS_HANDLING
- */
-UpdateResultFlags
-tailStatusToUpresFlags(TailStatus status);
-
-
-
-#endif /* BLAS_KGEN_H_ */