path: root/external/clBLAS/src/library/blas/gens/fetch.c
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/fetch.c')
-rw-r--r--  external/clBLAS/src/library/blas/gens/fetch.c | 2190
1 files changed, 0 insertions, 2190 deletions
diff --git a/external/clBLAS/src/library/blas/gens/fetch.c b/external/clBLAS/src/library/blas/gens/fetch.c
deleted file mode 100644
index 34d62f17..00000000
--- a/external/clBLAS/src/library/blas/gens/fetch.c
+++ /dev/null
@@ -1,2190 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * COMMON DESCRIPTION:
- *
- * This module implements generation of fetches from memory to registers.
- * It supports various optimization strategies depending on the addressing
- * modes used, the size of tiles, etc. Such a strategy is provided by an
- * object named an addressing agent.
- *
- * The module supports explicit statement reordering so as to group together
- * scattered ALU and FETCH statements. The reordering is implemented by means
- * of the statement batch. Scheme of priority assignment for statements put
- * to the batch within the same call:
- * - Statements declaring and initializing variables have the highest
- * priority because all the subsequent ones depend on them.
- * - Fetch statements have decreased priority if any preparative
- * statements have actually been generated.
- * - Statements updating variables have an even lower priority.
- * - If a variable updating statement has been generated before the full
- * tile fetch is complete, the priority for the next fetch statement is
- * decreased so as not to break statement dependencies.
- */
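-
-/*
- * Illustrative sketch (not taken from the original sources): with fetch
- * merging enabled, a single generation call may, for example, end up with
- * statements at the following priorities
- *
- *     0: const uint ay = ...;        // variable preparation
- *     1: a0 = A[...]; a1 = A[...];   // first group of fetches
- *     2: ay += ...;                  // variable update
- *     3: a2 = A[...]; a3 = A[...];   // fetches depending on the update
- *
- * so that flushing the batch emits the statements grouped by priority even
- * if they were added interleaved with other statements.
- */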
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-
-#include <clblas_stddef.h>
-#include <solution_seq.h>
-#include <trace_malloc.h>
-
-#include "blas_kgen.h"
-
-#define MAX_LENGTH 4096
-#define BITS_INT (sizeof(int) * 8)
-
-struct FetchContext;
-
-enum {
- MAX_AUXILIARY_VARNUM = 32,
- MAX_ADDR_AGENTS = 8,
- ADDR_AGENT_PRIVATE_SIZE = 64,
- /*
- * buffer size large enough to fit a declaration of a vectorized coordinate,
- * expressions for all components, operators for building a correct syntax
- * construction, and blanks between 2 adjacent component initializers
- */
- COORD_BUFSIZE = (MAX_OPENCL_VECTOR_LENGTH + 1) * (sizeof(Kstring) + 2) + 16,
- /*
- * Priority of all statements declaring and initializing variables
- */
- PREPARE_VARS_STMT_PRIORITY = 0,
- GENERIC_OPT_LEVELS = FOPTLEV_PREFETCH |
- FOPTLEV_CAN_SHARE_TMP_AB |
- FOPTLEV_MERGE_FETCHES
-};
-
-/*
- * Agent for some addressing scheme. Encapsulates creation and updating
- * of auxiliary variables and building of offset expressions
- */
-typedef struct AddrAgent {
- Kstring vars[MAX_AUXILIARY_VARNUM];
- // usage counters for A and B
- int usageCount[2];
- // loop preparation counters for A and B
- int loopPrepCount[2];
- char priv[ADDR_AGENT_PRIVATE_SIZE];
-
- bool (*match)(const struct FetchContext*);
- /*
- * Generate code preparing needed variables. Must return 1 if some
- * variables have actually been prepared, 0 otherwise
- */
- int (*prepareVars)(struct FetchContext*);
- /*
- * Generate code updating variables. Must return 1 if some variables
- * have actually been updated, 0 otherwise.
- * 'stmtPriority' is the priority that a statement the agent is going
- * to add to the batch must have
- */
- int (*updateVars)(struct FetchContext*, unsigned int nextLine,
- unsigned int nextVec, int stmtPriority);
- void (*sprintfAddrOffset)(Kstring*, struct FetchContext*,
- unsigned int line, unsigned int vec);
-} AddressingAgent;
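-
-/*
- * Typical call sequence for an agent during one tile fetch, as driven by
- * genFetchCommon()/doGenFetch() below (a summary, not an additional API):
- *     match()             - checks whether the agent suits the current
- *                           context and optimization level;
- *     prepareVars()       - emits declarations of auxiliary variables into
- *                           the batch at PREPARE_VARS_STMT_PRIORITY;
- *     sprintfAddrOffset() - builds the offset expression for every fetched
- *                           vector of the tile;
- *     updateVars()        - emits statements advancing the variables between
- *                           groups of fetches.
- */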
-
-// Properties of the current offset evaluation operation.
-struct OffsetEvalProps {
- // global size K is in vectors
- bool gkInVect;
- // all coordinates are in vectors
- bool coordInVect;
- /*
- * don't multiply the coordinate in the second physical dimension
- * by the leading dimension; it is already done
- */
- bool ldNotMul;
- /*
- * Vector length of the linear component in the leading dimension.
- * The number of linear coordinates in the leading dimension taken
- * by an addressing agent at a time during offset evaluation must be
- * equal to this number.
- */
- unsigned int leadVecLen;
-};
-
-typedef struct FetchContext {
- // addressing mode that should be used in fetch operations
- FetchAddrMode addrMode;
- // optimization levels of code generation
- FetchOptLevel optLevels;
- AddressingAgent agents[MAX_ADDR_AGENTS];
- AddressingAgent *currAgent;
- AddressingAgent *prevAgent;
- const BlasGenSettings *gset;
- const FetchOpts *fopts;
- // statement batch used at the current generation
- struct StatementBatch *batch;
- // Respective physical tile in global memory
- Tile physTile;
- // physical dimension passed in the outer loop
- int outerDim;
- struct OffsetEvalProps oevp;
- bool isLoopPreparation;
- // markers of context validity for matrix A and B
- bool valid[2];
-} FetchContext;
-
-struct PhysOffsetComponents {
- Kstring base;
- Kstring offset;
- Kstring bound;
-};
-
-/*
- * Raw leading dimension. This is a pair of a leading dimension
- * expressed in number of elements and a value by which it
- * should be scaled for correct addressing.
- * A scale of '0' means that the value in elements matches the
- * value in vectors
- */
-struct RawLD {
- Kstring str;
- unsigned int scale;
-};
-
-static const char *vectComponents = "0123456789abcdef";
-
-static void sprintfOffsetStateless(Kstring *expr, FetchContext *fctx,
- unsigned int line, unsigned int vec);
-
-static void initStatelessAgent(AddressingAgent *agent);
-static void initTmpCoordAgent(AddressingAgent *agent);
-static void initPersCoordAgent(AddressingAgent *agent);
-
-void (*initAgentsTable[])(AddressingAgent *agent) = {
- initStatelessAgent,
- initTmpCoordAgent,
- initPersCoordAgent,
- NULL
-};
-
-static __inline bool
-isOne(const Kstring *kstr)
-{
- return (kstr->buf[0] == '1') && (kstr->buf[1] == '\0');
-}
-
-static __inline bool
-isZero(const Kstring *kstr)
-{
- return (kstr->buf[0] == '0') && (kstr->buf[1] == '\0');
-}
-
-static __inline bool
-isLocalMemoryUsed(const FetchOpts *fopts)
-{
- return ((fopts->mrole == MATRIX_A) &&
- (fopts->memA == CLMEM_LOCAL_MEMORY)) ||
- ((fopts->mrole == MATRIX_B) &&
- (fopts->memB == CLMEM_LOCAL_MEMORY));
-}
-
-static __inline unsigned int
-tileVecColsNum(const Tile *physTile)
-{
- return physTile->nrCols / physTile->vecLen;
-}
-
-static __inline bool
-canBeFetchesMerged(const FetchContext *fctx)
-{
- return (fctx->optLevels & FOPTLEV_MERGE_FETCHES) != 0;
-}
-
-/*
- * Returns whether the linear offsets along the dimension K
- * can be shared for tiles A and B
- */
-static bool
-canBeKoffShared(const FetchContext *fctx)
-{
- unsigned int vlenA, vlenB;
- bool canShare;
-
- vlenA = getVecLen(fctx->gset, CLBLAS_GEMM, MATRIX_A);
- vlenB = getVecLen(fctx->gset, CLBLAS_GEMM, MATRIX_B);
-
- canShare = !fctx->gset->tileA.trans && fctx->gset->tileBX.trans &&
- (vlenA == vlenB);
- canShare = canShare &&
- (fctx->currAgent == fctx->prevAgent) &&
- ((fctx->optLevels & FOPTLEV_CAN_SHARE_TMP_AB) != 0);
-
- return canShare;
-}
-
-static __inline
-const Tile* getDstTile(const FetchContext *fctx)
-{
- return (fctx->fopts->mrole == MATRIX_A) ? &fctx->gset->tileA :
- &fctx->gset->tileBX;
-}
-
-static __inline bool
-isFetchContextValid(const FetchContext *fctx)
-{
- int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1;
-
- return fctx->valid[i];
-}
-
-static __inline void
-invalidateFetchContext(FetchContext *fctx)
-{
- int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1;
-
- fctx->valid[i] = false;
-}
-
-static __inline int
-agentUsageCount(const FetchContext *fctx)
-{
- int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1;
-
- return fctx->currAgent->usageCount[i];
-}
-
-static __inline void
-incAgentUsageCount(FetchContext *fctx)
-{
- int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1;
-
- fctx->currAgent->usageCount[i]++;
-}
-
-static __inline int
-agentLoopPrepCount(const FetchContext *fctx)
-{
- int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1;
-
- return fctx->currAgent->loopPrepCount[i];
-}
-
-static __inline void
-incAgentLoopPrepCount(FetchContext *fctx)
-{
- int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1;
-
- fctx->currAgent->loopPrepCount[i]++;
-}
-
-static int
-bwidthPhysDimension(const FetchContext *fctx)
-{
- int dim;
- const Tile *tile;
-
- tile = getDstTile(fctx);
- if (fctx->fopts->mrole == MATRIX_A) {
- dim = (tile->trans) ? 1 : 0;
- }
- else {
- dim = (tile->trans) ? 0 : 1;
- }
-
- return dim;
-}
-
-static FetchAddrMode
-fetchAddrModeFromMulOpts(const TileMulOpts *mulOpts)
-{
- FetchAddrMode mode = FETCH_ADDR_NORMAL;
- TileMulFlags mflags = mulOpts->flags;
-
- if (mflags & (TILEMUL_SKEW_A | TILEMUL_GLOBAL_CYCLIC_A)) {
- mode |= FETCH_ADDR_A_CYCLICAL;
- }
- if (mflags & (TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_B)) {
- mode |= FETCH_ADDR_B_CYCLICAL;
- }
- if (mflags & (TILEMUL_SKEW_K | TILEMUL_GLOBAL_CYCLIC_K)) {
- mode |= FETCH_ADDR_K_CYCLICAL;
- }
- if (mflags & TILEMUL_WRAP_AROUND_TAIL) {
- mode |= FETCH_ADDR_TAILK_PADD;
- }
-
- return mode;
-}
-
-static void
-sprintfVectorComponent(
- Kstring *kstr,
- const char *baseName,
- unsigned int n,
- unsigned int maxn)
-{
- assert(n < maxn);
- if (maxn == 1) {
- kstrcpy(kstr, baseName);
- }
- else {
- ksprintf(kstr, "%s.s%c", baseName, vectComponents[n]);
- }
-}
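-
-/*
- * For example (illustrative values), sprintfVectorComponent() with
- * baseName = "vca", n = 2, maxn = 4 yields "vca.s2", while with maxn = 1
- * it yields just "vca".
- */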
-
-/*
- * sprintf the base coordinate and scale it in accordance with the
- * used mode and vector length so that it is expressed in vectors
- */
-static void
-sprintfNormalizedBaseCoord(
- Kstring *kstr,
- const char *name,
- int physDim,
- FetchContext *fctx)
-{
- int shift = findHighestSetBit(fctx->physTile.vecLen);
-
- if (physDim || fctx->oevp.coordInVect || (shift == 0)) {
- kstrcpy(kstr, name);
- }
- else {
- ksprintf(kstr, "(uint)(%s >> %d)", name, shift);
- }
-}
-
-static void
-sprintfOffsetVector(Kstring *kstr, unsigned int base, unsigned int len)
-{
- if (len == 1) {
- ksprintf(kstr, "%u", base);
- }
- else {
- unsigned int i;
-
- ksprintf(kstr, "(uint%u)(%u", len, base);
- for (i = 1; i < len; i++) {
- kstrcatf(kstr, ", %u", base + i);
- }
- kstrcatf(kstr, "%c", ')');
- }
-}
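-
-/*
- * For example (illustrative values), sprintfOffsetVector() with base = 2
- * and len = 4 produces "(uint4)(2, 3, 4, 5)", and with len = 1 it
- * produces just "2".
- */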
-
-static void
-sprintfLinearOffset(
- Kstring *expr,
- const struct PhysOffsetComponents *comp,
- bool swapBaseOff)
-{
- int cnt = 0;
- const Kstring *kstr = NULL;
- bool isBounded;
-
- expr->buf[0] = '\0';
- if (!isKstringEmpty(&comp->base) && !isZero(&comp->base)) {
- cnt++;
- kstr = &comp->base;
- }
- if (!isKstringEmpty(&comp->offset) && !isZero(&comp->offset)) {
- cnt++;
- kstr = &comp->offset;
- }
-
- if (cnt == 0) {
- return;
- }
-
- isBounded = !isKstringEmpty(&comp->bound);
- if (cnt == 2) {
- const Kstring *first = (swapBaseOff) ? &comp->offset : &comp->base;
- const Kstring *second = (swapBaseOff) ? &comp->base : &comp->offset;
-
- if (isBounded) {
- ksprintf(expr, "(%s + %s) %% %s",
- first->buf, second->buf, comp->bound.buf);
- }
- else {
- ksprintf(expr, "%s + %s", first->buf, second->buf);
- }
- }
- else {
- if (isBounded) {
- ksprintf(expr, "%s %% %s", kstr->buf, &comp->bound.buf);
- }
- else {
- kstrcpy(expr, kstr->buf);
- }
- }
-}
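-
-/*
- * For example (illustrative component values), with base = "coordA",
- * offset = "2" and bound = "M" the resulting expression is
- * "(coordA + 2) % M"; with an empty bound it degrades to "coordA + 2",
- * and with swapBaseOff set the summands are swapped.
- */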
-
-/*
- * Estimate if address offset evaluation will be cheap even without any savings.
- * If kxy is 0, then predict it for the coordinates along the dimension K,
- * otherwise do it for the coordinates along rows of A or columns of B.
- */
-static bool
-estimateOffsetEvalCheap(const FetchContext *fctx, int kxy)
-{
- int kdim;
- unsigned int n;
- const Tile *physTile;
- FetchAddrMode relFlag, cycFlag;
- bool needNorm;
-
- /*
- * Criteria:
- * Evaluation is cheap if addressing is relative or number of
- * elements in this dimension doesn't exceed 2 and no transform
- * to vectors (normalization) or cycling is needed.
- */
-
- kdim = bwidthPhysDimension(fctx);
- physTile = &fctx->physTile;
- needNorm = (physTile->vecLen > 1);
- if (!kxy) {
- n = (kdim) ? physTile->nrRows : tileVecColsNum(physTile);
- relFlag = FETCH_ADDR_K_RELATIVE;
- cycFlag = FETCH_ADDR_K_CYCLICAL;
- needNorm = needNorm && !kdim;
- }
- else {
- MatrixRole mrole = fctx->fopts->mrole;
-
- n = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows;
- relFlag = (mrole == MATRIX_A) ? FETCH_ADDR_A_RELATIVE :
- FETCH_ADDR_B_RELATIVE;
- cycFlag = (mrole == MATRIX_A) ? FETCH_ADDR_A_CYCLICAL :
- FETCH_ADDR_B_CYCLICAL;
- needNorm = needNorm && kdim;
- }
-
- return ( (fctx->addrMode & relFlag) ||
- ((n <= 2) && !(needNorm || (fctx->addrMode & cycFlag))) );
-}
-
-/*
- * Predict whether register consumption will be high if the
- * generator requests space for 'nrCoords' coordinates.
- * The 'isPers' argument shows if these are persistent
- * coordinates or not.
- * The 'isSummary' argument shows if this is the summary number
- * of coordinates for both tiles or only for one of
- * them.
- */
-static bool
-predictHighRegConsumption(
- const FetchContext *fctx,
- unsigned int nrCoords,
- bool isPers,
- bool isSummary)
-{
- unsigned int max;
-
- DUMMY_ARG_USAGE(fctx);
-
- // TODO: take into account number of registers consumed by the tiles
- max = (isPers) ? 12 : 16;
- if (isSummary) {
- max *= 2;
- }
-
- return !(nrCoords < max);
-}
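-
-/*
- * For example, with the thresholds above, requesting 32 or more temporary
- * coordinates for both tiles together (isSummary true, isPers false) is
- * predicted to cause high register consumption.
- */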
-
-static void
-sprintfLeadingDimension(Kstring *ld, const FetchContext *fctx)
-{
- bool done = false;
- const char *varName;
-
- varName = (fctx->fopts->mrole == MATRIX_A) ?
- fctx->gset->varNames.lda : fctx->gset->varNames.ldb;
-
- if (!(fctx->gset->flags & BGF_LD_IN_VECTORS)) {
- int shift;
-
- shift = findHighestSetBit(fctx->physTile.vecLen);
- if (shift != 0) {
- ksprintf(ld, "(uint)(%s >> %d)", varName, shift);
- done = true;
- }
- }
-
- if (!done) {
- kstrcpy(ld, varName);
- }
-}
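-
-/*
- * For example (assuming the variable is named "lda" and is expressed in
- * elements), with a vector length of 4 this produces "(uint)(lda >> 2)";
- * if BGF_LD_IN_VECTORS is set or the vector length is 1, the variable
- * name is used as is.
- */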
-
-/*
- * fill raw leading dimension
- */
-static void
-fillRawLD(
- struct RawLD *ld,
- const FetchContext *fctx)
-{
- const char *varName;
-
- varName = (fctx->fopts->mrole == MATRIX_A) ?
- fctx->gset->varNames.lda : fctx->gset->varNames.ldb;
-
- kstrcpy(&ld->str, varName);
-
- ld->scale = (fctx->gset->flags & BGF_LD_IN_VECTORS) ?
- 0 : fctx->physTile.vecLen;
-}
-
-/*
- * Sprintf the bound for the K component in case the matrix is stored
- * in global memory
- */
-static void
-sprintfGboundK(Kstring *kstr, const FetchContext *fctx)
-{
- int dim;
- const char *varK = fctx->gset->varNames.sizeK;
- unsigned int vecLen;
- int shift;
-
- vecLen = fctx->physTile.vecLen;
- shift = findHighestSetBit(vecLen);
- dim = bwidthPhysDimension(fctx);
- if (dim || fctx->oevp.gkInVect || (shift == 0)) {
- kstrcpy(kstr, varK);
- }
- else {
- if (fctx->addrMode & FETCH_ADDR_TAILK_PADD) {
- ksprintf(kstr, "(uint)((%s + %u) >> %d)", varK, vecLen - 1, shift);
- }
- else {
- ksprintf(kstr, "(uint)(%s >> %d)", varK, shift);
- }
- }
-}
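-
-/*
- * For example (assuming sizeK is named "K" and the vector length is 4),
- * this produces "(uint)(K >> 2)", or "(uint)((K + 3) >> 2)" when
- * FETCH_ADDR_TAILK_PADD is set so that a partial trailing vector is
- * counted as well; when no normalization is needed, plain "K" is used.
- */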
-
-static void
-selectAddrAgent(FetchContext *fctx)
-{
- unsigned int level;
- FetchOptLevel origLevels;
- FetchOptLevel prefLev, mergeLev;
- int i;
- bool last = false;
-
- prefLev = fctx->optLevels & FOPTLEV_PREFETCH;
- /*
- * The merge level doesn't affect addressing agents in any way.
- * So, clear it temporarily so that they don't even know whether it
- * is used or not.
- */
- mergeLev = fctx->optLevels & FOPTLEV_MERGE_FETCHES;
- origLevels = fctx->optLevels & ~FOPTLEV_MERGE_FETCHES;
- fctx->currAgent = NULL;
-
- /*
- * Selection criterion: pick any agent supporting an optimization level
- * as high as possible which is suitable for these generator settings.
- */
- for (level = 1 << (sizeof(int) * 8 - 1);
- !last && (fctx->currAgent == NULL); level >>= 1) {
-
- last = (level == 0);
- if (!(last || (origLevels & level))) {
- continue;
- }
-
- fctx->optLevels = (FetchOptLevel)level | prefLev;
-
- for (i = 0; i < MAX_ADDR_AGENTS; i++) {
- fctx->currAgent = &fctx->agents[i];
- if (fctx->currAgent->match == NULL) {
- fctx->currAgent = NULL;
- break;
- }
- if (fctx->currAgent->match(fctx)) {
- break;
- }
- fctx->currAgent = NULL;
- }
- }
-
- fctx->optLevels = origLevels | mergeLev;
-
- assert(fctx->currAgent != NULL);
-}
-
-static unsigned int
-persVarDepthK(const FetchContext *fctx, unsigned int maxVarVecLen)
-{
- unsigned int depth = 0;
- unsigned int maxDepth;
- int kdim;
- unsigned int vlen = 0;
- const Tile *physTile = &fctx->physTile;
-
- kdim = bwidthPhysDimension(fctx);
- vlen = tileVectorsNum(physTile);
- vlen = umin(vlen, maxVarVecLen);
-
- if (kdim) {
- depth = vlen / tileVecColsNum(physTile);
- maxDepth = physTile->nrRows;
- }
- else {
- depth = vlen / physTile->nrRows;
- maxDepth = tileVecColsNum(physTile);
- }
-
- /*
- * If the dimension K is traversed in the inner loop, and
- * not all coordinates can be saved, then using persistent
- * coordinates is prohibited because there is no chance to
- * update the vectorized coordinate until the end of the whole
- * tile fetch.
- */
- if ((fctx->outerDim != kdim) && (depth < maxDepth)) {
- depth = 0;
- }
-
- return depth;
-}
-
-static void
-genInitVectCoord(
- FetchContext *fctx,
- const Kstring *name,
- unsigned int lenXY,
- unsigned int depthK,
- bool decl,
- bool isConst)
-{
- const Tile *physTile;
- char buf[COORD_BUFSIZE];
- char *p = NULL;
- unsigned int i, k, lenFull;
- int kdim;
- const char *declPref;
- bool needVect;
- Kstring aoff;
- unsigned int vlen;
- Kstring coordType;
-
- kdim = bwidthPhysDimension(fctx);
- physTile = &fctx->physTile;
- lenFull = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows;
-
- /*
- * Use vectorization at offset evaluation, if it makes sense, to
- * avoid extra casting of a coordinate in vectors to a coordinate in elements
- */
- needVect = decl &&
- ( (!kdim && (depthK > 1) && (lenXY == 1)) ||
- (kdim && (depthK == 1) && (lenXY > 1)) );
- vlen = lenXY * depthK;
-
- // coordinate declarator
- declPref = (isConst) ? "const " : "";
- if (decl) {
- if (vlen == 1) {
- ksprintf(&coordType, "%suint", declPref);
- }
- else {
- ksprintf(&coordType, "%suint%u", declPref, vlen);
- }
- }
-
- // declaration + initialization
- if (needVect || (decl && (vlen == 1))) {
- if (needVect) {
- fctx->oevp.leadVecLen = vlen;
- }
- sprintfOffsetStateless(&aoff, fctx, 0, 0);
- kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY,
- "%s %s = %s;\n",
- coordType.buf, name->buf, aoff.buf);
- fctx->oevp.leadVecLen = 1;
- }
- else {
- unsigned int n = 0;
-
- if (decl) {
- p = buf + sprintf(buf, "%suint%u %s = {",
- declPref, vlen, name->buf);
- }
-
- for (k = 0; k < depthK; k++) {
- for (i = 0; i < lenXY; i++) {
- unsigned int line, vec;
-
- line = (kdim) ? k : i;
- vec = (kdim) ? i : k;
- sprintfOffsetStateless(&aoff, fctx, line, vec);
- if (decl) {
- const char *pref = (n % 3) ? ", " : "";
-
- p += sprintf(p, "%s%s", pref, aoff.buf);
- // split long lines
- n++;
- if (!(n % 3) && (n != vlen)) {
- p += sprintf(p, "%s", ",\n\t\t");
- }
- }
- else {
- kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY,
- "%s.s%c = %s;\n",
- name->buf, vectComponents[k * lenFull + i],
- aoff.buf);
- }
- }
- }
-
- if (decl) {
- strcpy(p, "};\n");
- assert(p + 4 < buf + COORD_BUFSIZE);
- kgenAddStmtToBatch(fctx->batch, PREPARE_VARS_STMT_PRIORITY, buf);
- }
- }
-}
-
-
-/**************** Implement different addressing agents *********************/
-
-/********* Stateless (without precomputing) memory addressing agent *********/
-
-static bool
-matchStateless(const FetchContext *fctx)
-{
- return !(fctx->optLevels & ~GENERIC_OPT_LEVELS);
-}
-
-static void
-sprintfOffsetStateless(
- Kstring *expr,
- FetchContext *fctx,
- unsigned int line,
- unsigned int vec)
-{
- FetchAddrMode addrMode = fctx->addrMode;
- bool isRel; // shows if addressing is relative
- const Tile *physTile;
- bool useLocal;
- int kdim;
- unsigned int i, u;
- struct PhysOffsetComponents comps;
- Kstring leadStr, secStr;
- struct RawLD leadDim;
- bool vectLead;
- bool swap;
- Kstring *kstr;
- const KernelVarNames *kvars = &fctx->gset->varNames;
- unsigned int vecLen;
- unsigned int offVlen;
- const char *p;
- FetchAddrMode amask;
- MatrixRole mrole = fctx->fopts->mrole;
- const SubproblemDim *subdim = fctx->gset->subdims;
-
- emptyKstring(&secStr);
- emptyKstring(&leadStr);
-
- offVlen = fctx->oevp.leadVecLen;
- vectLead = (offVlen > 1);
- physTile = &fctx->physTile;
- vecLen = physTile->vecLen;
-
- kdim = bwidthPhysDimension(fctx);
- useLocal = isLocalMemoryUsed(fctx->fopts);
-
- // fill components relating to X or Y
- memset(&comps, 0, sizeof(comps));
- amask = (mrole == MATRIX_A) ? FETCH_ADDR_A_RELATIVE :
- FETCH_ADDR_B_RELATIVE;
- isRel = ((addrMode & amask) != 0);
-
- // base
- if (!isRel) {
- p = (mrole == MATRIX_A) ? kvars->coordA : kvars->coordB;
- sprintfNormalizedBaseCoord(&comps.base, p, 1 - kdim, fctx);
- }
- // offset
- u = (kdim) ? vec : line;
- i = (kdim) ? offVlen : 1;
- if (u || i) {
- sprintfOffsetVector(&comps.offset, u, i);
- }
- // bound
- amask = (mrole == MATRIX_A) ? FETCH_ADDR_A_CYCLICAL :
- FETCH_ADDR_B_CYCLICAL;
- if (addrMode & amask) {
- if (useLocal || isRel) {
- u = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows;
- ksprintf(&comps.bound, "%u", u);
- }
- else {
- // global bound
- if (kdim) {
- /*
- * For the X and Y dimensions the only task is to prevent
- * exceeding buffer bounds. Using the leading dimension for
- * this is the easiest way.
- */
- sprintfLeadingDimension(&comps.bound, fctx);
- }
- else {
- const char *var = (fctx->fopts->mrole == MATRIX_A) ?
- fctx->gset->varNames.sizeM : fctx->gset->varNames.sizeN;
-
- kstrcpy(&comps.bound, var);
- }
- }
- }
-
- kstr = (kdim) ? &leadStr : &secStr;
- swap = kdim && vectLead;
- sprintfLinearOffset(kstr, &comps, swap);
-
-
- // fill components relating to bwidth
- memset(&comps, 0, sizeof(comps));
- isRel = ((addrMode & FETCH_ADDR_K_RELATIVE) != 0);
-
- // base
- if (!isRel) {
- sprintfNormalizedBaseCoord(&comps.base, kvars->k, kdim, fctx);
- }
- // offset
- u = (kdim) ? line : vec;
- i = (kdim) ? 1 : offVlen;
- if (u || i) {
- sprintfOffsetVector(&comps.offset, u, i);
- }
- // bound
- if (addrMode & (FETCH_ADDR_K_CYCLICAL)) {
- if (useLocal || isRel) {
- if (useLocal) {
- u = (unsigned int)subdim->bwidth;
- }
- else {
- u = (kdim) ? physTile->nrRows : tileVecColsNum(physTile);
- }
- ksprintf(&comps.bound, "%u", u);
- }
- else {
- sprintfGboundK(&comps.bound, fctx);
- }
- }
-
- kstr = (kdim) ? &secStr : &leadStr;
- swap = !kdim && vectLead;
- sprintfLinearOffset(kstr, &comps, swap);
-
- if (fctx->oevp.ldNotMul) {
- kstrcpy(&leadDim.str, "1");
- leadDim.scale = 0;
- }
- else if (useLocal) {
- leadDim.scale = 0;
- if (kdim) {
- u = (unsigned int)((mrole == MATRIX_A) ? subdim->y : subdim->x);
- }
- else {
- u = (unsigned int)subdim->bwidth;
- }
- ksprintf(&leadDim.str, "%u", u / vecLen);
- }
- else {
- fillRawLD(&leadDim, fctx);
- }
-
- // Build the full expression
- if (!isKstringEmpty(&leadStr) && vectLead) {
- Kstring tmp;
-
- sprintfFastScalarMad(&tmp, &secStr, &leadDim.str,
- leadDim.scale, NULL);
- if (isZero(&tmp)) {
- kstrcpy(expr, leadStr.buf);
- }
- else {
- ksprintf(expr, "%s + %s", leadStr.buf, tmp.buf);
- }
- }
- else {
- sprintfFastScalarMad(expr, &secStr, &leadDim.str,
- leadDim.scale, &leadStr);
- }
-}
-
-static void
-initStatelessAgent(AddressingAgent *agent)
-{
- memset(agent, 0, sizeof(AddressingAgent));
- agent->match = matchStateless;
- agent->sprintfAddrOffset = sprintfOffsetStateless;
-}
-
-/************* Addressing agent using temporary coordinates ****************/
-
-/*
- * Common approach:
- *
- * Save base offsets along both physical dimensions so that only one
- * add operation is needed per each further offset evaluation.
- * Prediction of high register consumption is used to decide how many
- * offsets for each dimension can be saved.
- * Two attempts are made. On the first one, allocation of the maximal
- * number of offsets is tried. This number is equal to the number of tile
- * lines or vectors in a line, respectively. If this number is predicted
- * to cause high register consumption, then only one offset is allocated.
- * If the situation repeats, then the offsets in this dimension are not
- * saved at all.
- *
- * The next point is that only those offsets are precomputed that are
- * estimated to take a lot of computing resources to evaluate.
- *
- * In case of cyclical mode in the dimension K, the global size K in
- * vectors is saved as well.
- *
- * Offsets for A and B along the dimension K are shared if the caller
- * advises to do that and their number is the same for A and B.
- */
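-
-/*
- * A hypothetical example of what this agent strives for (the generated
- * names below are only meant as an illustration): with a precomputed
- * vectorized coordinate "ay" holding offsets of A along rows and "ak"
- * holding offsets of A along K, an individual fetch offset reduces to a
- * single addition, e.g.
- *
- *     a = A[ay.s1 + ak.s0];
- *
- * instead of re-evaluating an expression involving coordA, k and the
- * leading dimension for every fetched vector.
- */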
-
-enum {
- TMP_COORD_AY,
- TMP_COORD_AK,
- TMP_A_VSIZEK,
- TMP_COORD_BX,
- TMP_COORD_BK,
- TMP_B_VSIZEK
-};
-
-/*
- * The structure stores the lengths of the vectorized temporary variables
- * storing offsets for matrices A and B along rows/columns and the dimension K.
- */
-typedef struct TmpCoordInfo {
- // vector length of the offset coordinate of A along rows
- unsigned int yaVlen;
- // vector length of the offset coordinate of A along the dimension K
- unsigned int kaVlen;
- // vector length of the offset coordinate of B along columns
- unsigned int xbVlen;
- // vector length of the offset coordinate of B along the dimension K
- unsigned int kbVlen;
- /*
- * shows if the respective coordinates are
- * declared as constants or not
- */
- bool yaIsConst;
- bool kaIsConst;
- bool xbIsConst;
- bool kbIsConst;
-
- // force relative addressing along K for the matrix A
- bool forceRelA;
- // force relative addressing along K for the matrix B
- bool forceRelB;
-} MAY_ALIAS TmpCoordInfo;
-
-static unsigned int
-selectTmpCoordsNum(
- const FetchContext *fctx,
- unsigned int currNum,
- unsigned int reqNum,
- bool canShare)
-{
- if (predictHighRegConsumption(fctx, currNum + reqNum,
- false, canShare)) {
- if (predictHighRegConsumption(fctx, currNum + 1,
- false, canShare)) {
- reqNum = 0;
- }
- else {
- reqNum = 1;
- }
- }
-
- return reqNum;
-}
-
-/*
- * check if such a number of temporary coordinates makes any sense,
- * i.e. will eventually lead to more efficient evaluation
- */
-static bool
-tmpNumSanityCheck(
- unsigned int num,
- bool isConst,
- int kxy,
- bool isLoopPrep,
- const FetchContext *fctx)
-{
- unsigned int maxCoords[2];
- int dim;
- bool ret = true;
- const Tile *physTile = &fctx->physTile;
-
- maxCoords[0] = tileVecColsNum(physTile);
- maxCoords[1] = physTile->nrRows;
- dim = bwidthPhysDimension(fctx);
- if (kxy) {
- dim = 1 - dim;
- }
-
- /*
- * Consider it not reasonable if the value is not constant and is used
- * only a few times. The same holds for constant values along X and Y
- * if they are prepared within a loop rather than in advance,
- * because the compiler is not able to recognize that those values don't
- * need to be re-evaluated at each loop iteration. It is also not
- * reasonable if only one constant value is precomputed which doesn't
- * actually simplify evaluating linear coordinates in the same dimension:
- * consider it so if there is no vectorization at fetching, or addressing
- * is cyclical, or this is a coordinate mapped to the second physical
- * dimension (because that nevertheless assumes multiplication by the
- * leading dimension)
- */
-
- if (!isConst) {
- ret = (maxCoords[1 - dim] > 2);
- }
- else {
- FetchAddrMode cycMode;
- bool isCycled;
-
- if (!kxy) {
- cycMode = FETCH_ADDR_K_CYCLICAL;
- }
- else {
- ret = (isLoopPrep || (maxCoords[1 - dim] > 1));
- cycMode = (fctx->fopts->mrole == MATRIX_A) ? FETCH_ADDR_A_CYCLICAL :
- FETCH_ADDR_B_CYCLICAL;
- }
-
- ret = ret && (!dim || (num == maxCoords[dim]));
-
- isCycled = ((fctx->addrMode & cycMode) != 0);
- if (!dim) {
- ret = ret && ((num > 1) || (physTile->vecLen > 1) || isCycled);
- }
-
- ret = ret && !(isCycled && (num < maxCoords[dim]));
- }
-
- return ret;
-}
-
-/*
- * Force relative addressing along K or X/Y dimension
- */
-static __inline void
-forceRelativeAddressing(FetchContext *fctx, int kxy)
-{
- if (!kxy) {
- fctx->addrMode |= FETCH_ADDR_K_RELATIVE;
- fctx->addrMode &= ~FETCH_ADDR_K_CYCLICAL;
- }
- else {
- fctx->addrMode |= (FETCH_ADDR_A_RELATIVE |
- FETCH_ADDR_B_RELATIVE);
- fctx->addrMode &= ~(FETCH_ADDR_A_CYCLICAL |
- FETCH_ADDR_B_CYCLICAL);
- }
-}
-
-static bool
-matchTmpCoordBased(const FetchContext *fctx)
-{
- bool ret;
-
- if ((fctx->optLevels & ~GENERIC_OPT_LEVELS) !=
- FOPTLEV_TMP_COORD_PRECOMPUTING) {
-
- ret = false;
- }
- else {
- ret = !(estimateOffsetEvalCheap(fctx, 0) &&
- estimateOffsetEvalCheap(fctx, 1));
- }
-
- return ret;
-}
-
-static int
-prepareTmpCoords(FetchContext *fctx)
-{
- FetchAddrMode addrMode = fctx->addrMode;
- Kstring *vars = fctx->currAgent->vars;
- MatrixRole mrole = fctx->fopts->mrole;
- const Tile *physTile;
- const Kstring *kstr;
- TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv;
- int kdim;
- // reliably known summary number of allocated coordinates
- unsigned int coordsNum = 0;
- unsigned int n;
- unsigned int prepared = 0;
- unsigned int maxCoords[2];
- bool canShare;
- bool isConst;
- bool normBoundK;
- Kstring *boundVars[2] = {&vars[TMP_A_VSIZEK], &vars[TMP_B_VSIZEK]};
- int bvidx; // bound variable index in the previously declared array
- bool skip = false;
-
- /*
- * Consider the number of previously allocated coordinates
- * for the other tile reliable if the caller advises to share
- * possible variables
- */
- canShare = canBeKoffShared(fctx);
- if (canShare) {
- if (mrole == MATRIX_A) {
- coordsNum = info->xbVlen + info->kbVlen;
- }
- else {
- coordsNum = info->yaVlen + info->kaVlen;
- }
- }
-
- kdim = bwidthPhysDimension(fctx);
- physTile = &fctx->physTile;
- maxCoords[0] = tileVecColsNum(physTile);
- maxCoords[1] = physTile->nrRows;
- normBoundK = !kdim && !isLocalMemoryUsed(fctx->fopts) &&
- (fctx->addrMode & FETCH_ADDR_K_CYCLICAL) &&
- (physTile->vecLen > 1);
-
- n = 0;
- if (!estimateOffsetEvalCheap(fctx, 1)) {
- n = selectTmpCoordsNum(fctx, coordsNum, maxCoords[1 - kdim], canShare);
- isConst = (n == maxCoords[1 - kdim]) || (kdim == fctx->outerDim);
- if (!tmpNumSanityCheck(n, isConst, 1, fctx->isLoopPreparation, fctx)) {
- n = 0;
- }
-
- /*
- * Variable coordinates cannot be prepared before the loop starts.
- * If prepared before the loop, the coordinates are considered persistent
- * for more adequate prediction of register consumption.
- * Check also if the coordinates for X or Y have already been
- * prepared at the loop preparation stage
- */
- if (fctx->isLoopPreparation) {
- skip = !isConst ||
- predictHighRegConsumption(fctx, coordsNum + n,
- true, canShare);
- }
- else {
- skip = isConst &&
- (agentLoopPrepCount(fctx) > agentUsageCount(fctx));
- }
-
- if (!skip) {
- if (mrole == MATRIX_A) {
- kstrcpy(&vars[TMP_COORD_AY], "ay");
- kstr = &vars[TMP_COORD_AY];
- info->yaIsConst = isConst;
- }
- else {
- kstrcpy(&vars[TMP_COORD_BX], "bx");
- kstr = &vars[TMP_COORD_BX];
- info->xbIsConst = isConst;
- }
-
- if (n) {
- /*
- * Only offsets along rows of A or columns of B are needed
- * here. So, ensure that the other offset components for A and B
- * don't contribute to the final expression. Setting relative
- * and non-cyclical addressing for them guarantees that the
- * respective expression will be equal to zero
- */
- forceRelativeAddressing(fctx, 0);
- // fire immediate generating of coordinates declaration
- genInitVectCoord(fctx, kstr, n, 1, true, isConst);
- // restore original addressing mode
- fctx->addrMode = addrMode;
- prepared++;
- }
- }
-
- coordsNum += n;
- }
-
- if (!skip) {
- if (mrole == MATRIX_A) {
- info->yaVlen = n;
- }
- else {
- info->xbVlen = n;
- }
- }
-
- bvidx = (mrole == MATRIX_A) ? 0 : 1;
- if (normBoundK) {
- // global size K in vectors for the cyclical addressing
- if (canShare) {
- kstrcpy(boundVars[bvidx], boundVars[1 - bvidx]->buf);
- }
- else if (fctx->isLoopPreparation ||
- (agentLoopPrepCount(fctx) <= agentUsageCount(fctx))) {
-
- const char *name;
- Kstring boundK;
-
- name = (mrole == MATRIX_A) ? "vKA" : "vKB";
- kstrcpy(boundVars[bvidx], name);
- sprintfGboundK(&boundK, fctx);
- kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY,
- "const uint %s = %s;\n",
- boundVars[bvidx]->buf, boundK.buf);
- prepared++;
- }
- }
- else {
- // clear the bound because it may already be out of date
- emptyKstring(boundVars[bvidx]);
- }
-
- if (!fctx->isLoopPreparation) {
- n = 0;
-
- if (!estimateOffsetEvalCheap(fctx, 0)) {
- unsigned int maxn;
-
- // Ignore sharing if number of needed variables is not equal
- if (canShare) {
- maxn = (mrole == MATRIX_A) ? info->kbVlen : info->kaVlen;
- }
- else {
- maxn = maxCoords[kdim];
- }
- n = selectTmpCoordsNum(fctx, coordsNum, maxn, canShare);
- if (n != maxn) {
- canShare = false;
- }
-
- if (canShare) {
- if (mrole == MATRIX_A) {
- kstrcpy(&vars[TMP_COORD_AK], vars[TMP_COORD_BK].buf);
- info->kaIsConst = info->kbIsConst;
- }
- else {
- kstrcpy(&vars[TMP_COORD_BK], vars[TMP_COORD_AK].buf);
- info->kbIsConst = info->kaIsConst;
- }
- }
- else {
- n = selectTmpCoordsNum(fctx, coordsNum,
- maxCoords[kdim], canShare);
- isConst = (n == maxCoords[kdim]) || (kdim != fctx->outerDim);
- if (!tmpNumSanityCheck(n, isConst, 0, false, fctx)) {
- n = 0;
- }
-
- if (mrole == MATRIX_A) {
- kstrcpy(&vars[TMP_COORD_AK], "ak");
- kstr = &vars[TMP_COORD_AK];
- info->kaIsConst = isConst;
- }
- else {
- kstrcpy(&vars[TMP_COORD_BK], "bk");
- kstr = &vars[TMP_COORD_BK];
- info->kbIsConst = isConst;
- }
-
- if (n) {
- const BlasGenSettings *gset = fctx->gset;
- BlasGenSettings newGset;
-
- // substitute normalized bound K if it has been precomputed
- if (normBoundK) {
- int idx = (mrole == MATRIX_A) ? TMP_A_VSIZEK :
- TMP_B_VSIZEK;
-
- memcpy(&newGset, gset, sizeof(BlasGenSettings));
- newGset.varNames.sizeK = vars[idx].buf;
- fctx->gset = &newGset;
- fctx->oevp.gkInVect = true;
- }
- forceRelativeAddressing(fctx, 1);
- genInitVectCoord(fctx, kstr, 1, n, true, isConst);
- fctx->addrMode = addrMode;
- fctx->oevp.gkInVect = false;
- fctx->gset = gset;
- prepared++;
- }
- }
- }
-
- if (mrole == MATRIX_A) {
- info->kaVlen = n;
- }
- else {
- info->kbVlen = n;
- }
- }
-
- return (prepared != 0);
-}
-
-static int
-updateTmpCoords(
- struct FetchContext *fctx,
- unsigned int nextLine,
- unsigned int nextVec,
- int stmtPriority)
-{
- TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv;
- const Kstring *var = NULL;
- Kstring *agvars = fctx->currAgent->vars;
- const Tile *physTile = &fctx->physTile;
- int relIdx = 0;
- int ret = 0;
-
- if (!( (nextLine < physTile->nrRows) &&
- (nextVec < tileVecColsNum(physTile)) )) {
-
- return 0;
- }
-
- /*
- * Update non-constant coordinates. Only one coordinate for
- * each matrix can be non-constant.
- */
- if (fctx->fopts->mrole == MATRIX_A) {
- if ((info->yaVlen == 1) && !info->yaIsConst) {
- var = &agvars[TMP_COORD_AY];
- }
- else if ((info->kaVlen == 1) && !info->kaIsConst) {
- var = &agvars[TMP_COORD_AK];
- relIdx = 1;
- }
- }
- else {
- if ((info->xbVlen == 1) && !info->xbIsConst) {
- var = &agvars[TMP_COORD_BX];
- }
- else if ((info->kbVlen == 1) && !info->kbIsConst) {
- var = &agvars[TMP_COORD_BK];
- relIdx = 1;
- }
- }
-
- if (var != NULL) {
- Kstring offset;
- FetchAddrMode origMode = fctx->addrMode;
-
- /*
- * See the comment for coordinates initialization along X and Y
- * in prepareTmpCoords() to understand why the following is needed
- */
- forceRelativeAddressing(fctx, relIdx);
- sprintfOffsetStateless(&offset, fctx, nextLine, nextVec);
- kgenBatchPrintf(fctx->batch, stmtPriority, "%s = %s;\n",
- var->buf, offset.buf);
- fctx->addrMode = origMode;
- ret = 1;
- }
-
- return ret;
-}
-
-static void
-sprintfTmpCoordBasedOffset(
- Kstring *expr,
- FetchContext *fctx,
- unsigned int line,
- unsigned int vec)
-{
- int kdim;
- const TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv;
- MatrixRole mrole = fctx->fopts->mrole;
- const Kstring *agvars = fctx->currAgent->vars;
- const Kstring *varK, *varXY;
- unsigned int xy, k;
- bool isConstK, isConstXY;
- bool savedK, savedXY;
- unsigned int maxK, maxXY;
- unsigned int idxK, idxXY;
- const BlasGenSettings *gset = fctx->gset;
- BlasGenSettings newGset;
- unsigned int phySizes[2];
- Kstring tmpXY, tmpK;
-
- memcpy(&newGset, gset, sizeof(BlasGenSettings));
- fctx->gset = &newGset;
-
- phySizes[0] = tileVecColsNum(&fctx->physTile);
- phySizes[1] = fctx->physTile.nrRows;
- kdim = bwidthPhysDimension(fctx);
- xy = (kdim) ? vec : line;
- k = (kdim) ? line : vec;
-
- /*
- * If the full set of precomputed coordinates for both dimensions
- * has been saved, then form the target expression simply as the sum of the
- * respective values in the two dimensions. If the set is not full, e.g. only
- * the coordinate for the top left tile corner is saved, or no coordinates
- * are saved at all, then substitute the kernel variables with the respective
- * precomputed values (if there are any for the dimension), select a new line
- * and vector accordingly, and invoke sprintf of the stateless agent.
- * When invoking the stateless agent, cyclical addressing is disabled for a
- * dimension having the full set of precomputed coordinates because they
- * already take it into account. Finally, since precomputed coordinates
- * for the second physical dimension already include multiplication by the
- * leading dimension, disable this step for the stateless agent
- */
-
- if (mrole == MATRIX_A) {
- isConstXY = info->yaIsConst;
- maxXY = info->yaVlen;
- varXY = &agvars[TMP_COORD_AY];
- }
- else {
- isConstXY = info->xbIsConst;
- maxXY = info->xbVlen;
- varXY = &agvars[TMP_COORD_BX];
- }
- idxXY = umin(xy, maxXY - 1);
- savedXY = maxXY && (!isConstXY ||
- (xy < maxXY));
-
- if (mrole == MATRIX_A) {
- isConstK = info->kaIsConst;
- maxK = info->kaVlen;
- varK = &agvars[TMP_COORD_AK];
- }
- else {
- isConstK = info->kbIsConst;
- maxK = info->kbVlen;
- varK = &agvars[TMP_COORD_BK];
- }
- idxK = umin(k, maxK - 1);
- savedK = maxK && (!isConstK ||
- (k < maxK));
-
- if (savedXY && savedK) {
- sprintfVectorComponent(&tmpXY, varXY->buf, idxXY, maxXY);
- sprintfVectorComponent(&tmpK, varK->buf, idxK, maxK);
- ksprintf(expr, "%s + %s", tmpXY.buf, tmpK.buf);
- }
- else {
- FetchAddrMode origMode = fctx->addrMode;
- unsigned int newLine = line;
- unsigned int newVec = vec;
- KernelVarNames *kvars = &newGset.varNames;
- const char **cname;
-
- if (maxXY) {
- cname = (mrole == MATRIX_A) ? &kvars->coordA : &kvars->coordB;
- sprintfVectorComponent(&tmpXY, varXY->buf, idxXY, maxXY);
- *cname = tmpXY.buf;
- if ( savedXY && (!kdim || (maxXY == phySizes[1 - kdim])) ) {
- if (mrole == MATRIX_A) {
- fctx->addrMode &= ~FETCH_ADDR_A_CYCLICAL;
- }
- else {
- fctx->addrMode &= ~FETCH_ADDR_B_CYCLICAL;
- }
- }
-
- if (kdim) {
- newVec = (savedXY) ? 0 : vec;
- fctx->oevp.coordInVect = true;
- }
- else {
- newLine = (savedXY) ? 0 : line;
- }
- }
-
- if (maxK) {
- sprintfVectorComponent(&tmpK, varK->buf, idxK, maxK);
- newGset.varNames.k = tmpK.buf;
- if ( savedK && (kdim || (maxK == phySizes[kdim])) ) {
- fctx->addrMode &= ~FETCH_ADDR_K_CYCLICAL;
- }
-
- if (kdim) {
- newLine = (savedK) ? 0 : line;
- }
- else {
- newVec = (savedK) ? 0 : vec;
- fctx->oevp.coordInVect = true;
- }
- }
-
- // Substitute the bound along K if it's needed
- if ((fctx->addrMode & FETCH_ADDR_K_CYCLICAL) &&
- (maxK < phySizes[kdim])) {
-
- varK = (mrole == MATRIX_A) ? &agvars[TMP_A_VSIZEK] :
- &agvars[TMP_B_VSIZEK];
- if (!isKstringEmpty(varK)) {
- newGset.varNames.sizeK = varK->buf;
- fctx->oevp.gkInVect = true;
- }
- }
-
- // Finally, disable multiplication by the leading dimension
- if ((maxXY && !kdim) || (maxK && kdim)) {
- fctx->oevp.ldNotMul = true;
- }
-
- // let the stateless agent do the rest of the work
- sprintfOffsetStateless(expr, fctx, newLine, newVec);
-
- // restore original settings
- fctx->oevp.coordInVect = false;
- fctx->oevp.gkInVect = false;
- fctx->oevp.ldNotMul = false;
- fctx->addrMode = origMode;
- }
-
- fctx->gset = gset;
-}
-
-static void
-initTmpCoordAgent(AddressingAgent *agent)
-{
- memset(agent, 0, sizeof(AddressingAgent));
- agent->match = matchTmpCoordBased;
- agent->prepareVars = prepareTmpCoords;
- agent->updateVars = updateTmpCoords;
- agent->sprintfAddrOffset = sprintfTmpCoordBasedOffset;
-}
-
-/************* Addressing agent using persistent coordinates ***************/
-
-enum {
- PERS_COORD_A,
- PERS_COORD_B,
- MAX_PERS_COORD_VECLEN = 8
-};
-
-typedef struct PersCoordInfo {
- // length of the vectorized coordinate for A
- unsigned int vlenA;
- // length of the vectorized coordinate for B
- unsigned int vlenB;
-} MAY_ALIAS PersCoordInfo;
-
-static unsigned int
-persCoordIdx(
- const Tile *physTile,
- unsigned int line,
- unsigned int vec,
- int kdim)
-{
- unsigned int n;
-
- if ((line == physTile->nrRows) ||
- (vec == tileVecColsNum(physTile))) {
-
- n = tileVectorsNum(physTile);
- }
- else if (kdim) {
- n = line * tileVecColsNum(physTile) + vec;
- }
- else {
- n = vec * physTile->nrRows + line;
- }
-
- return n;
-}
-
-static bool
-matchPersCoordBased(const FetchContext *fctx)
-{
- bool ret;
-
- if ((fctx->optLevels & ~GENERIC_OPT_LEVELS) !=
- FOPTLEV_PERS_COORD_PRECOMPUTING) {
-
- ret = false;
- }
- else {
- unsigned int maxK, depthK;
- int kdim;
-
- ret = !(estimateOffsetEvalCheap(fctx, 0) &&
- estimateOffsetEvalCheap(fctx, 1)) &&
- !isLocalMemoryUsed(fctx->fopts);
- ret = ret && !(fctx->addrMode & (FETCH_ADDR_K_RELATIVE |
- FETCH_ADDR_K_CYCLICAL));
-
- /*
- * Don't use this agent if the dimension K is passed in the inner loop
- * and the maximum possible number of coordinates is not sufficient to
- * cover the entire tile size in this dimension. Using this agent
- * also makes no sense if even a single step along K cannot be covered.
- */
- depthK = persVarDepthK(fctx, MAX_PERS_COORD_VECLEN);
- // take any huge number to know maximum depth along K
- maxK = persVarDepthK(fctx, 16384);
- kdim = bwidthPhysDimension(fctx);
-
- ret = ret && (depthK && ((depthK == maxK) ||
- (fctx->outerDim == kdim)));
- }
-
- return ret;
-}
-
-static int
-preparePersCoords(FetchContext *fctx)
-{
- unsigned int depthK;
- unsigned int n;
- Kstring *var;
- bool decl;
- int kdim;
- PersCoordInfo *info;
- MatrixRole mrole;
-
- if (agentLoopPrepCount(fctx) > agentUsageCount(fctx)) {
- return 0;
- }
-
- info = (PersCoordInfo*)fctx->currAgent->priv;
- mrole = fctx->fopts->mrole;
- if (mrole == MATRIX_A) {
- var = &fctx->currAgent->vars[PERS_COORD_A];
- decl = isKstringEmpty(var);
- if (decl) {
- kstrcpy(var, "vca");
- }
- }
- else {
- var = &fctx->currAgent->vars[PERS_COORD_B];
- decl = isKstringEmpty(var);
- if (decl) {
- kstrcpy(var, "vcb");
- }
- }
-
- kdim = bwidthPhysDimension(fctx);
- n = (kdim) ? tileVecColsNum(&fctx->physTile) : fctx->physTile.nrRows;
- depthK = persVarDepthK(fctx, MAX_PERS_COORD_VECLEN);
- if (mrole == MATRIX_A) {
- info->vlenA = n * depthK;
- }
- else {
- info->vlenB = n * depthK;
- }
-
- genInitVectCoord(fctx, var, n, depthK, decl, false);
-
- return 1;
-}
-
-static int
-updatePersCoords(
- FetchContext *fctx,
- unsigned int nextLine,
- unsigned int nextVec,
- int stmtPriority)
-{
- unsigned int step;
- int kdim;
- struct StatementBatch *batch = fctx->batch;
- const Kstring *var = (fctx->fopts->mrole == MATRIX_A) ?
- &fctx->currAgent->vars[PERS_COORD_A] :
- &fctx->currAgent->vars[PERS_COORD_B];
- unsigned int nextCoord, maxCoords;
- PersCoordInfo *info = (PersCoordInfo*)fctx->currAgent->priv;
- const Tile *physTile;
-
- kdim = bwidthPhysDimension(fctx);
- maxCoords = (fctx->fopts->mrole == MATRIX_A) ? info->vlenA : info->vlenB;
- nextCoord = persCoordIdx(&fctx->physTile, nextLine, nextVec, kdim);
- if (nextCoord % maxCoords != 0) {
- return 0;
- }
-
- physTile = &fctx->physTile;
- step = (kdim) ? (maxCoords / tileVecColsNum(physTile)) :
- (maxCoords / physTile->nrRows);
- if (fctx->addrMode & FETCH_ADDR_BW_STRIDE) {
- step *= (unsigned int)fctx->gset->subdims[0].bwidth;
- }
-
- if (kdim) {
- struct RawLD ld;
- Kstring tmp1, tmp2;
-
- fillRawLD(&ld, fctx);
- ksprintf(&tmp1, "%u", step);
- sprintfFastScalarMad(&tmp2, &tmp1, &ld.str, ld.scale, NULL);
- kgenBatchPrintf(batch, stmtPriority, "%s += %s;\n",
- var->buf, tmp2.buf);
- }
- else {
- kgenBatchPrintf(batch, stmtPriority, "%s += %u;\n",
- var->buf, step);
- }
-
- return 1;
-}
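-
-/*
- * For example (with illustrative names and step values), when the leading
- * dimension is not involved the emitted update has the form "vcb += 2;";
- * in the other case the step is additionally scaled by the leading
- * dimension via sprintfFastScalarMad().
- */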
-
-static void
-sprintfPersCoordBasedOffset(
- Kstring *kstr,
- FetchContext *fctx,
- unsigned int line,
- unsigned int vec)
-{
- const Kstring *var;
- unsigned int kdim;
- unsigned int idx, maxIdx;
- PersCoordInfo *info = (PersCoordInfo*)fctx->currAgent->priv;
-
- kdim = bwidthPhysDimension(fctx);
- maxIdx = (fctx->fopts->mrole == MATRIX_A) ? info->vlenA : info->vlenB;
- idx = persCoordIdx(&fctx->physTile, line, vec, kdim);
-
- var = (fctx->fopts->mrole == MATRIX_A) ?
- &fctx->currAgent->vars[PERS_COORD_A] :
- &fctx->currAgent->vars[PERS_COORD_B];
-
- sprintfVectorComponent(kstr, var->buf, idx % maxIdx, maxIdx);
-}
-
-static void
-initPersCoordAgent(AddressingAgent *agent)
-{
- memset(agent, 0, sizeof(AddressingAgent));
- agent->match = matchPersCoordBased;
- agent->prepareVars = preparePersCoords;
- agent->updateVars = updatePersCoords;
- agent->sprintfAddrOffset = sprintfPersCoordBasedOffset;
-}
-
-/***************************************************************************/
-
-static void
-initPhysTile(FetchContext *fctx)
-{
- MatrixRole mrole = fctx->fopts->mrole;
- const BlasGenSettings *gset = fctx->gset;
- const Tile *dstTile;
- bool trans;
- Tile *physTile = &fctx->physTile;
-
- dstTile = getDstTile(fctx);
- trans = dstTile->trans;
-
- memset(physTile, 0, sizeof(Tile));
- if ((mrole == MATRIX_A) && !(gset->flags & BGF_WHOLE_A)) {
- const SubproblemDim *dim = &gset->subdims[1];
-
- physTile->nrRows = (unsigned int)(trans ? dim->bwidth : dim->y);
- physTile->nrCols = (unsigned int)(trans ? dim->y : dim->bwidth);
- }
- else {
- physTile->nrRows = trans ? dstTile->nrCols : dstTile->nrRows;
- physTile->nrCols = trans ? dstTile->nrRows : dstTile->nrCols;
- }
-
- physTile->vecLen = getVecLen(gset, CLBLAS_GEMM, mrole);
- physTile->baseName = (mrole == MATRIX_A) ? gset->varNames.A :
- gset->varNames.B;
-}
-
-static void
-sprintfPhysTileElement(
- Kstring *elem,
- FetchContext *fctx,
- unsigned int line,
- unsigned int vec)
-{
- Kstring ptr;
- Kstring off;
- const char *varName;
- const BlasGenSettings *gset = fctx->gset;
-
- varName = (fctx->fopts->mrole == MATRIX_A) ? gset->varNames.A :
- gset->varNames.B;
- if (fctx->gset->flags & BGF_UPTRS) {
- const char *ptrName;
-
- getVectorTypeName(gset->kextra->dtype, fctx->physTile.vecLen,
- NULL, &ptrName);
- ksprintf(&ptr, "%s.%s", varName, ptrName);
- }
- else {
- kstrcpy(&ptr, varName);
- }
-
- fctx->currAgent->sprintfAddrOffset(&off, fctx, line, vec);
- ksprintf(elem, "%s[%s]", ptr.buf, off.buf);
-}
-
-static void
-genHandLoad(
- FetchContext *fctx,
- const Tile *dstTile,
- unsigned int lineOffset,
- unsigned int line,
- unsigned int vec,
- unsigned int vecLen,
- int stmtPriority)
-{
- Kstring src, dst;
- unsigned int row, col;
-
- row = (dstTile->trans) ? (vec * vecLen) : line;
- col = (dstTile->trans) ? line : (vec * vecLen);
-
- sprintfPhysTileElement(&src, fctx, line + lineOffset, vec);
- sprintfTileElement(&dst, dstTile, row, col, vecLen);
- kgenBatchPrintf(fctx->batch, stmtPriority,
- "%s = %s;\n", dst.buf, src.buf);
-}
-
-/*
- * Invoke the variable update method if it is present.
- * Return the priority that must be used for subsequent statements.
- * Via the parameter 'priority' the function accepts the last used
- * priority level
- */
-static int
-checkGenUpdateVars(
- FetchContext *fctx,
- unsigned int nextLine,
- unsigned int nextVec,
- int priority)
-{
- AddressingAgent *agent = fctx->currAgent;
- const Tile *physTile = &fctx->physTile;
- int nextPrio;
- bool endTile;
-
- endTile = (nextLine == physTile->nrRows) ||
- (nextVec == physTile->nrCols);
- if (endTile) {
- kgenAddStmtToBatch(fctx->batch, priority, "\n");
- }
-
- nextPrio = canBeFetchesMerged(fctx) ? (priority + 1) : priority;
-
- if (agent->updateVars &&
- agent->updateVars(fctx, nextLine, nextVec, nextPrio)) {
-
- if (canBeFetchesMerged(fctx)) {
- priority += 2;
- }
- }
- else if (!endTile && (fctx->fopts->linesNum == 1) &&
- tileVecColsNum(physTile) > 1) {
-
- kgenAddStmtToBatch(fctx->batch, priority, "\n");
- }
-
- return priority;
-}
-
-static void
-doGenFetch(FetchContext *fctx)
-{
- const FetchOpts *fetchOpts = fctx->fopts;
- unsigned int lineOffset = fetchOpts->lineOffset;
- unsigned int linesNumber = fetchOpts->linesNum;
- const Tile *physTile, *dstTile;
- unsigned int i, j;
- // length of vectors the tile will be fetched with
- unsigned int vecLen;
- int priority = PREPARE_VARS_STMT_PRIORITY + 1;
-
- physTile = &fctx->physTile;
- dstTile = getDstTile(fctx);
- vecLen = umin(dstTile->vecLen, physTile->vecLen);
-
- if (fctx->outerDim) {
- for (i = 0; i < linesNumber; i++) {
- for (j = 0; j < physTile->nrCols / vecLen; j++) {
- /*
- * TODO: add ability to use load with vload() depending
- * on some option set
- */
- genHandLoad(fctx, dstTile, lineOffset, i, j, vecLen,
- priority);
- }
- priority = checkGenUpdateVars(fctx, lineOffset + i + 1, 0,
- priority);
- }
- }
- else {
- for (j = 0; j < tileVecColsNum(physTile); j++) {
- for (i = 0; i < linesNumber; i++) {
- genHandLoad(fctx, dstTile, lineOffset, i, j, vecLen,
- priority);
- }
- priority = checkGenUpdateVars(fctx, lineOffset, j + 1,
- priority);
- }
- }
-}
-
-
-struct FetchContext
-*createFetchContext(void)
-{
- FetchContext *fctx;
- int i = 0;
-
- fctx = calloc(1, sizeof(FetchContext));
- if (fctx == NULL) {
- // bail out early to avoid dereferencing NULL below
- return NULL;
- }
-
- fctx->addrMode = FETCH_ADDR_NORMAL;
- fctx->optLevels = FOPTLEV_TMP_COORD_PRECOMPUTING;
-
- // init addressing agents
- while (initAgentsTable[i] != NULL) {
- initAgentsTable[i](&fctx->agents[i]);
- i++;
- }
-
- fctx->oevp.leadVecLen = 1;
- fctx->outerDim = 1;
-
- return fctx;
-}
-
-void
-destroyFetchContext(struct FetchContext *fctx)
-{
- free(fctx);
-}
-
-FetchOptLevel
-getFetchOptLevels(struct FetchContext *fctx)
-{
- return fctx->optLevels;
-}
-
-void
-enableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels)
-{
- fctx->optLevels |= levels;
-}
-
-void
-disableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels)
-{
- fctx->optLevels &= ~levels;
-}
-
-FetchAddrMode
-getFetchAddrMode(const struct FetchContext *fctx)
-{
- return fctx->addrMode;
-}
-
-void
-setFetchAddrMode(struct FetchContext *fctx, FetchAddrMode mode)
-{
- fctx->addrMode = mode;
-}
-
-FetchAddrMode
-setDefaultFetchAddrMode(
- struct FetchContext *fctx,
- const BlasGenSettings *gset,
- FetchAddrMode mask,
- int tailStatus,
- bool processTailK)
-{
- FetchAddrMode addrMode = fctx->addrMode;
- KernelExtraFlags kflags = gset->kextra->flags;
-
- if ((kflags & KEXTRA_TAILS_M_LOWER) && !(tailStatus & TAIL_A_RAISED)) {
- addrMode &= ~FETCH_ADDR_A_RELATIVE;
- addrMode |= FETCH_ADDR_A_CYCLICAL;
- }
- else {
- addrMode &= ~FETCH_ADDR_A_CYCLICAL;
- addrMode |= FETCH_ADDR_A_RELATIVE;
- }
-
- if ((kflags & KEXTRA_TAILS_N_LOWER) && !(tailStatus & TAIL_B_RAISED)) {
- addrMode &= ~FETCH_ADDR_B_RELATIVE;
- addrMode |= FETCH_ADDR_B_CYCLICAL;
- }
- else {
- addrMode &= ~FETCH_ADDR_B_CYCLICAL;
- addrMode |= FETCH_ADDR_B_RELATIVE;
- }
-
- if (kflags & KEXTRA_TAILS_K_LOWER) {
- addrMode &= ~FETCH_ADDR_K_RELATIVE;
- }
- else {
- addrMode |= FETCH_ADDR_K_RELATIVE;
- }
- if (processTailK) {
- addrMode |= FETCH_ADDR_K_CYCLICAL | FETCH_ADDR_TAILK_PADD;
- }
- else {
- addrMode &= ~(FETCH_ADDR_K_CYCLICAL | FETCH_ADDR_TAILK_PADD);
- }
-
- addrMode &= ~mask;
- fctx->addrMode = addrMode;
-
- return addrMode;
-}
-
-int
-prepareFetchLoop(
- struct KgenContext *genCtx,
- struct FetchContext *fetchCtx,
- const BlasGenSettings *gset,
- CLMemType memA,
- CLMemType memB)
-{
- AddressingAgent *agent, *saved;
- FetchOpts fopts;
- int i;
- int ret = 0;
- int cnt = 0;
-
- memset(&fopts, 0, sizeof(FetchOpts));
- fopts.memA = memA;
- fopts.memB = memB;
-
- fetchCtx->fopts = &fopts;
- fetchCtx->gset = gset;
-
- fetchCtx->batch = createStmtBatch();
- if (fetchCtx->batch == NULL) {
- return -ENOMEM;
- }
-
- saved = fetchCtx->prevAgent;
-
- fetchCtx->isLoopPreparation = true;
- for (i = 0; i < 2; i++) {
- fopts.mrole = (i) ? MATRIX_A : MATRIX_B;
- initPhysTile(fetchCtx);
- selectAddrAgent(fetchCtx);
- agent = fetchCtx->currAgent;
- if (agent->prepareVars) {
- if (agent->prepareVars(fetchCtx)) {
- cnt++;
- incAgentLoopPrepCount(fetchCtx);
- /*
- * Substitute the previous agent so that it could
- * know that some variables can really be shared
- * if it is selected again
- */
- fetchCtx->prevAgent = agent;
- }
- }
- }
- fetchCtx->isLoopPreparation = false;
-
- fetchCtx->prevAgent = saved;
-
- if (cnt) {
- flushStmtBatch(genCtx, fetchCtx->batch);
- ret = kgenAddBlankLine(genCtx);
- if (ret) {
- ret = -EOVERFLOW;
- }
- }
-
- destroyStmtBatch(fetchCtx->batch);
- fetchCtx->batch = NULL;
-
- return ret;
-}
-
-void
-revalidateFetchContext(struct FetchContext *fctx, MatrixRole mrole)
-{
- if (fctx->currAgent != NULL) {
- int i = (mrole == MATRIX_A) ? 0 : 1;
-
- fctx->valid[i] = true;
- }
-}
-
-static void
-genFetchCommon(struct FetchContext *fctx)
-{
- if (fctx->fopts->mulOpts) {
- fctx->addrMode = fetchAddrModeFromMulOpts(fctx->fopts->mulOpts);
- }
-
- // prepare needed variables
- if (!isFetchContextValid(fctx)) {
- fctx->prevAgent = fctx->currAgent;
- selectAddrAgent(fctx);
- if (fctx->currAgent->prepareVars &&
- fctx->currAgent->prepareVars(fctx)) {
-
- kgenAddStmtToBatch(fctx->batch, PREPARE_VARS_STMT_PRIORITY, "\n");
- }
- }
-
- // fire fetch generation
- revalidateFetchContext(fctx, fctx->fopts->mrole);
- doGenFetch(fctx);
- incAgentUsageCount(fctx);
- invalidateFetchContext(fctx);
-}
-
-int
-genFetchInputTile(
- struct KgenContext *ctx,
- struct FetchContext *fctx,
- const BlasGenSettings *gset,
- const FetchOpts *fetchOpts)
-{
- int ret;
-
- fctx->batch = createStmtBatch();
- if (fctx->batch == NULL) {
- return -ENOMEM;
- }
-
- fctx->fopts = fetchOpts;
- fctx->gset = gset;
- initPhysTile(fctx);
-
- genFetchCommon(fctx);
- ret = flushStmtBatch(ctx, fctx->batch);
-
- destroyStmtBatch(fctx->batch);
- fctx->batch = NULL;
-
- return (ret) ? -EOVERFLOW : 0;
-}
-
-void
-genFetchInputTileBatch(
- struct StatementBatch *batch,
- struct FetchContext *fctx,
- const struct BlasGenSettings *gset,
- const FetchOpts *fetchOpts)
-{
- fctx->fopts = fetchOpts;
- fctx->gset = gset;
- initPhysTile(fctx);
- fctx->batch = batch;
-
- genFetchCommon(fctx);
- fctx->batch = NULL;
-}