diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/blas_kgen.c')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/blas_kgen.c | 1580 |
1 files changed, 0 insertions, 1580 deletions
diff --git a/external/clBLAS/src/library/blas/gens/blas_kgen.c b/external/clBLAS/src/library/blas/gens/blas_kgen.c deleted file mode 100644 index 595fe106..00000000 --- a/external/clBLAS/src/library/blas/gens/blas_kgen.c +++ /dev/null @@ -1,1580 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - - -/* - * common stuff for blas related - * kernel generators - */ - -#include <string.h> -#include <stdlib.h> -#include <stdio.h> -#include <assert.h> - -#include <list.h> -#include <clblas_stddef.h> - -#include <matrix_props.h> -#include <matrix_dims.h> -#include <dis_warning.h> - -#include "blas_kgen.h" -#include "gen_helper.h" -#include "tile_iter.h" -#include "kerngen.h" - -#define IDX_INVAL ((unsigned int)-1) - -enum { - COORD_STRLEN = 64 -}; - -static unsigned int -getTmpVecLen( - const BlasGenSettings *gset, - UpdateResultFlags uflags, - const char **vecName) -{ - const CLBLASKernExtra *kextra = gset->kextra; - unsigned int vecLen; - - if (isComplexType(kextra->dtype) || (uflags & (UPRES_GENERIC | - UPRES_NO_VECTORIZATION))) { - vecLen = 1; - } - else { - vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? kextra->vecLenC : - kextra->vecLen; - getVectorTypeName(kextra->dtype, vecLen, vecName, NULL); - } - - return vecLen; -} - -/* - * Try to transform kernel string to integer. - * Return -1. If this is not a number. - */ -static int -stringToInt(const char *str, unsigned int *num) -{ - char *end; - unsigned int n; - int ret = -1; - - n = (unsigned int)strtol(str, &end, 10); - // believe it is a number if the string has been parsed completely - if ((end != str) && (*end == '\0')) { - *num = n; - ret = 0; - } - - return ret; -} - -void -sprintfVecChunk( - char *chunk, - unsigned int vecLen, - unsigned int clen, - unsigned int vecOff) -{ - const char *vect = "0123456789abcdef"; - - if (clen == vecLen) { - chunk[0] = '\0'; - } - else { - snprintf(chunk, clen + 3, ".s%s", vect + vecOff); - chunk[clen + 2] = '\0'; - } -} - -unsigned int -getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole) -{ - unsigned int vecLen = 0; - const CLBLASKernExtra *kextra = gset->kextra; - - DUMMY_ARG_USAGE(funcID); - - if (!(gset->flags & BGF_DISTINCT_VECLEN)) { - vecLen = umin(kextra->vecLenA, kextra->vecLenB); - vecLen = umin(vecLen, kextra->vecLenC); - } - else { - switch (mrole) { - case MATRIX_A: - vecLen = kextra->vecLenA; - break; - case MATRIX_B: - vecLen = kextra->vecLenB; - break; - case MATRIX_C: - vecLen = kextra->vecLenC; - break; - default: - break; - } - } - - return vecLen; -} - -void -genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset) -{ - const KernelVarNames *kvars; - unsigned int vecLen; - bool done = false; - - if (!(gset->flags & BGF_LD_IN_VECTORS)) { - return; - } - - kvars = &gset->varNames; - - vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_A); - if ((kvars->lda != NULL) && (vecLen > 1)) { - kgenPrintf(ctx, "%s /= %u;\n", kvars->lda, vecLen); - done = true; - } - - vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_B); - if ((kvars->ldb != NULL) && (vecLen > 1) && (kvars->ldb != kvars->lda)) { - kgenPrintf(ctx, "%s /= %u;\n", kvars->ldb, vecLen); - done = true; - } - - vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_C); - if ((kvars->ldc != NULL) && (vecLen > 1) && - (kvars->ldc != kvars->lda) && (kvars->ldc != kvars->ldb)) { - - kgenPrintf(ctx, "%s /= %u;\n", kvars->ldc, vecLen); - done = true; - } - - if (done) { - kgenAddBlankLine(ctx); - } -} - -void -getPrivateAreaInfo( - const BlasGenSettings *gset, - BlasFunctionID funcID, - MatrixRole mrole, - PrivateArea *area) -{ - const CLBLASKernExtra *kextra = gset->kextra; - const SubproblemDim *dim = &gset->subdims[1]; - - area->vecLen = getVecLen(gset, funcID, mrole); - getVectorTypeName(kextra->dtype, area->vecLen, &area->typeName, NULL); - if (mrole == MATRIX_C) { - area->size = (unsigned int)(divRoundUp(dim->x, area->vecLen) * dim->y); - } - else { - size_t h = (mrole == MATRIX_A) ? dim->y : dim->x; - - area->size = (unsigned int)(h * dim->bwidth / area->vecLen); - } -} - -void -declarePrivateArea( - struct KgenContext *ctx, - const PrivateArea *area, - const char *baseName, - PrivateStorageType storType) -{ - char tmp[1024]; - unsigned int i; - - // TODO: separate case for size equal to 1 - if (storType == PRIV_STORAGE_ARRAY) { - sprintf(tmp, "%s %s[%u];\n", area->typeName, baseName, area->size); - } - else { - char *p; - - sprintf(tmp, "%s %s0", area->typeName, baseName); - p = tmp + strlen(tmp); - for (i = 1; i < area->size; i++) { - sprintf(p, ", %s%u", baseName, i); - p += strlen(p); - } - strcpy(p, ";\n"); - } - - kgenAddStmt(ctx, tmp); -} - -int -defaultTilePostFetch( - struct KgenContext *ctx, - MatrixRole mrole, - void *priv) -{ - char tmp[1024], cond[128]; - Kstring src; - TilePostFetchPrivate *pfPriv = (TilePostFetchPrivate*)priv; - bool distVect = (pfPriv->gset->flags & BGF_DISTINCT_VECLEN); - const KernelVarNames *vnames = &pfPriv->gset->varNames; - const CLBLASKernExtra *kextra = pfPriv->gset->kextra; - const SubproblemDim *dim = &pfPriv->gset->subdims[1]; - BlasFunctionID funcID = pfPriv->funcID; - const Tile* tile; - bool partA; - unsigned int step; - unsigned int i, j; - int ret = 0; - unsigned int maxJ = 0; - unsigned int maxI = 0; - - if (!isNeedZeroTileTail(funcID, dim, kextra, mrole, distVect)) { - return 0; - } - - if (mrole == MATRIX_A) { - tile = &pfPriv->gset->tileA; - maxJ = tile->nrCols; - maxI = tile->nrRows; - } - else { - tile = &pfPriv->gset->tileBX; - maxJ = tile->nrRows; - maxI = tile->nrCols; - } - - partA = (mrole == MATRIX_A) && tile->trans && - !(pfPriv->gset->flags & BGF_WHOLE_A); - step = tileLineSegmentLen(tile); - step = (tile->trans ^ (mrole == MATRIX_A)) ? 1 : step; - - for (j = 0; (j < maxJ) && !ret; j++) { - unsigned int k; - - k = umax(j, (unsigned int)pfPriv->fetchNumA); - if (k) { - sprintf(tmp, " + %u", k); - } - else { - tmp[0] = '\0'; - } - sprintf(cond, "(%s%s < %s)", vnames->k, tmp, vnames->sizeK); - - for (i = 0; (i < maxI) && !ret; i += step) { - if (mrole != MATRIX_A) { - sprintfTileElement(&src, tile, j, i, step); - } - else { - sprintfTileElement(&src, tile, i, j, step); - } - sprintf(tmp, "%s = %s ? %s : 0;\n", src.buf, cond, src.buf); - ret = kgenAddStmt(ctx, tmp); - } - } - - if (partA) { - pfPriv->fetchNumA++; - } - - if ((tile->nrCols * tile->nrRows / tile->vecLen > 1) && !ret) { - ret = kgenAddBlankLine(ctx); - } - - return ret; -} - -char -dtypeToBlasPrefix(DataType dtype) -{ - char c; - - if (dtype == TYPE_FLOAT) { - c = 's'; - } - else { - c = dtypeToPrefix(dtype); - } - - return c; -} - -TileMulFlags -kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags) -{ - TileMulFlags mf = TILEMUL_NO_FLAGS; - - if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A)) { - mf |= TILEMUL_TRA; - } - if (isMatrixConj(kflags, MATRIX_A)) { - mf |= TILEMUL_CONJA; - } - if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_B)) { - mf |= TILEMUL_TRB; - } - if (isMatrixConj(kflags, MATRIX_B)) { - mf |= TILEMUL_CONJB; - } - - return mf; -} - -void -getResultGPRsInfo( - DataType dtype, - const SubproblemDim *dims, - unsigned int vecLen, - unsigned int *nrRegs, - const char **typeName) -{ - if (isComplexType(dtype)) { - if (nrRegs) { - *nrRegs = (unsigned int)(dims->x * dims->y); - } - if (typeName != NULL) { - *typeName = dtypeBuiltinType(dtype); - } - } - else { - // handle different vecLen values and fetch vector sizes - if (nrRegs) { - *nrRegs = (unsigned int)(divRoundUp(dims->x, vecLen) * dims->y); - } - if (typeName != NULL) { - getVectorTypeName(dtype, vecLen, typeName, NULL); - } - } -} - -static void genVectorCPtr( struct KgenContext *pCtx, - const BlasGenSettings *pGSet, - const char* GPtrName, - const char* VCPtrName ) -{ - const char *typeName; - unsigned int vecLen = 0; - - vecLen = getVecLen( pGSet, 0, MATRIX_C ); - vecLen = vecLen > pGSet->tileCY.vecLen ? - pGSet->tileCY.vecLen : - vecLen; - - getVectorTypeName( pGSet->kextra->dtype, - vecLen, - &typeName, - NULL ); - - if ( 0 == (pGSet->flags & BGF_LD_IN_VECTORS) ) { - - vecLen = 1; - } - // Blas function ID is omitted - if ( isComplexType( pGSet->kextra->dtype ) ) { - vecLen *= 2; - } - - if ( isDoubleBasedType(pGSet->kextra->dtype) ) { - - if ( 1 == vecLen ) { - - kgenPrintf( - pCtx, - "__global %s *%s = %s.d;\n", - typeName, - VCPtrName, - GPtrName); - } - else { - - kgenPrintf( pCtx, - "__global %s *%s = %s.d%dv;\n", - typeName, - VCPtrName, - GPtrName, - vecLen); - } - } - else { - - if ( 1 == vecLen ) { - - kgenPrintf( - pCtx, - "__global %s *%s = %s.f;\n", - typeName, - VCPtrName, - GPtrName); - } - else { - - kgenPrintf( pCtx, - "__global %s *%s = %s.f%dv;\n", - typeName, - VCPtrName, - GPtrName, - vecLen); - } - } -} - -static void -updateOptimResultGen( - struct KgenContext *pCtx, - const BlasGenSettings *pGSet, - BlasFunctionID funcID, - UpdateResultOp op, - UpdateResultFlags flags) -{ - KernelExtraFlags kflags = pGSet->kextra->flags; - Tile tempCTile; - Tile fullCTile; - unsigned int physVecLenC; - DataType dtype; - const KernelVarNames *pVNames = NULL; - PhysTileIterator physIter; - PhysTileIterator blkIter; - char cPtrName[] = "pC"; - const char *typeNameC; - bool phyTrans = 0; - unsigned int vecLen = 0; - unsigned int nBlocks = 0; - unsigned int i = 0; - - Kstring cElem; - Kstring tempCElem; - Kstring kstrFirst; - Kstring kstrSecond; - Kstring kstrThird; - Kstring expr; - - //EINVAL - if ( NULL == pCtx || - NULL == pGSet ) { - - return; - } - - dtype = pGSet->kextra->dtype; - pVNames = &pGSet->varNames; - phyTrans = ( (flags & UPRES_COLUMN_MAJOR ) != 0 ); - - physVecLenC = getVecLen( pGSet, funcID, MATRIX_C ); - getVectorTypeName( dtype, - getVecLen( pGSet,0,MATRIX_C ), - &typeNameC, - NULL ); - - // declare private C pointer - genVectorCPtr( pCtx, pGSet, "uC", "pC" ); - - kgenAddBlankLine( pCtx ); - - // calculate the number of blocks, update should be divided on - nBlocks = pGSet->tileCY.nrCols * pGSet->tileCY.nrRows/( - pGSet->tileA.nrCols*pGSet->tileA.nrRows + - pGSet->tileBX.nrCols*pGSet->tileBX.nrRows ); - - if( pGSet->tileCY.nrCols * pGSet->tileCY.nrRows%( - pGSet->tileA.nrCols*pGSet->tileA.nrRows + - pGSet->tileBX.nrCols*pGSet->tileBX.nrRows ) ){ - - nBlocks++; - } - - nBlocks = roundUpPow2( (int)nBlocks ); - - // declare the temporary C tile - // temporary C tile must have the same transposition as C matrix - // for read-write optimization it also has the same vectorization - if ( phyTrans ) { - - if ( nBlocks > pGSet->tileCY.nrCols ) { - nBlocks = pGSet->tileCY.nrCols; - } - - initTile( &tempCTile, - "tempC", - pGSet->tileCY.nrRows, - pGSet->tileCY.nrCols/nBlocks, - pGSet->tileCY.vecLen, - dtype, - PRIV_STORAGE_VARIABLE_SET, - phyTrans, - true ); - - initTile( &fullCTile, - "fullC", - pGSet->tileCY.nrRows, - pGSet->tileCY.nrCols, - pGSet->tileCY.vecLen, - dtype, - PRIV_STORAGE_VARIABLE_SET, - phyTrans, - true); - } - else { - - if ( nBlocks > pGSet->tileCY.nrRows ) { - nBlocks = pGSet->tileCY.nrRows; - } - - initTile( &tempCTile, - "tempC", - pGSet->tileCY.nrRows/nBlocks, - pGSet->tileCY.nrCols, - pGSet->tileCY.vecLen, - dtype, - PRIV_STORAGE_VARIABLE_SET, - phyTrans, - true ); - - initTile( &fullCTile, - "fullC", - pGSet->tileCY.nrRows, - pGSet->tileCY.nrCols, - pGSet->tileCY.vecLen, - dtype, - PRIV_STORAGE_VARIABLE_SET, - phyTrans, - true); - } - - declareOneTileStorage( pCtx, &tempCTile ); - - // splitting update result on several blocks to prevent - // increasing GPR usage - for ( i = 0; i < nBlocks; i++ ) { - - kgenAddBlankLine(pCtx); - - // fetch ------------------------------------------------------------------ - vecLen = umin( physVecLenC, pGSet->tileCY.vecLen ); - vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) ); - - iterInit( &blkIter, &tempCTile, vecLen, 0 ); - iterInit( &physIter, &fullCTile, vecLen, 0 ); - - iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec ); - - if (op == UPRES_SUM) { - for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ), - iterIterate( &physIter ) ) { - - emptyKstring( &kstrFirst ); - emptyKstring( &kstrSecond ); - emptyKstring( &kstrThird ); - emptyKstring( &cElem ); - emptyKstring( &tempCElem ); - - sprintfTileElement( &tempCElem, - &tempCTile, - blkIter.row, - blkIter.col, - vecLen); - - ksprintf( &kstrFirst, "%d", physIter.line ); - ksprintf( &kstrSecond, "%s", pVNames->ldc ); - ksprintf( &kstrThird, "%d", blkIter.vec ); - - sprintfFastScalarMad( &expr, - &kstrFirst, - &kstrSecond, - vecLen,//physVecLenC,//scale ldc - &kstrThird); - - kgenPrintf( pCtx, - "%s = %s[%s];\n", - tempCElem.buf, - cPtrName, - expr.buf ); - - } - } - - // beta --------------------------------------------------------------- - if ( flags & UPRES_WITH_BETA ) { - - if ( isComplexType(dtype) || - ( pGSet->tileCY.trans != tempCTile.trans ) ) { - vecLen = 1; - } - //TODO: for real datatype find longest available veclen can be used - //to generate more compact code - else { - vecLen = pGSet->tileCY.vecLen; - } - vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) ); - - iterInit( &blkIter, &tempCTile, vecLen, 0 ); - - for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ) ) { - - sprintfTileElement( &tempCElem, - &tempCTile, - blkIter.row, - blkIter.col, - vecLen); - - if ( isComplexType(dtype) ) { - //complex mad - ksprintf( &kstrSecond, "%s", pVNames->beta ); - sprintfComplexMulUpdate( &expr, - &tempCElem, - &tempCElem, - &kstrSecond, - NULL, - isDoubleBasedType(dtype), - 0, - 0, - 0 ); - kgenPrintf( pCtx, "%s", expr.buf ); - } - else { - if ((kflags & KEXTRA_ENABLE_MAD) != 0) { - kgenPrintf( pCtx, - "%s = mad(%s, %s, 0);\n", - tempCElem.buf, - tempCElem.buf, - pVNames->beta); - } - else { - kgenPrintf( pCtx, - "%s = %s * %s;\n", - tempCElem.buf, - tempCElem.buf, - pVNames->beta); - } - } - } - } - - // alpha--------------------------------------------------------------- - if ( (phyTrans == pGSet->tileCY.trans) && (!isComplexType(dtype)) ) { - - vecLen = pGSet->tileCY.vecLen; - } - else { - vecLen = 1; - } - vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) ); - - iterInit( &blkIter, &tempCTile, vecLen, 0 ); - iterInit( &physIter, &fullCTile, vecLen, 0 ); - - iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec ); - - for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ), - iterIterate( &physIter) ) { - - const Kstring *dst; - - dst = (flags & UPRES_PRIV_DEST) ? &cElem : &tempCElem; - - sprintfTileElement( &tempCElem, - &tempCTile, - blkIter.row, - blkIter.col, - vecLen); - - sprintfTileElement( &cElem, - &pGSet->tileCY, - physIter.row, - physIter.col, - vecLen); - - // complex - if ( isComplexType(dtype) ) { - - ksprintf( &kstrSecond, "%s", pVNames->alpha ); - - // upres op: sum or set, if set, third argument - // of complex mad() is zero - sprintfComplexMulUpdate( &expr, - dst, - &cElem, - &kstrSecond, - (op == UPRES_SUM) ? &tempCElem : NULL, - isDoubleBasedType(dtype), - 0, - 0, - 0); - kgenPrintf( pCtx, "%s", expr.buf ); - - } - // real - else { - - // upres op: sum or set, if set, third argument - // of mad() is zero - if ((kflags & KEXTRA_ENABLE_MAD) != 0) { - kgenPrintf( pCtx, - "%s = mad(%s, %s, %s);\n", - dst, - cElem.buf, - pVNames->alpha, - (op == UPRES_SUM) ? tempCElem.buf : "0" ); - } - else { - kgenPrintf( pCtx, - "%s = %s * %s + %s;\n", - dst, - cElem.buf, - pVNames->alpha, - (op == UPRES_SUM) ? tempCElem.buf : "0" ); - } - } - } - - if (flags & UPRES_PRIV_DEST) { - return; - } - - // store--------------------------------------------------------------- - vecLen = umin( physVecLenC, pGSet->tileCY.vecLen ); - vecLen = umin( vecLen, tileLineSegmentLen( &tempCTile ) ); - - iterInit( &blkIter, &tempCTile, vecLen, 0 ); - iterInit( &physIter, &fullCTile, vecLen, 0 ); - - iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec ); - - for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ), - iterIterate( &physIter ) ) { - - emptyKstring( &kstrFirst ); - emptyKstring( &kstrSecond ); - emptyKstring( &kstrThird ); - emptyKstring( &cElem ); - emptyKstring( &tempCElem ); - - sprintfTileElement( &tempCElem, - &tempCTile, - blkIter.row, - blkIter.col, - vecLen); - - ksprintf( &kstrFirst, "%d", physIter.line ); - ksprintf( &kstrSecond, "%s", pVNames->ldc ); - ksprintf( &kstrThird, "%d", blkIter.vec ); - - sprintfFastScalarMad( &expr, - &kstrFirst, - &kstrSecond, - vecLen,//physVecLenC,//scale ldc - &kstrThird); - - kgenPrintf( pCtx, - "%s[%s] = %s;\n", - cPtrName, - expr.buf, - tempCElem.buf ); - - } - } - -} - -int -genUpdateResultSingle( - struct KgenContext *ctx, - const char *dst, - const char *src, - const BlasGenSettings *gset, - UpdateResultOp op, - UpdateResultFlags flags) -{ - char tmp[1024]; - char *p; - const char *opStr; - UpdateResultFlags m; - int r; - bool isComplex = isComplexType(gset->kextra->dtype); - - // copy destination with respective operator and additional operations - if (flags & UPRES_WITH_BETA) { - if (isComplex) { - sprintf(tmp, "%s = %s * betaR + %s.yx * betaI + ", - dst, dst, dst); - } - else { - sprintf(tmp, "%s = %s * beta + ", dst, dst); - } - } - else { - opStr = (op == UPRES_SET) ? "=" : "+="; - sprintf(tmp, "%s %s ", dst, opStr); - } - - m = UPRES_WITH_BETA | UPRES_GENERIC; - if (isComplex && ((flags & m) == m)) { - strcat(tmp, "\n "); - } - p = tmp + strlen(tmp); - - // multiply source - if (flags & UPRES_WITHOUT_ALPHA) { - sprintf(p, "%s;\n", src); - } - else { - if (isComplex) { - sprintf(p, "%s * alphaR + %s.yx * alphaI;\n", src, src); - } - else { - sprintf(p, "%s * alpha;\n", src); - } - } - - r = kgenAddStmt(ctx, tmp); - - return (r) ? -EOVERFLOW : 0; -} - -static void -updateGenericResultGen( - struct KgenContext *ctx, - const BlasGenSettings *gset, - size_t pitch, - UpresVarNames* uvars, - UpdateResultOp op, - UpdateResultFlags flags, - const char *cachedName) -{ - char tmp[1024], dst[128], src[128]; - const char *boundNames[2] = {uvars->nrRows, uvars->nrCols}; - const char *vecType = NULL; - const char *vFieldVectorized; - DataType dtype = gset->kextra->dtype; - unsigned int wvlen; - unsigned int sizes[2]; - const char* vfield = dtypeUPtrField(dtype); - bool tra = ((flags & UPRES_COLUMN_MAJOR) != 0); - bool row = ((flags & UPRES_TAIL_ROW)); - bool col = ((flags & UPRES_TAIL_COL)); - bool iwc = ((flags & UPRES_INDEXING_WITH_CONSTANTS) != 0) || - (gset->tileCY.storType != PRIV_STORAGE_ARRAY); - int l0; - int l1; - bool revert = false; - - Kstring kstr; - int rowId; - int colId; - - sizes[0] = (unsigned int)gset->subdims[1].y; - sizes[1] = (unsigned int)gset->subdims[1].x; - - if (iwc) { - const char* l0var = boundNames[tra]; - revert = (tra && col) || (!tra && row); - - if (revert) { - sprintf(tmp, "uC.%s += (%s-1) * %s;\n", vfield, l0var, uvars->ld); - } - else { - sprintf(tmp, "\n"); - } - kgenAddStmt(ctx, tmp); - - } - wvlen = getTmpVecLen(gset, flags, &vecType); - if (!iwc) { - getVectorTypeName(dtype, wvlen, NULL, &vFieldVectorized); - sprintf(tmp, "res.%s = c;\n", vFieldVectorized); - kgenAddStmt(ctx, tmp); - } - - if (flags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) { - char offStr[64]; - char *p = offStr; - - offStr[0] = '\0'; - if (flags & UPRES_TAIL_ROW) { - sprintf(offStr, " + (%u - %s) * %lu", - sizes[0], uvars->nrRows, pitch); - p += strlen(offStr); - } - if (flags & UPRES_TAIL_COL) { - sprintf(p, " + (%u - %s)", sizes[1], uvars->nrCols); - } - if (iwc) { - sprintf(tmp, "res.%s = uC.%s%s;\n", vfield, vfield, offStr); - sprintf(tmp, "\n"); - } - else { - sprintf(tmp, "res.%s = res.%s%s;\n", vfield, vfield, offStr); - } - kgenAddStmt(ctx, tmp); - - } - if (iwc) { - int l0st = 1; int l0en = sizes[tra]; - int l1st = 1; int l1en = sizes[1-tra]; - - const char* l0var = boundNames[tra]; - const char* l1var = boundNames[1-tra]; - - for (l0 = l0en; l0 >= l0st; l0--) { - - sprintf(tmp, "if (%s) ",l0var); - kgenBeginBranch(ctx, tmp); - - sprintf(tmp, "switch (%s)", l1var); - kgenBeginBranch(ctx, tmp); - - for (l1 = l1en; l1 >= l1st; l1--) { - sprintf(tmp, "case %d:\n", l1); - kgenAddStmt(ctx, tmp); - - if (tra) { - rowId = (row)? (l1en-l1): (l1-l1st); - colId = (col)? (l0-l0st): (l0en-l0); - } - else { - /////////////////////////// - rowId = (row)? (l0-l0st): (l0en-l0); - colId = (col)? (l1en-l1) : (l1-l1st); - } - - if ((tra && row) || (!tra && col)) { - sprintf(dst, "uC.%s[(%s+%d) %% %i]", - vfield, l1var, (l1en - l1), (int)l1en); - } - else { - sprintf(dst, "uC.%s[%d]", vfield, (l1-l1st)); - } - - sprintfTileElement(&kstr, &gset->tileCY, rowId, colId, wvlen); - - if (flags & UPRES_PRIV_DEST) { - genUpdateResultSingle(ctx, kstr.buf, dst, gset, op, flags); - } - else { - genUpdateResultSingle(ctx, dst, kstr.buf, gset, op, flags); - } - } - kgenEndBranch(ctx, NULL); - - if (revert) { - sprintf(tmp, "uC.%s -= %s;\n", vfield, uvars->ld); - } - else { - sprintf(tmp, "uC.%s += %s;\n", vfield, uvars->ld); - } - - kgenAddStmt(ctx, tmp); - - sprintf(tmp, "%s--;\n", l0var); - kgenAddStmt(ctx, tmp); - kgenEndBranch(ctx, NULL); - } - - } - else { - sprintf(tmp, "for (i = 0; i < %s; i++)", boundNames[tra]); - kgenBeginBranch(ctx, tmp); - sprintf(tmp, "for (j = 0; j < %s; j++)", boundNames[1 - tra]); - kgenBeginBranch(ctx, tmp); - sprintf(dst, "uC.%s[i * %s + j]", vfield, uvars->ld); - if (cachedName) { - unsigned int i; - char tmpcachedName[80] = " = "; - strcat(tmpcachedName, cachedName); - for (i = 3; i < strlen(tmpcachedName); i++) { - if (strncmp(tmpcachedName+i, "%u", 2) == 0) { - tmpcachedName[i+1] = 's'; - } - } - sprintf(tmp, tmpcachedName, "i", "[j]"); - strcat(dst, tmp); - } - // result (res) can be transposed independently of the matrix C - // If the transposition of "C" and "result" is not consistent - // then change the calculation of the index for "result" - if (gset->tileCY.trans ^ tra) { - sprintf(src, "res.%s[j * %lu + i]", vfield, pitch); - } - else { - sprintf(src, "res.%s[i * %lu + j]", vfield, pitch); - } - if (flags & UPRES_PRIV_DEST) { - genUpdateResultSingle(ctx, src, dst, gset, op, flags); - } - else { - genUpdateResultSingle(ctx, dst, src, gset, op, flags); - } - kgenEndBranch(ctx, NULL); - kgenEndBranch(ctx, NULL); - } -} - -//----------------------------------------------------------------------------- - -int -updateResultGen( - struct KgenContext *ctx, - const BlasGenSettings *gset, - BlasFunctionID funcID, - UpdateResultOp op, - UpdateResultFlags flags, - const UpresVarNames *uvarNames) -{ - char tmp[1024]; - char *p = tmp; - const char *typeName; - const char *vecType = NULL; - const char *vfield; - const char *suff1; - const char *suff2; - int ret = 0; - unsigned int sizes[2]; - bool generic, tra; - unsigned int wvlen; // length of vectors to copy with - unsigned int uplen; // length of vectors to update result with - size_t pitch; - char LG; - DataType dtype = gset->kextra->dtype; - unsigned int vecLen; - bool isInlined = (flags & UPRES_INLINE); - UpresVarNames uvars; - - vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC : - gset->kextra->vecLen; - sizes[0] = (unsigned int)gset->subdims[1].y; - sizes[1] = (unsigned int)gset->subdims[1].x; - - if (isComplexType(dtype)) { - vecLen = 1; - } - - if ((flags & UPRES_WITH_BETA) && (op != UPRES_SUM)) { - return -EINVAL; - } - - tra = ((flags & UPRES_COLUMN_MAJOR) != 0); - generic = ((flags & UPRES_GENERIC) != 0); - typeName = dtypeBuiltinType(dtype); - vfield = dtypeUPtrField(dtype); - pitch = roundUp(sizes[1], vecLen); - - // select write vectorization - wvlen = getTmpVecLen(gset, flags, &vecType); - uplen = (tra ^ gset->tileCY.trans - || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen; - - suff1 = (generic) ? "Generic" : ""; - suff2 = (flags & UPRES_PRIV_DEST) ? "Rev" : ""; - LG = (flags & UPRES_USE_LDS) ? 'L' : 'G'; - - if (!isInlined) { - const char *outTypeName; - const char *memPref = (flags & UPRES_USE_LDS) ? "__local" : - "__global"; - - getResultGPRsInfo(dtype, NULL, vecLen, NULL, &outTypeName); - - // define the function - sprintf(tmp, "void\n" - "updateResult%s%s%c(\n" - " %s %s *C,\n" - " %s *c,\n" - " %s alpha,\n" - " uint startRow,\n" - " uint startCol,\n" - " uint ld", - suff1, suff2, LG, memPref, typeName, - outTypeName, typeName); - - p += strlen(p); - if (flags & UPRES_WITH_BETA) { - sprintf(p, ",\n %s beta", typeName); - p += strlen(p); - } - if (generic) { - sprintf(p, ",\n uint nrRows,\n" - " uint nrCols"); - } - - uvars.result = "C"; - uvars.ld = "ld"; - uvars.startRow = "startRow"; - uvars.startCol = "startCol"; - uvars.nrRows = "nrRows"; - uvars.nrCols = "nrCols"; - - strcat(p, ")\n"); - kgenDeclareFunction(ctx, tmp); - kgenBeginFuncBody(ctx); - } - else { - memcpy(&uvars, uvarNames, sizeof(uvars)); - } - - // declare local variables - sprintf(tmp, "%cPtr uC;\n", LG); - kgenAddStmt(ctx, tmp); - if (generic) { - kgenAddStmt(ctx, "int i, j;\n" - "PPtr res;\n"); - } - else { - /* - * temporary pointer to pass correctly over the - * destination array since destination rows can be - * not aligned on a vector bound - */ - if (sizes[1 - tra] % wvlen != 0) { - sprintf(tmp, "%cPtr tmpC;\n", LG); - kgenAddStmt(ctx, tmp); - } - if (wvlen > uplen) { - sprintf(tmp, "%s tmp;\n", vecType); - kgenAddStmt(ctx, tmp); - } - } - if (isComplexType(dtype) && !(flags & UPRES_WITHOUT_ALPHA)) { - declareComplexMultParts(ctx, "alpha", typeName); - if (flags & UPRES_WITH_BETA) { - declareComplexMultParts(ctx, "beta", typeName); - } - - } - kgenAddBlankLine(ctx); - - // LD is scaled - if ( gset->flags & BGF_LD_IN_VECTORS ) { - - vecLen = getVecLen(gset, 0, MATRIX_C); - } - else { - - vecLen = 1; - } - - if (tra) { - - if ( vecLen > 1 ) { - - sprintf(tmp, - "uC.%s = %s + (%s * %s + %s)/%d;\n", - vfield, - uvars.result, - uvars.startCol, - uvars.ld, - uvars.startRow, - vecLen); - } - else { - - sprintf(tmp, - "uC.%s = %s + %s * %s + %s;\n", - vfield, - uvars.result, - uvars.startCol, - uvars.ld, - uvars.startRow); - } - } - else { - - if ( vecLen > 1 ) { - - sprintf(tmp, - "uC.%s = %s + (%s * %s + %s)/%d;\n", - vfield, - uvars.result, - uvars.startRow, - uvars.ld, - uvars.startCol, - vecLen); - - } - else { - - sprintf(tmp, - "uC.%s = %s + %s * %s + %s;\n", - vfield, - uvars.result, - uvars.startRow, - uvars.ld, - uvars.startCol); - } - } - kgenAddStmt(ctx, tmp); - - if ((sizes[1 - tra] % wvlen != 0) && !generic) { - kgenAddStmt(ctx, "tmpC = uC;\n"); - } - ret = kgenAddBlankLine(ctx); - - if (generic) { - updateGenericResultGen(ctx, gset, pitch, &uvars, op, flags, - uvarNames ? uvarNames->cachedName : NULL); - } - else { - updateOptimResultGen(ctx, - gset, - funcID, - op, - flags); - } - - if (!isInlined) { - ret = kgenEndFuncBody(ctx); - } - - return (ret) ? -EOVERFLOW : 0; -} - -TailFetch -checkForTailFetches( - BlasFunctionID funcID, - const SubproblemDim *dim, - const CLBLASKernExtra *kextra, - MatrixRole mrole, - bool distVect, - bool lowerTails) -{ - TailFetch ret = FETCH_NO_TAILS; - size_t x; - KernelExtraFlags tailFlag; - unsigned int vecLen; - KernelExtraFlags tailFlagM, tailFlagN, tailFlagK; - - tailFlagM = lowerTails ? KEXTRA_TAILS_M_LOWER : KEXTRA_TAILS_M; - tailFlagN = lowerTails ? KEXTRA_TAILS_N_LOWER : KEXTRA_TAILS_N; - tailFlagK = lowerTails ? KEXTRA_TAILS_K_LOWER : KEXTRA_TAILS_K; - - if (mrole == MATRIX_A) { - x = dim->y; - tailFlag = tailFlagM; - vecLen = (distVect) ? kextra->vecLenA : kextra->vecLen; - } - else { - x = dim->x; - tailFlag = tailFlagN; - vecLen = (distVect) ? kextra->vecLenB : kextra->vecLen; - } - - if (isMatrixAccessColMaj(funcID, kextra->flags, mrole)) { - if ((kextra->flags & tailFlag) && (x != vecLen)) { - ret |= FETCH_TAIL_COL; - } - if (kextra->flags & tailFlagK) { - ret |= FETCH_TAIL_ROW; - } - } - else if (kextra->flags & tailFlagK) { - ret |= FETCH_TAIL_COL; - } - - return ret; -} - -bool -isNeedZeroTileTail( - BlasFunctionID funcID, - const SubproblemDim *dim, - const CLBLASKernExtra *kextra, - MatrixRole mrole, - bool distVect) -{ - bool trans; - TailFetch tf; - - trans = isMatrixAccessColMaj(funcID, kextra->flags, mrole); - tf = checkForTailFetches(funcID, dim, kextra, mrole, distVect, true); - - return (trans && (tf & FETCH_TAIL_ROW)) || - (!trans && (tf & FETCH_TAIL_COL)); -} - -TailStatus -checkGenAdjustTailCoords( - struct KgenContext *ctx, - BlasFunctionID funcID, - const BlasGenSettings *gset, - int *error) -{ - char tmp[1024]; - const SubproblemDim *dim = &gset->subdims[1]; - const KernelVarNames *varNames = &gset->varNames; - KernelExtraFlags kflags = gset->kextra->flags; - TailStatus status = 0; - int err = 0; - int n = 0; - - if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_A) && - (kflags & KEXTRA_TAILS_M_LOWER)) { - - status |= TAIL_A_RAISED; - sprintf(tmp, "if (%s + %lu > %s) {\n" - " %s -= %lu - %s %% %lu;\n" - "}\n", - varNames->coordA, dim->y, varNames->sizeM, - varNames->coordA, dim->y, varNames->sizeM, - dim->y); - if (ctx != NULL) { - err = kgenAddStmt(ctx, tmp); - n++; - } - } - - if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_B) && - (kflags & KEXTRA_TAILS_N_LOWER) && !err) { - - status |= TAIL_B_RAISED; - sprintf(tmp, "if (%s + %lu > %s) {\n" - " %s -= %lu - %s %% %lu;\n" - "}\n", - varNames->coordB, dim->x, varNames->sizeN, - varNames->coordB, dim->x, varNames->sizeN, - dim->x); - if (ctx != NULL) { - err = kgenAddStmt(ctx, tmp); - n++; - } - } - - if (n && !err) { - err = kgenAddBlankLine(ctx); - } - - if (error != NULL) { - *error = err; - } - - return status; -} - -int -checkGenRestoreTailCoords( - struct KgenContext *ctx, - const BlasGenSettings *gset, - TailStatus status) -{ - char tmp[1024]; - const SubproblemDim *dim = &gset->subdims[1]; - const KernelVarNames *varNames = &gset->varNames; - int ret = 0; - int n = 0; - - if (status & TAIL_A_RAISED) { - sprintf(tmp, "if ((%s + %lu == %s) && (%s %% %lu)) {\n" - " %s += %lu - %s %% %lu;\n" - "}\n", - varNames->coordA, dim->y, varNames->sizeM, - varNames->sizeM, dim->y, varNames->coordA, - dim->y, varNames->sizeM, dim->y); - ret = kgenAddStmt(ctx, tmp); - n++; - } - - if ((status & TAIL_B_RAISED) && !ret) { - - sprintf(tmp, "if ((%s + %lu == %s) && (%s %% %lu)) {\n" - " %s += %lu - %s %% %lu;\n" - "}\n", - varNames->coordB, dim->x, varNames->sizeN, - varNames->sizeN, dim->x, varNames->coordB, - dim->x, varNames->sizeN, dim->x); - kgenAddStmt(ctx, tmp); - n++; - } - - if (n) { - ret = kgenAddBlankLine(ctx); - } - - return (ret) ? -EOVERFLOW : 0; -} - -UpdateResultFlags -tailStatusToUpresFlags(TailStatus status) -{ - UpdateResultFlags flags = 0; - - if (status & TAIL_A_RAISED) { - flags |= UPRES_TAIL_ROW; - } - if (status & TAIL_B_RAISED) { - flags |= UPRES_TAIL_COL; - } - - return flags; -} - -int -declareComplexMultParts( - struct KgenContext *ctx, - const char *baseName, - const char *typeName) -{ - char tmp[1024]; - int r; - - sprintf(tmp, "%s %sR = (%s)(%s.x);\n" - "%s %sI = (%s)(-%s.y, %s.y);\n", - typeName, baseName, typeName, baseName, - typeName, baseName, typeName, baseName, baseName); - r = kgenAddStmt(ctx, tmp); - - return (r) ? -EOVERFLOW : 0; -} - -void -sprintfFastScalarMad( - Kstring *expr, - const Kstring *first, - const Kstring *second, - unsigned int scale, - const Kstring *third) -{ - unsigned int u1 = 0, u2 = 0, u3 = 0; - bool isNum1, isNum2, isNum3; - int shift; - bool done = false; - const char *thirdStr; - const char *suff3; - - // clear up what are these arguments - if (isKstringEmpty(first)) { - isNum1 = true; - } - else { - isNum1 = !stringToInt(first->buf, &u1); - } - - if (isKstringEmpty(second)) { - isNum2 = true; - } - else { - isNum2 = !stringToInt(second->buf, &u2); - } - - if (!scale) { - scale = 1; - } - - if ((third == NULL) || isKstringEmpty(third)) { - thirdStr = "0"; - isNum3 = true; - } - else { - thirdStr = third->buf; - isNum3 = !stringToInt(thirdStr, &u3); - } - suff3 = (isNum3) ? "u" : ""; - - // singular case at which only the third component can contribute - if ( (isNum1 && (u1 == 0)) || - (isNum2 && (u2 /scale == 0))) { - - kstrcpy(expr, thirdStr); - return; - } - - if (isNum1 && isNum2) { - if (isNum3) { - ksprintf(expr, "%u", u1 * u2 / scale + u3); - } - else { - ksprintf(expr, "%u + %s", u1 * u2 / scale, thirdStr); - } - done = true; - } - else if (isNum1) { - /* - * If the third argument is not used, then try to build the expression - * using only shifts if 'scale' and the 'second argument' are both of - * power of 2. Otherwise use mad24. - */ - if (isRoundedPow2(u1) && isRoundedPow2(scale)) { - shift = findHighestSetBit(scale) - findHighestSetBit(u1); - if (isNum3 && (u3 == 0)) { - if (shift < 0) { - ksprintf(expr, "(%s << %d)", second->buf, -shift); - } - else if (shift > 0) { - ksprintf(expr, "(%s >> %d)", second->buf, shift); - } - else { - kstrcpy(expr, second->buf); - } - } - else if (shift > 0) { - ksprintf(expr, "(%s >> %d) + %s", - second->buf, shift, thirdStr); - } - else if (shift == 0) { - ksprintf(expr, "%s + %s", second->buf, thirdStr); - } - else { - ksprintf(expr, "mad24(%uu, %s, %s%s)", - 1u << -shift, second->buf, thirdStr, suff3); - } - done = true; - } - } - - if (!done) { - /* - * Append unsiged suffixes to avoid cases at which one - * operand is signed and the other is unsigned. Typically, - * OpenCL compilers are strict and reject such expressions. - */ - if (isNum2) { - if (u2 / scale == 1) { - if (isNum3 && (u3 == 0)) { - kstrcpy(expr, first->buf); - } - else { - ksprintf(expr, "%s + %s", first->buf, thirdStr); - } - } - else { - ksprintf(expr, "mad24(%s, %uu, %s%s)", - first->buf, u2 / scale, thirdStr, suff3); - } - } - else { - const char *suff1 = (isNum1) ? "u" : ""; - Kstring tmp; - const char *p = NULL; - - if (scale == 1) { - p = second->buf; - } - else { - p = tmp.buf; - if (isRoundedPow2(scale)) { - shift = findHighestSetBit(scale); - ksprintf(&tmp, "(%s >> %d)", second->buf, shift); - } - else { - ksprintf(&tmp, "%s / %d", second->buf, scale); - } - } - - ksprintf(expr, "mad24(%s%s, %s, %s%s)", - first->buf, suff1, p, thirdStr, suff3); - } - } -} |