summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/trsm.c
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/trsm.c')
-rw-r--r--external/clBLAS/src/library/blas/gens/trsm.c1649
1 files changed, 0 insertions, 1649 deletions
diff --git a/external/clBLAS/src/library/blas/gens/trsm.c b/external/clBLAS/src/library/blas/gens/trsm.c
deleted file mode 100644
index a5f4d88a..00000000
--- a/external/clBLAS/src/library/blas/gens/trsm.c
+++ /dev/null
@@ -1,1649 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * TRSM generator with support of cached reads from the global memory
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <clblas_stddef.h>
-#include <clBLAS.h>
-#include <blas_mempat.h>
-#include <clkern.h>
-#include <clblas-internal.h>
-#include <matrix_props.h>
-#include <matrix_dims.h>
-
-#include "dblock_kgen.h"
-#include "kerngen.h"
-#include "blas_kgen.h"
-#include "gen_helper.h"
-#include "trxm_common.h"
-#include "trsm_kgen.h"
-#include "legacy/blas_kgen_legacy.h"
-
-typedef enum LdsUseFlags {
- LDS_NO_USE = 0,
- LDS_USE_LARGE = 0x1,
- LDS_USE_DIAGONAL = 0x2
-} LdsUseFlags;
-
-typedef struct TrsmExtraParams {
- int unrollingFactor;
- unsigned int unrolledTail;
- LdsUseFlags ldsUse;
-} TrsmExtraParams;
-
-enum TrsmStage {
- BLOCK_UPDATE,
- TILE_UPDATE
-};
-
-static CLBLASMpatExtra mpatExtra;
-
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra);
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs);
-
-static SolverFlags
-solverFlags(void);
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra);
-
-static void
-fixupArgs(void *args, SubproblemDim *subdims, void *extra);
-
-static bool
-checkCalcDecompDedicated(
- PGranularity *pgran,
- SubproblemDim *subdims,
- unsigned int subdimsNum,
- DataType dtype,
- int check);
-
-#if 0
-static int
-getDefaultDecomp(
- PGranularity *pgran,
- SubproblemDim *subdims,
- unsigned int subdimsNum,
- void * pArgs);
-#endif
-
-static SolverOps trsmSops = {
- generator,
- assignKargs,
- isFitToLDS,
- NULL,
- NULL,
- NULL,
- NULL,
- solverFlags,
- fixupArgs,
- NULL,//getDefaultDecomp
- checkCalcDecompDedicated,
- NULL,
- NULL
-};
-
-// The struct for storage tails
-typedef struct TileSet
-{
- Tile rectA; // The rectangular tile A for the update loop at stage 1
- Tile squareA; // The square tile for the stage 2
- Tile origB; // The rectangular tile B for the update loop at the stage 1
- Tile bStage2; // The rectangular tile B for the update loop at thestage 2
- Tile bAsSqA; // Descriptor for holding square tile A in the storage of B
- Tile bAsC; // Descriptor for holding tile C in the storage of B
- // the entire tile A matching the storage declared in the kernel
- Tile A;
- // the entire tile B matching the storage declared in the kernel
- Tile B;
-} TileSet;
-
-
-static bool
-useSkewedFetchB(const BlasGenSettings *gset)
-{
- KernelExtraFlags kflags = gset->kextra->flags;
- TrsmExtraParams *extraParams = (TrsmExtraParams*)gset->kextra->solverPriv;
- bool ret = false;
-
- if (extraParams->ldsUse & LDS_USE_LARGE) {
- ret = !isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
- }
-
- return ret;
-}
-
-static void
-restoreTile(Tile* dst, const Tile* src)
-{
- dst->baseName = src->baseName;
- dst->vecLen = src->vecLen;
- dst->storType = src->storType;
-}
-
-static Tile
-substituteTile(Tile* dst, const Tile* src)
-{
- Tile tmp;
-
- restoreTile(&tmp, dst);
- restoreTile(dst, src);
-
- return tmp;
-}
-
-static void
-sprintfInvertedElement(
- Kstring *elem,
- const Tile *tile,
- unsigned int row,
- unsigned int col,
- unsigned int len,
- bool isU)
-{
- if (isU) {
- row = tile->nrRows - row - 1;
- col = tile->nrCols - col - len;
- }
-
- sprintfTileElement(elem, tile, row, col, len);
-}
-
-static void
-genTileInverting(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- const TileSet *tileSet)
-{
- char tmp[1024];
- const CLBLASKernExtra *kextra = gset->kextra;
- KernelExtraFlags kflags = kextra->flags;
- DataType dtype = kextra->dtype;
- const SubproblemDim *dim = &gset->subdims[1];
- unsigned int accLen;
- unsigned int i, j, k;
- Tile srcTile;
- Tile dstTile;
- bool isU, isComplex;
- bool isInlined = gset->flags & BGF_EXPLICIT_INLINE;
- const char* typeNameA;
- const char* typeNameB;
-
- memcpy(&srcTile, &tileSet->bAsSqA, sizeof(srcTile));
- memcpy(&dstTile, &tileSet->squareA, sizeof(dstTile));
-
- getVectorTypeName(kextra->dtype, dstTile.vecLen, &typeNameA, NULL);
- getVectorTypeName(kextra->dtype, srcTile.vecLen, &typeNameB, NULL);
- isU = isMatrixUpper(kflags);
- isComplex = isComplexType(dtype);
-
- if (isComplex || dstTile.trans) {
- accLen = 1;
- }
- else {
- accLen = umin(srcTile.vecLen, dstTile.vecLen);
- accLen = umin(accLen, srcTile.nrCols);
- }
-
- if (!isInlined) {
- dstTile.baseName = "a";
- srcTile.baseName = "b";
- sprintf(tmp, "void\n"
- "invertTile(%s *a, %s *b)\n",
- typeNameA, typeNameB);
- kgenDeclareFunction(ctx, tmp);
- kgenBeginFuncBody(ctx);
- }
- else {
- kgenAddStmt(ctx, "// Invert tile\n");
- }
-
- // made destination block unit
- genZeroTile(ctx, &dstTile);
- for (i = 0; i < dim->y; i++) {
- genSetUnitInTile(ctx, &dstTile, i, i);
- }
- kgenAddBlankLine(ctx);
-
- for (i = 0; i < dim->y; i++) {
- Kstring src, srcDiag, dst, dstLast;
-
- // current source diagonal element
- sprintfInvertedElement(&srcDiag, &srcTile, i, i, 1, isU);
- for (j = i; j < dim->y; j++) {
- // current source non diagonal element
- if (i) {
- sprintfInvertedElement(&src, &srcTile, j, i - 1, 1, isU);
- }
-
- for (k = 0; k < dim->y; k += accLen) {
- // current updated vectorized element
- sprintfInvertedElement(&dst, &dstTile, j, k, accLen, isU);
-
- // update
- if (i) {
- // last updated vectorized element
- sprintfInvertedElement(&dstLast, &dstTile, i - 1, k,
- accLen, isU);
- if (isComplex) {
- sprintf(tmp, "%s -= mul(%s, %s);\n",
- dst.buf, dstLast.buf, src.buf);
- }
- else {
- sprintf(tmp, "%s -= %s * %s;\n",
- dst.buf, dstLast.buf, src.buf);
- }
- kgenAddStmt(ctx, tmp);
- }
-
- // divide on the diagonal element
- if (j == i) {
- if (isComplex) {
- sprintf(tmp, "%s = div(%s, %s);\n",
- dst.buf, dst.buf, srcDiag.buf);
- }
- else {
- sprintf(tmp, "%s /= %s;\n", dst.buf, srcDiag.buf);
- }
- kgenAddStmt(ctx, tmp);
- }
- }
- }
- if (i != dim->y - 1) {
- kgenAddBlankLine(ctx);
- }
- }
-
- if (!isInlined) {
- kgenEndFuncBody(ctx);
- }
- kgenAddBlankLine(ctx);
-
-}
-
-static void
-declareLocalVariables(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- Tile* parTile,
- TrsmExtraParams * extraParams)
-{
- char tmp[1024];
- const SubproblemDim *dims = gset->subdims;
- const char* parTileTypeName = NULL;
- bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags,
- MATRIX_B);
- unsigned int locWidth;
- unsigned int tsize;
- unsigned int parTileSize;
- unsigned int l1Pans;
- unsigned int step;
-
- kgenAddStmt(ctx,
- "const int lid = get_local_id(0);\n"
- "const int gid = get_group_id(0);\n"
- "GPtr uA, uB;\n"
- "uint coordA, coordB;\n"
- "uint m0 = 0, k0, m1;\n");
-
- if (isMatrixUpper(gset->kextra->flags)) {
- sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n",
- dims[0].y, dims[0].y);
- kgenAddStmt(ctx, tmp);
- }
-
- /*
- * Declare private blocks.
- * The region 'b' stores in different time tiles of both
- * the input matrices and the result
- */
-
- declareTileStorages(ctx, gset);
-
- *parTile = gset->tileBX;
-
- if (extraParams->ldsUse) {
- tsize = dtypeSize(gset->kextra->dtype);
- l1Pans = (unsigned int)(dims[0].x / dims[1].x);
-
- parTile->vecLen = (trb) ? (unsigned int)dims[1].x
- : (unsigned int)dims[1].bwidth;
- parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize);
- parTile->trans = trb;
-
- /*
- * Allocate enough space in the local area to fit several tiles
- * at the stage1 (according to the unrolled factor) and one tile
- * at the stage2
- */
-
- locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor;
- if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
- locWidth = umax(locWidth, (unsigned int)dims[1].y);
- }
- if (trb) {
- parTile->nrRows = locWidth;
- parTile->nrCols = (unsigned int)dims[0].x;
- step = (unsigned int)dims[1].x / parTile->vecLen;
- }
- else {
- parTile->nrRows = (unsigned int)dims[0].x;
- parTile->nrCols = locWidth;
- step = (unsigned int)dims[1].x * locWidth / parTile->vecLen;
- }
-
- parTileSize = tileVectorsNum(parTile);
-
- getVectorTypeName(gset->kextra->dtype, parTile->vecLen,
- &parTileTypeName, NULL);
-
- sprintf(tmp, "__local %s tmpB[%i];\n"
- "LPtr lB;\n"
- "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n",
- parTileTypeName, parTileSize, l1Pans, step);
- kgenAddStmt(ctx, tmp);
-
- if (useSkewedFetchB(gset)) {
- kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n",
- l1Pans, gset->subdims[1].x);
- }
- }
-
- kgenAddBlankLine(ctx);
-}
-
-/*
- * Generate cyclical tile shifting so as to convert the skewed
- * storing to "one-to-one", i. e. the first element in the tile
- * matches to the first element of the respective tile in the
- * output matrix.
- */
-static void
-genTileCyclicalShift(struct KgenContext *ctx, BlasGenSettings *gset)
-{
- const char *tname;
- Kstring k1, k2, *src, *dst, *ktmp;
- unsigned int row, col;
- unsigned int seglen;
- Tile *tileC = &gset->tileCY;
-
- seglen = tileLineSegmentLen(tileC);
- getVectorTypeName(gset->kextra->dtype, seglen, &tname, NULL);
-
- kgenAddStmt(ctx, "\n// deliver from skewing in the result\n");
- kgenBeginBranch(ctx, "for (uint i = 0; i < skewX; i++)");
- kgenPrintf(ctx, "%s tmp;\n\n", tname);
-
- src = &k1;
- dst = &k2;
-
- // Skewing may be used only in case of transposed C
- for (row = 0; row < tileC->nrRows; row += seglen) {
- sprintfTileElement(dst, tileC, row, tileC->nrCols - 1, seglen);
- kgenPrintf(ctx, "tmp = %s;\n", dst->buf);
- for (col = tileC->nrCols - 1; col > 0; col--) {
- sprintfTileElement(src, tileC, row, col - 1, seglen);
- kgenPrintf(ctx, "%s = %s;\n", dst->buf, src->buf);
- // swap pointer
- ktmp = src;
- src = dst;
- dst = ktmp;
- }
- kgenPrintf(ctx, "%s = tmp;\n", dst->buf);
- }
-
- kgenEndBranch(ctx, NULL);
- kgenAddBlankLine(ctx);
-}
-
-/*
- * Setup coordinates before beginning a trsm stage
- * A caller must ensure the strict stage sequence:
- * BLOCK_UPDATE -> TILE_UPDATE
- */
-static void
-genSetupCoords(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- enum TrsmStage stage)
-{
- char tmp[1024];
- KernelExtraFlags kflags = gset->kextra->flags;
- const SubproblemDim *dims = gset->subdims;
- unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
- const char *s;
-
- s = isMatrixUpper(kflags) ? "currM" : "m0";
- sprintf(tmp, "coordA = %s + (lid / %u * %lu);\n",
- s, l1Pans, dims[1].y);
- kgenAddStmt(ctx, tmp);
-
- switch (stage) {
- case BLOCK_UPDATE:
- if (isMatrixUpper(kflags)) {
- sprintf(tmp, "k0 = currM + %lu;\n", dims[0].y);
- }
- else {
- sprintf(tmp, "k0 = 0;\n");
- }
- break;
- case TILE_UPDATE:
- if (isMatrixUpper(kflags)) {
- sprintf(tmp, "k0 = currM + %lu - m1 * %lu;\n",
- dims[0].y - dims[1].y, dims[1].y);
- }
- else {
- sprintf(tmp, "k0 = m0 + m1 * %lu;\n", dims[1].y);
- }
- break;
- }
-
- kgenAddStmt(ctx, tmp);
-
- sprintf(tmp, "coordB = gid * %lu + (lid %% %u * %lu);\n",
- dims[0].x, l1Pans, dims[1].x);
-
- kgenAddStmt(ctx, tmp);
- kgenAddBlankLine(ctx);
-}
-
-// Generate control block of the loop over K
-static void
-genInternalLoopCtl(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- KernelExtraFlags kflags,
- size_t stepK,
- size_t boundAlign)
-{
- char tmp[1024];
-
- if (isMatrixUpper(kflags)) {
- if (kflags & KEXTRA_TAILS_M) {
- sprintf(tmp, "for (k0 = currM + %lu; k0 < M / %lu * %lu; "
- "k0 += %lu)",
- dim[0].y, boundAlign, boundAlign, stepK);
- }
- else {
- sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)",
- dim[0].y, stepK);
- }
- }
- else {
- sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)",
- stepK);
- }
-
- kgenBeginBranch(ctx, tmp);
-}
-
-static void
-initKernelVarNames(KernelVarNames *kvars)
-{
- kvars->A = "uA";
- kvars->B = "uB";
- kvars->C = "B";
- kvars->coordA = "coordA";
- kvars->coordB = "coordB";
- kvars->k = "k0";
- kvars->sizeM = "M";
- kvars->sizeN = "N";
- kvars->sizeK = "M";
- kvars->lda = "lda";
- kvars->ldb = "ldb";
- kvars->ldc = "ldb";
- kvars->alpha = "alpha";
- kvars->beta = "beta";
-}
-
-static void
-setFetchHandler(
- TileMulOpts *mulOpts,
- const BlasGenSettings *gset,
- int handler(struct KgenContext *ctx, MatrixRole mrole, void *priv),
- TilePostFetchPrivate *priv)
-{
- int i, nrPrivs;
- const char *regName = NULL;
-
- if (handler == defaultTilePostFetch) {
- nrPrivs = 1;
- }
- else {
- nrPrivs = 2;
- regName = "b";
- }
-
- for (i = 0; i < nrPrivs; i++) {
- priv[i].fetchNumA = 0;
- priv[i].wholeA = 1;
- priv[i].funcID = CLBLAS_TRSM;
- priv[i].gset = gset;
- priv[i].regName = regName;
- mulOpts->postFetch = handler;
- mulOpts->postFetchPriv = priv;
- }
-}
-
-static void
-genCheckShiftTailB(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- int adjustRestore,
- TailStatus *tailStatus)
-{
- BlasGenSettings gsetNew;
- CLBLASKernExtra kextraNew;
-
- memcpy(&gsetNew, gset, sizeof(gsetNew));
- memcpy(&kextraNew, gset->kextra, sizeof(kextraNew));
- // avoid tail shift for the matrix A
- kextraNew.flags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER);
- gsetNew.kextra = &kextraNew;
-
- if (adjustRestore) {
- checkGenRestoreTailCoords(ctx, &gsetNew, *tailStatus);
- }
- else {
- *tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRSM, &gsetNew,
- NULL);
- }
-}
-
-static void
-sprintfHitMatrixCond(
- char *buf,
- MatrixRole mrole,
- const char *prefix,
- const char *suffix)
-{
- const char *coordName;
- char bound;
-
- coordName = (mrole == MATRIX_A) ? "coordA" : "coordB";
- bound = (mrole == MATRIX_A) ? 'M' : 'N';
- if (suffix == NULL) {
- suffix = "";
- }
- sprintf(buf, "%s%s < %c%s", prefix, coordName, bound, suffix);
-}
-
-/*
- * 'mulUpd' arguments mean what action is being done: multiplication on
- * an inverted tile or subsequent update
- */
-static void
-sprintfStage2Condition(
- char *buf,
- const BlasGenSettings *gset,
- int mulUpd)
-{
- KernelExtraFlags kflags = gset->kextra->flags;
- char hitCond[1024];
- char *p;
- unsigned int xPans, yPans;
-
-
- hitCond[0] = '\0';
- xPans = (unsigned int)(gset->subdims[0].x / gset->subdims[1].x);
- yPans = (unsigned int)(gset->subdims[0].y / gset->subdims[1].y);
- if (kflags & KEXTRA_TAILS_M) {
- sprintfHitMatrixCond(hitCond, MATRIX_A, " && ", NULL);
- }
- p = hitCond + strlen(hitCond);
- if (kflags & KEXTRA_TAILS_N) {
- sprintfHitMatrixCond(p, MATRIX_B, " && ", NULL);
- }
-
- if (!mulUpd) {
- if (isMatrixUpper(kflags)) {
- sprintf(buf, "if (lid / %u + m1 == %u%s)",
- xPans, yPans - 1, hitCond);
- }
- else {
- sprintf(buf, "if (lid / %u == m1%s)", xPans, hitCond);
- }
- }
- else {
- if (isMatrixUpper(kflags)) {
- sprintf(buf, "if (lid / %u + m1 < %u%s)",
- xPans, yPans - 1, hitCond);
- }
- else {
- sprintf(buf, "if (lid / %u > m1%s)", xPans, hitCond);
- }
- }
-}
-
-static void
-genZeroTileTrash(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- MatrixRole mrole,
- Tile* tile)
-{
- char tmp[1024];
- const SubproblemDim *dim = &gset->subdims[1];
- const CLBLASKernExtra *kextra = gset->kextra;
- unsigned int i, j;
- unsigned int step;
- Kstring elem;
-
- if (mrole == MATRIX_A) {
- kgenAddBlankLine(ctx);
- }
- else {
- kgenBeginBranch(ctx, NULL);
- }
-
- sprintf(tmp, "const int bound = (coordA + %lu > M) ? (M - coordA) : %lu;\n",
- dim->y, dim->y);
- kgenAddStmt(ctx, tmp);
-
- step = tileLineSegmentLen(tile);
- step = (tile->trans) ? 1 : step;
-
- for (j = 0; j < tile->nrRows; ++j) {
- for (i = 0; i < tile->nrCols; i+=step) {
- sprintfTileElement(&elem, tile, j, i, step);
- sprintf(tmp, "%s = (bound <= %u) ? 0 : %s;\n", elem.buf, j, elem.buf);
- kgenAddStmt(ctx, tmp);
- }
- }
-
- // Set units in the trash diagonal elements for a tile of A
- if (mrole == MATRIX_A) {
- for (i = 0; i < (unsigned int)dim->y; i++) {
- sprintfTileElement(&elem, tile, i, i, 1);
- sprintf(tmp, "%s = (bound <= %d) ? %s : %s;\n",
- elem.buf, (int)i, strOne(kextra->dtype), elem.buf);
- kgenAddStmt(ctx, tmp);
- }
- }
-
- if (mrole == MATRIX_A) {
- kgenAddBlankLine(ctx);
- }
- else {
- kgenEndBranch(ctx, NULL);
- }
-}
-
-/*
- * NOTE: Before invoking this function 'tileA' must be initialized accordingly
- * so as it stores a square tile of the matrix A.
- */
-static void
-genMulOnDiagonalTile(
- struct KgenContext *ctx,
- BlasGenSettings *gset,
- TileSet *tileSet,
- const TileMulOpts *mulOpts)
-{
- char tmp[1024];
- FetchOpts fetchOpts;
- const SubproblemDim *dim = &gset->subdims[1];
- TilePostFetchPrivate pfPriv[2];
- TileMulOpts optsNew;
- const CLBLASKernExtra *extra = gset->kextra;
- CLBLASKernExtra extraNew;
- KernelExtraFlags kflags = extra->flags;
- Tile t;
- bool isTail;
-
- memset(&fetchOpts, 0, sizeof(fetchOpts));
- fetchOpts.regName = "b";
- fetchOpts.mrole = MATRIX_A;
- fetchOpts.lineOffset = 0;
- fetchOpts.linesNum = (unsigned int)dim->y;
-
- // setup options to multiply on the inverted tile
- memcpy(&optsNew, mulOpts, sizeof(TileMulOpts));
- optsNew.flags &= ~TILEMUL_TRB;
-
- kgenAddStmt(ctx, "// Fetch and invert the square tile located on the "
- "diagonal\n");
-
- // The matrix B play the role of A
- t = substituteTile(&gset->tileA, &tileSet->bAsSqA);
-
- isTail = ((kflags & KEXTRA_TAILS_M) != 0);
- genFetchInputTile(ctx, mulOpts->fctx, gset, &fetchOpts);
- setFetchHandler(&optsNew, gset, genTrxmPostFetchZero, pfPriv);
-
- /*
- * There is no needs in zeroing tail along K in case of the lower
- * triangular matrix because it is in the "other" triangle which is
- * never accessed
- */
- if (isTail && !isMatrixUpper(kflags)) {
- memcpy(&extraNew, extra, sizeof(extraNew));
- extraNew.flags &= ~KEXTRA_TAILS_K_LOWER;
- gset->kextra = &extraNew;
- }
- genTrxmPostFetchZero(ctx, MATRIX_A, pfPriv);
-
- /*
- * One must zero the tail part of a fetched square tile
- * in order to avoid influence of the trailing trash on the resulting
- * inverted tile (evaluating proceeds from the bottom towards the top
- * of the tile)
- */
- if (isTail) {
- genZeroTileTrash(ctx, gset, MATRIX_A, &gset->tileA);
- }
-
- restoreTile(&gset->tileA, &t);
-
- if(gset->flags & BGF_EXPLICIT_INLINE) {
- genTileInverting(ctx, gset, tileSet);
- }
- else {
- sprintf(tmp, "invertTile(%s, %s);\n\n",
- tileSet->squareA.baseName, tileSet->bAsSqA.baseName);
- kgenAddStmt(ctx, tmp);
- }
-
- gset->tileBX = tileSet->bAsC;
- genTileCopy(ctx, &gset->tileBX, &gset->tileCY, TILECOPY_ASSIGN);
-
- /*
- * For the lower diagonal not integrally decomposed matrix A
- * it's enough to zero the tail part of the result in order to
- * clear trash accumulated over the update loop
- */
- if (isTail && !isMatrixUpper(kflags)) {
- genZeroTileTrash(ctx, gset, MATRIX_B, &gset->tileBX);
- }
-
- genZeroTile(ctx, &gset->tileCY);
-
- genMulTiles(ctx, gset, &optsNew);
- kgenAddBlankLine(ctx);
-
- // restore original extra
- gset->kextra = extra;
-}
-
-static void
-genUpdateIntermResult(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- bool withMhitCond,
- UpdateResultFlags flags)
-{
- char tmp[1024];
- const char *coordY, *coordX;
- char *revAlp, *alp;
- DataType dtype = gset->kextra->dtype;
- KernelExtraFlags kflags = gset->kextra->flags;
- const SubproblemDim *dim = &gset->subdims[1];
- const KernelVarNames *kvarNames = &gset->varNames;
- UpdateResultOp op;
- UpresVarNames uvars;
- const char* ctype;
-
- memset(&uvars, 0, sizeof(uvars));
-
- op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;
-
- uvars.startRow = kvarNames->coordA;
- uvars.startCol = kvarNames->coordB;
- uvars.nrRows = "y";
- uvars.nrCols = "x";
- uvars.result = "B";
- uvars.ld = "ldb";
-
- ctype = dtypeBuiltinType(dtype);
- if (isComplexType(dtype)) {
- if (dtype == TYPE_COMPLEX_FLOAT) {
- revAlp = "div((float2)(-1.f, 0), alpha)";
- alp = "(float2)(1.f, 0)";
- }
- else {
- revAlp = "div((double2)(-1., 0), alpha)";
- alp = "(double2)(1., 0)";
- }
- }
- else {
- revAlp = "-1. / alpha";
- alp = "1.";
- }
-
- // inline result update
- flags |= UPRES_INLINE;
-
- coordY = kvarNames->coordA;
- coordX = kvarNames->coordB;
-
- /*
- * We should be careful here.
- *
- * The non tailed case of updateResult() is rewritted.
- * Now update result for tailed and non tailed cases have a bit
- * different semantics.
- *
- * The first one produces expressions like
- * 'dst = dst * beta + src * alpha'.
- *
- * Here 'dst' and 'src' may be private result stored in registers or
- * result to be updated in the global memory. Let the first one to be
- * designated as tileC and the second one as matC.
- *
- * The non tailed case produces expressions like
- * 'dst = matC * beta + tileC * alpha'.
- *
- * The second variant is more clear and native for the new implementation.
- * But as the difference is not eliminated, both the variants are
- * maintained here.
- */
-
- if (!(kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N))) {
- kgenBeginBranch(ctx, "");
-
- sprintf(tmp, "%s %s = %s;\n"
- "%s alpha = beta;\n",
- ctype, "beta", revAlp, ctype);
- kgenAddStmt(ctx, tmp);
-
- updateResultGen(ctx,
- gset,
- CLBLAS_TRSM,
- op,
- flags & ~UPRES_WITH_BETA,
- &uvars);
-
- kgenEndBranch(ctx, NULL);
- }
- else {
- if (withMhitCond) {
- sprintf(tmp, "if ((%s < %s) && (%s < %s))",
- coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
- kgenBeginBranch(ctx, tmp);
- }
- else {
- /* for x, y variables scope */
- kgenBeginBranch(ctx, NULL);
- }
-
- sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
- "uint x = min(%luu, %s - (uint)%s);\n",
- dim->y, kvarNames->sizeM, coordY,
- dim->x, kvarNames->sizeN, coordX);
- kgenAddStmt(ctx, tmp);
-
- sprintf(tmp, "if ((y == %lu) && (x == %lu))",
- dim->y, dim->x);
- kgenBeginBranch(ctx, tmp);
-
- sprintf(tmp, "%s %s = %s;\n"
- "%s alpha = beta;\n",
- ctype, "beta", revAlp, ctype);
- kgenAddStmt(ctx, tmp);
-
- // optimized update
- updateResultGen(ctx,
- gset,
- CLBLAS_TRSM,
- op,
- flags & ~UPRES_WITH_BETA,
- &uvars);
-
- kgenEndBranch(ctx, NULL);
-
- flags |= UPRES_GENERIC;
- kgenBeginBranch(ctx, "else ");
-
- sprintf(tmp, "%s %s = %s;\n"
- "%s %s = %s;\n",
- ctype, "beta", revAlp,
- ctype, "alpha", alp);
- kgenAddStmt(ctx, tmp);
-
- // not optimized update
- updateResultGen(ctx,
- gset,
- CLBLAS_TRSM,
- op,
- flags,
- &uvars);
-
- kgenEndBranch(ctx, NULL);
- kgenEndBranch(ctx, NULL);
- }
-}
-
-static void
-genPreloadedTileMul(
- struct KgenContext *ctx,
- BlasGenSettings *gset,
- TileMulOpts *mulOpts,
- const Tile *parTile,
- const char* copy2LDSFuncName)
-{
- char tmp[1024];
- KernelExtraFlags kflags = gset->kextra->flags;
- unsigned int bwidthOld;
- const char *oldNameB;
- const char *ptrName;
-
- getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName);
- kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
- if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
- sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n",
- copy2LDSFuncName, gset->subdims[0].x);
- }
- else {
- sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n",
- copy2LDSFuncName, gset->subdims[0].x);
- }
- kgenAddStmt(ctx, tmp);
-
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenAddBlankLine(ctx);
-
- kgenAddStmt(ctx, "lB = lBMain;\n\n");
-
- mulOpts->memB = CLMEM_LOCAL_MEMORY;
- oldNameB = gset->varNames.B;
- bwidthOld = (unsigned int)gset->subdims[0].bwidth;
- gset->varNames.B = "lB";
- gset->subdims[0].bwidth = (parTile->trans) ? parTile->nrRows :
- parTile->nrCols;
-
- tileMulGen(ctx, gset, mulOpts);
-
- gset->varNames.B = oldNameB;
- gset->subdims[0].bwidth = bwidthOld;
- mulOpts->memB = CLMEM_GLOBAL_MEMORY;
-}
-
-static void
-initTiles(
- BlasGenSettings* gset,
- TileSet* tileSet,
- const struct SubproblemDim *subdims,
- KernelExtraFlags kflags,
- DataType dtype,
- PrivateStorageType storType)
-{
- unsigned int rowsA;
- unsigned int rowsB;
- unsigned int rowsC;
- unsigned int colsA;
- unsigned int colsB;
- unsigned int colsC;
- bool transA;
- bool transB;
- unsigned int vecLenA;
- unsigned int vecLenB;
- unsigned int vecLenC;
-
- rowsA = (unsigned int)subdims[1].y;
- colsA = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth);
-
- rowsB = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth);
- colsB = (unsigned int)szmax(subdims[1].x, subdims[1].y);
-
- rowsC = (unsigned int)subdims[1].y;
- colsC = (unsigned int)subdims[1].x;
-
- transA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
- transB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
-
- vecLenA = (unsigned int)((transA) ? subdims[1].y : subdims[1].bwidth);
- vecLenA = umin(vecLenA, MAX_TILE_VECLEN);
- vecLenB = (unsigned int)((transB) ? subdims[1].x : subdims[1].bwidth);
- vecLenB = umin(vecLenB, MAX_TILE_VECLEN);
- vecLenC = (transB) ? vecLenB : vecLenA;
-
- initTile(&tileSet->rectA, "a", (unsigned int)subdims[1].y,
- (unsigned int)subdims[1].bwidth, vecLenA, dtype,
- storType, transA, false);
-
- initTile(&tileSet->squareA, "a", (unsigned int)subdims[1].y,
- (unsigned int)subdims[1].y, vecLenA, dtype, storType,
- transA, false);
-
- initTile(&tileSet->origB, "b", (unsigned int)subdims[1].bwidth,
- (unsigned int)subdims[1].x, vecLenB, dtype, storType,
- !transB, false);
-
- initTile(&tileSet->bStage2, "b", (unsigned int)subdims[1].y,
- (unsigned int)subdims[1].x, vecLenB, dtype, storType,
- !transB, false);
-
- initTile(&tileSet->bAsSqA, "b", (unsigned int)subdims[1].y,
- (unsigned int)subdims[1].y, vecLenB, dtype, storType,
- transA, false);
-
- initTile(&tileSet->bAsC, "b", (unsigned int)subdims[1].y,
- (unsigned int)subdims[1].x, vecLenB, dtype, storType,
- gset->tileCY.trans, false);
-
- initTile(&gset->tileA, "a", rowsA, colsA,
- vecLenA, dtype, storType, transA, false);
-
- initTile(&gset->tileBX, "b", rowsB, colsB,
- vecLenB, dtype, storType, !transB, false);
-
- initTile(&gset->tileCY, "c", rowsC, colsC,
- vecLenC, dtype, storType, !transB, false);
-
- tileSet->A = gset->tileA;
- tileSet->B = gset->tileBX;
-}
-
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra)
-{
- char tmp[1024];
- struct KgenContext *ctx;
- ssize_t ret;
- CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
- DataType dtype = kextra->dtype;
- KernelExtraFlags kflags = kextra->flags;
- CLBLASKernExtra extraNew;
- BlasGenSettings gset;
- TileMulOpts mulOpts;
- const char *ptrName;
- UpdateResultFlags upFlags = 0;
- TilePostFetchPrivate pfPriv;
- unsigned int l1Pans;
- bool b;
- Tile parTile;
- TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
- int ldsLarge, lds_diagonal;
- bool isInline;
- TileSet tileSet;
- char copy2LDSFuncName[FUNC_NAME_MAXLEN];
- TailStatus tailStatus = 0;
- FetchAddrMode addrMode = 0;
- bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
- bool tailN = ((kflags & KEXTRA_TAILS_N) != 0);
- size_t alignK;
-
- if (pgran->wgDim != 1) {
- return -EINVAL;
- }
-
- l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);
-
- memset(&gset, 0, sizeof(gset));
- gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS;
- memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2);
- // there is not need in block structure along K
- gset.subdims[0].bwidth = gset.subdims[1].bwidth;
- subdims = gset.subdims;
-
- /*
- * Since tiles are changed dynamically, e. g. in the main tilemul
- * loop they are rectangular, but at the second stage both A and B
- * tile storages are used for square tiles. One must adjust physical
- * vectorization accordindly, so as vector length might not be
- * greater than linear size of any tile
- */
- memcpy(&extraNew, kextra, sizeof(extraNew));
- extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y);
- extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y);
-
- gset.pgran = pgran;
- gset.kextra = &extraNew;
- initKernelVarNames(&gset.varNames);
-
- // multiplication options
- mulOpts.memA = CLMEM_GLOBAL_MEMORY;
- mulOpts.memB = CLMEM_GLOBAL_MEMORY;
- mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD :
- TILEMUL_MULADD;
- mulOpts.postFetch = NULL;
- mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
- mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K;
- mulOpts.fctx = createFetchContext();
- if (mulOpts.fctx == NULL) {
- return -ENOMEM;
- }
-
- disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING);
-
- isInline = (gset.flags & BGF_EXPLICIT_INLINE);
-
- initTiles(&gset, &tileSet, subdims, kflags, dtype,
- PRIV_STORAGE_VARIABLE_SET);
-
- ctx = createKgenContext(buf, buflen, true);
- if (ctx == NULL) {
- destroyFetchContext(mulOpts.fctx);
- return -ENOMEM;
- }
-
- kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n");
-
- b = isDoubleBasedType(dtype);
- kgenDeclareUptrs(ctx, b);
- if (isComplexType(dtype)) {
- genComplexMathOperators(ctx, dtype);
- }
- if(!isInline) {
- genTileInverting(ctx, &gset, &tileSet);
- }
-
- if ( extraParams->ldsUse != LDS_NO_USE ) {
- SubproblemDim sdims;
- DBlockCopyFlags flags;
- unsigned int vecLen;
-
- if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
- sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor;
- sdims.y = gset.subdims[0].x;
- }
- else {
- sdims.x = gset.subdims[0].x;
- sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor;
- }
-
- vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B);
- flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0;
- copyDataBlockGen(ctx, &sdims, gset.pgran, dtype,
- DBLOCK_GLOBAL_TO_LOCAL, flags);
- kgenAddBlankLine(ctx);
- kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx);
- }
-
- declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false,
- true);
- kgenBeginFuncBody(ctx);
-
- declareLocalVariables(ctx, &gset, &parTile, extraParams);
- if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
- kgenAddStmt(ctx, "A += offA;\n");
- }
- genTrxmBMatrShift(ctx, kflags, false);
-
- ptrName = dtypeUPtrField(dtype);
-
- sprintf(tmp, "uB.%s = B;\n\n", ptrName);
- kgenAddStmt(ctx, tmp);
-
- // external loop
- sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
- kgenBeginBranch(ctx, tmp);
- genZeroTile(ctx, &gset.tileCY);
- genSetupCoords(ctx, &gset, BLOCK_UPDATE);
-
- kgenAddStmt(ctx, "// Stage 1. Multiply and update with large blocks\n");
-
- gset.tileA = tileSet.rectA;
- gset.tileBX = tileSet.origB;
-
- if (!isMatrixUpper(kflags) && tailM) {
- addrMode |= FETCH_ADDR_A_CYCLICAL;
- setFetchAddrMode(mulOpts.fctx, addrMode);
- }
-
- ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0);
- alignK = subdims[1].bwidth;
- if (ldsLarge) {
- alignK *= extraParams->unrollingFactor;
- }
-
- if (ldsLarge) {
- const char *oldCoordB;
- FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE;
- bool withSkew;
-
- withSkew = useSkewedFetchB(&gset);
- if (!withSkew) {
- bamode |= FETCH_ADDR_B_RELATIVE;
- }
- else {
- bamode |= FETCH_ADDR_B_CYCLICAL;
- }
-
- setFetchAddrMode(mulOpts.fctx, bamode);
-
- if (tailN) {
- /*
- * Conditional branch for those items which hit into
- * matrix B with their matrix coordinates
- */
- sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x);
- kgenBeginBranch(ctx, tmp);
- }
-
- if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
- kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName);
- }
- else {
- kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName);
- }
-
- if (withSkew) {
- unsigned int bwidthOld;
-
- oldCoordB = gset.varNames.coordB;
- gset.varNames.coordB = "skewX";
- bwidthOld = gset.subdims[0].bwidth;
- gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows :
- parTile.nrCols;
- gset.subdims[0].bwidth = bwidthOld;
- }
-
- genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK);
- genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile, copy2LDSFuncName);
- genInternalLoopEnd(ctx); // loop over K
-
- if (withSkew) {
- gset.varNames.coordB = oldCoordB;
- setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL);
- // deliver from skew in the result before proceed to the next stage
- genTileCyclicalShift(ctx, &gset);
- }
-
- if (tailN) {
- kgenEndBranch(ctx, NULL);
- kgenBeginBranch(ctx, "else");
- }
-
- setFetchAddrMode(mulOpts.fctx, addrMode);
- }
-
- if (!ldsLarge || tailN) {
- genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
- if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) {
- addrMode |= FETCH_ADDR_B_CYCLICAL;
- setFetchAddrMode(mulOpts.fctx, addrMode);
- }
-
- if (tailN) {
- sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")");
- kgenBeginBranch(ctx, tmp);
- }
-
- genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK);
- tileMulGen(ctx, &gset, &mulOpts);
- genInternalLoopEnd(ctx); // loop over K
-
- if (tailN) {
- kgenEndBranch(ctx, NULL);
- }
-
- if (extraParams->ldsUse & LDS_USE_LARGE) {
- kgenEndBranch(ctx, NULL);
- }
- }
-
- sprintf(tmp, "uA.%s = A;\n\n", ptrName);
- kgenAddStmt(ctx, tmp);
-
- // processing tails along update dimension
- if (isMatrixUpper(kflags) &&
- ((kflags & KEXTRA_TAILS_K_LOWER) ||
- (ldsLarge && extraParams->unrolledTail))) {
-
- unsigned int tailChunks;
-
- tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ?
- extraParams->unrolledTail : 1;
-
- if (tailN) {
- char hitCond[1024];
-
- sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")");
- sprintf(tmp, "if ((currM + %lu < M) && %s)",
- subdims[0].y, hitCond);
- }
- else {
- sprintf(tmp, "if (currM + %lu < M)", subdims[0].y);
- }
- kgenBeginBranch(ctx, tmp);
-
- if (kflags & KEXTRA_TAILS_K_LOWER) {
- setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL);
- setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
- }
- if (tailChunks > 1) {
- mulOpts.flags &= ~TILEMUL_NOT_INC_K;
- sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks);
- kgenBeginBranch(ctx, tmp);
- }
-
- addrMode |= FETCH_ADDR_B_CYCLICAL;
- setFetchAddrMode(mulOpts.fctx, addrMode);
- tileMulGen(ctx, &gset, &mulOpts);
- if (tailChunks > 1) {
- kgenEndBranch(ctx, NULL);
- mulOpts.flags |= TILEMUL_NOT_INC_K;
- }
-
- kgenEndBranch(ctx, NULL);
- }
-
- gset.tileA = tileSet.squareA;
-
- kgenAddStmt(ctx, "\n/*\n"
- " * Stage 2. A part of work items multiply got result on "
- "a respective\n"
- " * inverted diagonal block, and the remaining ones wait. "
- "Then they perform\n"
- " * one step of further intermediate result evaluation as "
- "multiplying tile by tile.\n"
- " * It continues until the whole panel of the "
- "matrix A is processed\n"
- " */\n");
-
- // one must deal further with square blocks strictly
- gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y;
-
- sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)", subdims[0].y / subdims[1].y);
- kgenBeginBranch(ctx, tmp);
-
- if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
- sprintf(tmp, "const int bid = lid %% %u;\n\n",
- l1Pans);
- kgenAddStmt(ctx, tmp);
- }
-
- /*
- * Update the intermediate result multiply on the inverted diagonal tile,
- * and write back
- */
- genSetupCoords(ctx, &gset, TILE_UPDATE);
-
- sprintfStage2Condition(tmp, &gset, 0);
- ret = kgenBeginBranch(ctx, tmp);
-
- upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
- upFlags |= tailStatusToUpresFlags(tailStatus);
- upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA;
- genUpdateIntermResult(ctx, &gset, false, upFlags);
-
- kgenAddBlankLine(ctx);
-
- lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) &&
- (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 &&
- !(tailM || tailN) &&
- !(upFlags & UPRES_NO_VECTORIZATION) &&
- !isComplexType(kextra->dtype));
-
- /*
- * it's needed now to adjust addressing mode of A so as to don't
- * exceed the bound of A
- */
- if (tailM) {
- setFetchAddrMode(mulOpts.fctx,
- addrMode | FETCH_ADDR_A_CYCLICAL |
- FETCH_ADDR_K_CYCLICAL);
- extraNew.flags |= KEXTRA_TAILS_K_LOWER;
- }
-
- genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts);
- gset.tileBX = tileSet.bStage2;
- if (tailM) {
- setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
- }
-
- kgenAddStmt(ctx, "// Write back the given result\n");
-
- upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
- upFlags |= tailStatusToUpresFlags(tailStatus);
-
- if (lds_diagonal) {
- sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans);
- }
-
- genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags,
- NULL, NULL, lds_diagonal ? tmp : NULL);
-
- kgenEndBranch(ctx, NULL); // multiply on the inverted tile path
- kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
-
- // continue the tile update
- kgenAddBlankLine(ctx);
- sprintfStage2Condition(tmp, &gset, 1);
- kgenBeginBranch(ctx, tmp);
- genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
- if (lds_diagonal) {
- // TODO: add here storing to LDS as well
- }
- else {
- addrMode |= FETCH_ADDR_B_CYCLICAL;
- setFetchAddrMode(mulOpts.fctx, addrMode);
- tileMulGen(ctx, &gset, &mulOpts);
- }
- kgenEndBranch(ctx, NULL); // tile update path
- kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
-
- kgenEndBranch(ctx, NULL); // second stage loop
-
- if (isMatrixUpper(kflags)) {
- sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
- kgenAddStmt(ctx, tmp);
- }
-
- kgenEndBranch(ctx, NULL); // loop over M
-
- ret = kgenEndFuncBody(ctx);
-
- if (!ret) {
- ret = (ssize_t)kgenSourceSize(ctx) + 1;
- }
-
- destroyFetchContext(mulOpts.fctx);
- destroyKgenContext(ctx);
-
- return (ret < 0) ? -EOVERFLOW : ret;
-}
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs)
-{
- (void)dim;
- (void)dtype;
- (void)ldsSize;
- (void)kernelArgs;
-
- return true;
-}
-
-static SolverFlags
-solverFlags(void)
-{
- return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
-}
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra)
-{
- const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
- KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
- int idx = 7;
-
- initSizeKarg(&args[0], blasArgs->M);
- initSizeKarg(&args[1], blasArgs->N);
- assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
- initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0);
- initSizeKarg(&args[4], blasArgs->lda.matrix);
- initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0);
- initSizeKarg(&args[6], blasArgs->ldb.matrix);
- if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
- initSizeKarg(&args[idx++], blasArgs->offA);
- }
- if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
- initSizeKarg(&args[idx], blasArgs->offBX);
- }
-}
-
-static void
-fixupArgs(void *args, SubproblemDim *subdims, void *extra)
-{
- CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
- CLBlasKargs *kargs = (CLBlasKargs*)args;
- TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
- size_t loadBatch;
- unsigned int wgSize;
- unsigned int workRatio;
- unsigned int ldsUse = LDS_NO_USE;
- KernelExtraFlags kflags = kextra->flags;
- SubproblemDim globDim;
- bool isAmdGPU;
-
- /*
- * Calculate size of the batch loaded from global to local memory
- * at each iteration of the stage 1. Choose such unrolling factor
- * that allow each work item to load at least 16 bytes that provides
- * efficient global memory access
- */
- loadBatch = subdims[0].x * subdims[1].bwidth * dtypeSize(kargs->dtype);
- wgSize = (unsigned int)((subdims[0].x / subdims[1].itemX) *
- (subdims[0].y / subdims[1].itemY));
- if (loadBatch < wgSize) {
- workRatio = 1;
- }
- else {
- workRatio = 16 / ((unsigned int)loadBatch / wgSize);
- if (!workRatio) {
- workRatio = 1;
- }
- }
-
-#ifndef NDEBUG
- {
- const char *envImpl = getenv("AMD_CLBLAS_TRSM_LDSUSE");
-
- if (envImpl != NULL) {
- unsigned int w = atoi(envImpl);
- ldsUse = w % 10;
- w = w / 10;
- workRatio = w > 0 ? w : workRatio;
- }
- }
-#endif
-
- ldsUse = LDS_NO_USE;
- isAmdGPU = ((kflags & KEXTRA_VENDOR_AMD) != 0);
- if ((isAmdGPU && !(kflags & (KEXTRA_TAILS_K_LOWER | KEXTRA_TAILS_M_LOWER)))
- || (!isAmdGPU && !(kflags & KEXTRA_TAILS_M))) {
-
- ldsUse = LDS_USE_LARGE;
- }
-
- kargsToProbDims(&globDim, CLBLAS_TRSM, args, false);
- extraParams->ldsUse = ldsUse;
- extraParams->unrollingFactor = workRatio;
- extraParams->unrolledTail = (unsigned int)(((globDim.bwidth %
- (subdims[1].bwidth * workRatio)) + subdims[1].bwidth - 1) /
- subdims[1].bwidth);
-
- fixupTrxmKargs(kargs);
-}
-
-static bool
-checkCalcDecompDedicated(
- PGranularity *pgran,
- SubproblemDim *subdims,
- unsigned int subdimsNum,
- DataType dtype,
- int check)
-{
- bool ret = true;
-
- DUMMY_ARG_USAGE(subdimsNum);
-
- if (check == PGRAN_CHECK) {
- unsigned int minSize, maxSize;
-
- maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8;
- minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 1 : 2;
- ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true);
- ret = ret && (subdims[0].bwidth == subdims[1].bwidth);
- ret = ret && (pgran->wgSize[0] == 64);
- }
- else {
- calcPgranDedicated(pgran, subdims, -1, 3);
- }
-
- return ret;
-}
-
-void
-initTrsmLdsLessCachedPattern(MemoryPattern *mempat)
-{
- mempat->name = "2-staged cached global memory based block trsm";
- mempat->nrLevels = 2;
- mempat->cuLevel = 0;
- mempat->thLevel = 0;
- mempat->sops = &trsmSops;
-
- mpatExtra.aMset = CLMEM_LEVEL_L1;
- mpatExtra.bMset = CLMEM_LEVEL_L1;
- mpatExtra.mobjA = CLMEM_BUFFER;
- mpatExtra.mobjB = CLMEM_BUFFER;
- mempat->extra = &mpatExtra;
-}
-
-#if 0
-
-static int
-getDefaultDecomp(
- PGranularity *pgran,
- SubproblemDim *subdims,
- unsigned int subdimsNum,
- void * pArgs)
-{
- pgran->wgDim = 1;
- pgran->wgSize[0] = 64;
- pgran->wgSize[1] = 1;
-
- subdims[0].x = subdims[0].itemX = 32;
- subdims[0].y = 64;
- subdims[0].itemY = SUBDIM_UNUSED;
- subdims[0].bwidth = subdims[1].bwidth = 4;
- subdims[1].x = subdims[1].itemX = 8;
- subdims[1].y = subdims[1].itemY = 4;
-}
-
-#endif