summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/legacy/trsm_cached_lds.c
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/legacy/trsm_cached_lds.c')
-rw-r--r--external/clBLAS/src/library/blas/gens/legacy/trsm_cached_lds.c1005
1 files changed, 0 insertions, 1005 deletions
diff --git a/external/clBLAS/src/library/blas/gens/legacy/trsm_cached_lds.c b/external/clBLAS/src/library/blas/gens/legacy/trsm_cached_lds.c
deleted file mode 100644
index 828da2ad..00000000
--- a/external/clBLAS/src/library/blas/gens/legacy/trsm_cached_lds.c
+++ /dev/null
@@ -1,1005 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * TRSM generator with support of cached reads from the global memory
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-#include <clblas_stddef.h>
-#include <clBLAS.h>
-#include <blas_mempat.h>
-#include <clkern.h>
-#include <clblas-internal.h>
-#include <matrix_props.h>
-#include <matrix_dims.h>
-
-#include "../blas_kgen.h"
-#include "../trxm_common.h"
-#include "trsm_kgen_legacy.h"
-#include "gen_helper_legacy.h"
-#include "../trsm_kgen.h"
-
-static const char *readSquareBlock =
- "y = (currM + %lu <= M) ? %lu : M - currM;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // just read with an optimized function
- " %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- " %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
- "}\n\n";
-
-static const char *readSquareBlockOpt =
- // just read with an optimized function
- "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";
-
-static const char *readSquareBlockTrans =
- "y = (currM + %lu <= M) ? %lu : M - currM;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // read and transpose with an optimized function
- " %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- // read and transpose with slow function
- " %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
- "}\n\n";
-
-static const char *readSquareBlockTransOpt =
- // read and transpose with an optimized function
- "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
-
-static CLBLASMpatExtra mpatExtra;
-
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra);
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs);
-
-static SolverFlags
-solverFlags(void);
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra);
-
-static void
-fixupArgs(void *args, SubproblemDim *subdims, void *extra);
-
-static SolverOps trsmSops = {
- generator,
- assignKargs,
- isFitToLDS,
- NULL,
- NULL,
- NULL,
- NULL,
- solverFlags,
- fixupArgs,
- NULL, //getDefaultDecomp
- NULL, // getDecompList
- NULL,
- NULL
-};
-
-static TileMulFlags
-getCyclicFlags(
- const SubproblemDim *dim,
- KernelExtraFlags kflags,
- bool tailPass,
- unsigned int vecLen)
-{
- TileMulFlags mflags = TILEMUL_NO_FLAGS;
-
- if (tailPass && !isMatrixUpper(kflags)) {
- mflags |= TILEMUL_GLOBAL_CYCLIC_A;
- }
-
- if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) &&
- (kflags & KEXTRA_TAILS_N) && (dim->x > vecLen)) {
-
- mflags |= TILEMUL_GLOBAL_CYCLIC_B;
- }
-
- return mflags;
-}
-
-static void
-initTiles(BlasGenSettings *gset)
-{
- unsigned int nrRows, nrCols;
- unsigned int vecLen;
- const SubproblemDim *dim = &gset->subdims[1];
- const CLBLASKernExtra *kextra = gset->kextra;
- DataType dtype = kextra->dtype;
- bool tra;
-
- // the tile A should be able to fit rectangular and square tiles
- nrCols = (unsigned int)szmax(dim->y, dim->bwidth);
- tra = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_A);
- vecLen = getVecLen(gset, CLBLAS_TRSM, MATRIX_A);
- initTile(&gset->tileA, "a", (unsigned int)dim->y, nrCols, vecLen,
- dtype, PRIV_STORAGE_ARRAY, tra, false);
-
- /*
- * tile B should be able to fit tiles of the matrix B and of the
- * intermediate result. That result will be always transposed
- * from the point of view of tile multiplication
- */
- tra = !isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B);
- if (tra) {
- nrRows = (unsigned int)szmax(dim->bwidth, dim->y);
- nrCols = (unsigned int)dim->x;
- }
- else {
- nrRows = (unsigned int)szmax(dim->bwidth, dim->x);
- nrCols = (unsigned int)szmax(dim->x, dim->y);
- }
- vecLen = getVecLen(gset, CLBLAS_TRSM, MATRIX_B);
- initTile(&gset->tileBX, "b", nrRows, nrCols, vecLen, dtype,
- PRIV_STORAGE_ARRAY, tra, false);
-
- initTile(&gset->tileCY, "c", (unsigned int)dim->y, (unsigned int)dim->x,
- vecLen, dtype, PRIV_STORAGE_ARRAY, false, false);
-}
-
-static void
-prepareTilesForMainLoop(BlasGenSettings *gset)
-{
- const SubproblemDim *dim = &gset->subdims[1];
-
- gset->tileA.nrCols = (unsigned int)dim->bwidth;
- gset->tileBX.nrRows = (unsigned int)dim->bwidth;
- gset->tileBX.nrCols = (unsigned int)dim->x;
-}
-
-static void
-declareLocalVariables(
- struct KgenContext *ctx,
- const BlasGenSettings *gset)
-{
- char tmp[1024];
- const char *elemType;
- const SubproblemDim *dims = gset->subdims;
- DataType dtype = gset->kextra->dtype;
- size_t pitchAC, heightC;
-
- elemType = dtypeBuiltinType(dtype);
- pitchAC = matrBlockPitch(dims, MATRIX_C, dtype, clblasRight);
- heightC = szmax(dims[0].y, dims[0].x);
-
- declareTileStorages(ctx, gset);
- sprintf(tmp, "const int lid = get_local_id(0);\n"
- "const int gid = get_group_id(0);\n"
- "const uint2 skewRow = 0, skewCol = 0;\n\n"
- "GPtr uA, uB;\n"
- "uint coordA, coordB, k;\n"
- "uint x, y;\n"
- "__local %s tempA[%lu], tempC[%lu];\n"
- "LPtr utmpA, utmpC;\n"
- "uint m0 = 0, k0, currM, currN;\n",
- elemType, pitchAC * dims[0].y, pitchAC * heightC);
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genReadDiagBlock(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- DataType dtype,
- const CopyBufFuncs *copyFuncs,
- const ZeroFuncs *zeroFuncs,
- KernelExtraFlags kflags,
- char c)
-{
- char tmp[1024];
- size_t pitch;
- const char *readBlock;
- bool tra;
-
- tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
- pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
-
- if (!(kflags & KEXTRA_TAILS_M)) {
- readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
- sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
- }
- else {
- readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
- sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth,
- dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
- zeroFuncs->names[MATRIX_A], c,
- copyFuncs->readGeneric[MATRIX_A], c, pitch);
- }
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genZeroResult(
- struct KgenContext *ctx,
- DataType dtype,
- const SubproblemDim *dims,
- unsigned int vecLen)
-{
- unsigned int n;
- char tmp[1024];
-
- getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL);
-
- sprintf(tmp, "for (x = 0; x < %u; x++) {\n"
- " c[x] = 0;\n"
- "}\n\n", n);
-
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genInternalLoopCtl(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- KernelExtraFlags kflags)
-{
- char tmp[1024];
-
- if (isMatrixUpper(kflags)) {
- if (kflags & KEXTRA_TAILS_M) {
- sprintf(tmp, "for (k0 = currM + %lu; k0 < M / %lu * %lu; "
- "k0 += %lu)",
- dim[0].bwidth, dim[1].bwidth, dim[1].bwidth, dim[1].bwidth);
- }
- else {
- sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)",
- dim[0].bwidth, dim[1].bwidth);
- }
- }
- else {
- sprintf(tmp, "for (k0 = 0; k0 < currM; k0 += %lu)",
- dim[1].bwidth);
- }
-
- kgenBeginBranch(ctx, tmp);
-}
-
-static void
-genInitCurrM(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- KernelExtraFlags kflags)
-{
- char tmp[1024];
-
- if (isMatrixUpper(kflags)) {
- /* start from the last block */
- sprintf(tmp, "currM = ((M - 1) / %lu) * %lu;\n", dim->y, dim->y);
- kgenAddStmt(ctx, tmp);
- }
- else {
- kgenAddStmt(ctx, "currM = 0;\n");
- }
-}
-
-static void
-initKernelVarNames(KernelVarNames *kvars)
-{
- kvars->A = "uA";
- kvars->B = "uB";
- kvars->coordA = "coordA";
- kvars->coordB = "coordB";
- kvars->k = "k";
- kvars->sizeM = "M";
- kvars->sizeN = "N";
- kvars->sizeK = "M";
- kvars->lda = "lda";
- kvars->ldb = "ldb";
-}
-
-/*
- * Generate a code copying tile between LDS and private location.
- */
-static void
-genLdsCopy(
- struct KgenContext *ctx,
- const BlasGenSettings *gset)
-{
- char pitchStr[16];
- char coordY[128], coordX[128];
- size_t pitch;
- UpresVarNames uvars;
- UpdateResultFlags upFlags = UPRES_INLINE | UPRES_USE_LDS |
- UPRES_WITHOUT_ALPHA | UPRES_COLUMN_MAJOR;
- const SubproblemDim *dims = gset->subdims;
- unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
-
- memset(&uvars, 0, sizeof(uvars));
-
- pitch = matrBlockPitch(dims, MATRIX_C, gset->kextra->dtype, clblasRight);
- sprintf(pitchStr, "%lu", pitch);
- sprintf(coordY, "lid / %u * %lu", l1Pans, dims[1].y);
- sprintf(coordX, "lid %% %u * %lu", l1Pans, dims[1].x);
- uvars.result = "tempC";
- uvars.ld = pitchStr;
- uvars.startRow = coordY;
- uvars.startCol = coordX;
- uvars.nrRows = NULL;
- uvars.nrCols = NULL;
-
- kgenBeginBranch(ctx, NULL);
-
- updateResultGen(ctx,
- gset,
- CLBLAS_TRSM,
- UPRES_SET,
- upFlags,
- &uvars);
-
- kgenEndBranch(ctx, NULL);
-
- kgenAddBlankLine(ctx);
-}
-
-static void
-genZeroResultTrash(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- const CLBLASKernExtra *kextra)
-{
- char tmp[1024];
- unsigned int vecLen, pitch;
- unsigned int i;
-
- vecLen = (isComplexType(kextra->dtype)) ? 1 : kextra->vecLen;
- pitch = (unsigned int)roundUp(dim->x, vecLen);
- sprintf(tmp, "if (coordA + %lu > M)", dim->y);
- kgenBeginBranch(ctx, tmp);
- sprintf(tmp, "int i = (coordA >= M) ? %lu : (%lu - M %% %lu);\n\n",
- dim->y, dim->y, dim->y);
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "for (; i > 0; i--)");
- kgenBeginBranch(ctx, tmp);
-
- for (i = 0; i < pitch / vecLen; i++) {
- sprintf(tmp, "c[(%lu - i) * %u + %u] = 0;\n",
- dim->y, pitch / vecLen, i);
- kgenAddStmt(ctx, tmp);
- }
-
- kgenEndBranch(ctx, NULL);
- kgenEndBranch(ctx, NULL);
-}
-
-static void
-setupVdepUpresFlags(KernelExtraFlags kflags, UpdateResultFlags* upFlags)
-{
- bool forceBug = false;
-
- unsigned int bugFlag1 = KEXTRA_NO_COPY_VEC_A
- | KEXTRA_TAILS_K
- | KEXTRA_TAILS_M;
- unsigned int bugFlag2 = bugFlag1
- | KEXTRA_UPPER_TRIANG
- | KEXTRA_TRANS_A;
- unsigned int bugFlag3 = bugFlag1
- | KEXTRA_SIDE_RIGHT
- | KEXTRA_COLUMN_MAJOR;
- unsigned int bugFlag4 = bugFlag3
- | KEXTRA_TRANS_A;
- unsigned int bugFlag5 = bugFlag3
- | KEXTRA_UPPER_TRIANG;
- unsigned int bugFlag6 = KEXTRA_NO_COPY_VEC_A
- | KEXTRA_NO_COPY_VEC_B
- | KEXTRA_NO_COPY_VEC_C
- | KEXTRA_TAILS_K
- | KEXTRA_TAILS_M;
- unsigned int bugFlag7 = bugFlag6
- | KEXTRA_COLUMN_MAJOR;
- unsigned int bugFlag8 = bugFlag6
- | KEXTRA_SIDE_RIGHT
- | KEXTRA_UPPER_TRIANG;
- unsigned int bugFlag9 = bugFlag6
- | KEXTRA_UPPER_TRIANG
- | KEXTRA_TRANS_A
- | KEXTRA_TAILS_N;
- unsigned int bugFlag10 = bugFlag7
- | KEXTRA_SIDE_RIGHT
- | KEXTRA_TRANS_A
- | KEXTRA_TAILS_N;
- unsigned int bugFlag11 = bugFlag9
- | KEXTRA_UNIT_DIAGONAL;
- unsigned int bugFlag12 = bugFlag6
- | KEXTRA_TAILS_N
- | KEXTRA_SIDE_RIGHT
- | KEXTRA_UNIT_DIAGONAL
- | KEXTRA_COLUMN_MAJOR
- | KEXTRA_TRANS_A;
-
- /*
- * WORKAROUND for AMD GPU: Now, we avoid optimizing the case when
- * matrix B is not divided on block size and
- * since it leads to a hang up at code seeming
- * correct.
- */
- if (kflags & KEXTRA_VENDOR_AMD) {
- forceBug = (kflags & KEXTRA_TAILS_N) != 0;
- }
- else {
- forceBug = (kflags != bugFlag1
- && kflags != bugFlag2 && kflags != bugFlag4 && kflags != bugFlag5
- && kflags != bugFlag7 && kflags != bugFlag8 && kflags != bugFlag9
- && kflags != bugFlag10 && kflags != bugFlag11
- && kflags != bugFlag12);
- }
-
- if (!forceBug) {
- *upFlags |= UPRES_INDEXING_WITH_CONSTANTS;
- }
-}
-
-static void
-genSetupCoordinates(
- struct KgenContext *ctx,
- const SubproblemDim *dims,
- KernelExtraFlags kflags)
-{
- char tmp[1024];
- unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
-
- sprintf(tmp, "coordA = currM + lid / %u * %lu;\n", l1Pans, dims[1].y);
- kgenAddStmt(ctx, tmp);
- if (isMatrixUpper(kflags)) {
- sprintf(tmp, "k = currM + %lu;\n", dims[0].y);
- }
- else {
- strcpy(tmp, "k = 0;\n");
- }
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genInvertDiagBlock(
- struct KgenContext *ctx,
- const BlasGenSettings *gset,
- const ZeroFuncs *zeroFuncs)
-{
- char tmp[1024];
- const CLBLASKernExtra *kextra = gset->kextra;
- const SubproblemDim *subdims = gset->subdims;
- size_t pitchA;
-
- pitchA = matrBlockPitch(subdims, MATRIX_A, kextra->dtype, clblasLeft);
-
- sprintf(tmp, "%s((__local float4*)tempA);\n", zeroFuncs->names[MATRIX_A]);
- kgenAddStmt(ctx, tmp);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
- if (kextra->flags & KEXTRA_UNIT_DIAGONAL) {
- sprintf(tmp, "if (lid < %lu) {\n"
- " tempC[lid * %lu + lid] = %s;\n"
- "}\n",
- subdims[0].bwidth, pitchA, strOne(kextra->dtype));
- kgenAddStmt(ctx, tmp);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenAddBlankLine(ctx);
- }
-
- sprintf(tmp, "if (lid < %lu)", subdims[0].y);
- kgenBeginBranch(ctx, tmp);
- sprintf(tmp, "invert(tempC, tempA, lid, (currM + %lu > M) ? "
- "M - currM : %lu);\n",
- subdims[0].y, subdims[0].y);
- kgenAddStmt(ctx, tmp);
- kgenEndBranch(ctx, NULL);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenAddBlankLine(ctx);
-}
-
-static void
-genMulOnDiagBlock(
- struct KgenContext *ctx,
- BlasGenSettings *gset,
- const TileMulOpts *mulOpts)
-{
- char tmp[1024];
- const SubproblemDim *dims = gset->subdims;
- const CLBLASKernExtra *kextra = gset->kextra;
- unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
- TileMulOpts optsNew;
- size_t pitchAC;
- const char *ptrName;
- Tile *tile;
- BlasGenSettings gsetNew;
-
- pitchAC = matrBlockPitch(dims, MATRIX_C, kextra->dtype, clblasRight);
- ptrName = dtypeUPtrField(kextra->dtype);
-
- memcpy(&optsNew, mulOpts, sizeof(optsNew));
- optsNew.memA = CLMEM_LOCAL_MEMORY;
- optsNew.memB = CLMEM_LOCAL_MEMORY;
- optsNew.flags &= ~(TILEMUL_TRA | TILEMUL_GLOBAL_CYCLIC | TILEMUL_CONJA);
- optsNew.flags |= TILEMUL_TRB;
- optsNew.memA = CLMEM_LOCAL_MEMORY;
- optsNew.memB = CLMEM_LOCAL_MEMORY;
- gset->varNames.A = "utmpA";
- gset->varNames.B = "utmpC";
-
- sprintf(tmp, "utmpA.%s = tempA + lid / %u * %lu;\n"
- "utmpC.%s = tempC + lid %% %u * %lu;\n\n",
- ptrName, l1Pans, pitchAC * dims[1].y,
- ptrName, l1Pans, pitchAC * dims[1].x);
- kgenAddStmt(ctx, tmp);
-
- memcpy(&gsetNew, gset, sizeof(gsetNew));
- gsetNew.subdims[1].bwidth = dims[1].y;
-
- // Configure the tile descriptors to deal with tile of needed sizes.
- tile = &gsetNew.tileA;
- tile->nrRows = (unsigned int)dims[1].y;
- tile->nrCols = (unsigned int)dims[1].y;
- tile->trans = false;
- tile = &gsetNew.tileBX;
- tile->nrRows = (unsigned int)dims[1].y;
- tile->nrCols = (unsigned int)dims[1].x;
- tile->trans = true;
- tileMulGen(ctx, &gsetNew, &optsNew);
-
- gset->varNames.A = "uA";
- gset->varNames.B = "uB";
-}
-
-static void
-genOneTrsmPass(
- struct KgenContext *ctx,
- BlasGenSettings *gset,
- const char *updateResFnRev,
- const char *updateResGenericFnRev,
- CopyBufFuncs *copyFuncs,
- ZeroFuncs *zeroFuncs,
- bool isTailPass)
-{
- const CLBLASKernExtra *kextra = gset->kextra;
- CLBLASKernExtra kextraTmp;
- KernelExtraFlags kflags = kextra->flags;
- char tmp[1024];
- DataType dtype = kextra->dtype;
- unsigned int vecLen = gset->kextra->vecLen;
- SubproblemDim *subdims = gset->subdims;
- int tra, trb;
- UpdateResultFlags upFlags;
- TilePostFetchPrivate pfpriv;
- TileMulOpts mulOpts;
- TailFetch tf;
- TailStatus tailStatus = 0;
-
- memset(&pfpriv, 0, sizeof(pfpriv));
-
- // multiply options
- mulOpts.memA = CLMEM_GLOBAL_MEMORY;
- mulOpts.memB = CLMEM_GLOBAL_MEMORY;
- mulOpts.core = TILEMUL_MAD;//TILEMUL_MULADD;
- mulOpts.postFetch = NULL;
- mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
- mulOpts.flags |= TILEMUL_EXTERN_RDECL;
- mulOpts.flags |= getCyclicFlags(subdims, kflags, isTailPass, vecLen);
-
- tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
- trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
-
- tf = checkForTailFetches(CLBLAS_TRSM, &subdims[1], kextra, MATRIX_B,
- false, false);
- if (trb) {
- tf &= ~FETCH_TAIL_COL;
- }
-
- /*
- * For lower triangular matrix we proceed upto the diagonal, so we
- * can't exceed matrix bound and zeroing is not needed
- */
- if (isMatrixUpper(kflags)) {
- tf |= checkForTailFetches(CLBLAS_TRSM, &subdims[1], kextra,
- MATRIX_A, false, false);
- if (tra && trb) {
- tf &= ~FETCH_TAIL_COL;
- }
- }
-
- if (tf != FETCH_NO_TAILS) {
- memset(&pfpriv, 0, sizeof(pfpriv));
- pfpriv.funcID = CLBLAS_TRSM;
- pfpriv.gset = gset;
- }
-
- // loop over M
- if (!isTailPass) {
- sprintf(tmp, "for (m0 = 0; m0 < M / %lu * %lu; m0 += %lu)",
- subdims->y, subdims->y, subdims->y);
- kgenBeginBranch(ctx, tmp);
- }
-
- genSetupCoordinates(ctx, subdims, kflags);
- genZeroResult(ctx, dtype, subdims, vecLen);
-
- if (!isMatrixUpper(kflags) && isTailPass) {
- // skip update loop is the matrix consist of the single block
- sprintf(tmp, "if (M > %lu)", subdims->y);
- kgenBeginBranch(ctx, tmp);
- }
-
- // Avoid tail adjusting along M.
-
- memcpy(&kextraTmp, kextra, sizeof(kextraTmp));
- kextraTmp.flags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER);
-
- // update loop is not needed for tail of an upper triangular matrix
- if (!(isTailPass && isMatrixUpper(kflags))) {
- if (isTailPass || (kflags & KEXTRA_TAILS_N)) {
- kgenBeginBranch(ctx, "if (coordB < N)");
- }
-
- gset->kextra = &kextraTmp;
- tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRSM, gset, NULL);
- gset->kextra = kextra;
-
- genInternalLoopCtl(ctx, subdims, kflags); // loop over K
-
- // multiplication for the step-by-step block updating
- subdims[0].bwidth = subdims[1].bwidth;
- tileMulGen(ctx, gset, &mulOpts);
- subdims[0].bwidth = subdims[0].y;
-
- genInternalLoopEnd(ctx); // loop over K
- kgenAddBlankLine(ctx);
-
- // invoke once again, in order to process tails along K
- if (isMatrixUpper(kflags) && (tf != FETCH_NO_TAILS)) {
- subdims[0].bwidth = subdims[1].bwidth;
-
- if (!(tra && trb)) {
- mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL;
- }
- mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
-
- mulOpts.postFetchPriv = &pfpriv;
- mulOpts.postFetch = defaultTilePostFetch;
-
- subdims[0].bwidth = subdims[1].bwidth;
- tileMulGen(ctx, gset, &mulOpts);
- subdims[0].bwidth = subdims[0].y;
-
- mulOpts.postFetch = NULL;
- mulOpts.postFetchPriv = NULL;
- }
-
- gset->kextra = &kextraTmp;
- checkGenRestoreTailCoords(ctx, gset, tailStatus);
- gset->kextra = kextra;
-
- if (isTailPass || (kflags & KEXTRA_TAILS_N)) {
- kgenEndBranch(ctx, NULL);
- }
- }
- else if (!trb && (kflags & KEXTRA_TAILS_N)) {
- tailStatus |= TAIL_B_RAISED;
- }
-
- mulOpts.flags &= ~(TILEMUL_WRAP_AROUND_TAIL | TILEMUL_GLOBAL_CYCLIC_A |
- TILEMUL_GLOBAL_CYCLIC_K);
-
- if (!isMatrixUpper(kflags) && isTailPass) {
- /*
- * end of branch for non single block tail processing of
- * the lower triangular matrix
- */
- kgenEndBranch(ctx, NULL);
- }
-
- /*
- * Final phase: update the accumulated result, multiply on an inverted
- * block and write back the result
- */
- if (isMatrixUpper(kflags) || ((kflags & KEXTRA_VENDOR_AMD) != 0)) {
- kgenAddStmt(ctx, "k0 = currM;\n");
- }
- else {
- kgenAddStmt(ctx, "k0 = m0;\n");
- }
-
- genReadDiagBlock(ctx, subdims, dtype, copyFuncs, zeroFuncs,
- kflags, 'C');
- genInvertDiagBlock(ctx, gset, zeroFuncs);
-
- // Avoid generating not executed non optimal path
- gset->kextra = &kextraTmp;
- if (isTailPass) {
- kextraTmp.flags |= (KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER);
- }
- genUpdateIntermTrsmResult(ctx, gset, updateResFnRev,
- updateResGenericFnRev, true);
- gset->kextra = kextra;
-
- /*
- * Heap to LDS.
- * Zero unuseful part along columns since it will have an influence
- * on the result at multiplication on an inverted block
- */
- if (isTailPass) {
- genZeroResultTrash(ctx, &subdims[1], kextra);
- }
- genLdsCopy(ctx, gset);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- genZeroResult(ctx, dtype, subdims, vecLen);
-
- genMulOnDiagBlock(ctx, gset, &mulOpts);
-
- // write back the tile evaluated
- upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
- upFlags |= tailStatusToUpresFlags(tailStatus);
- upFlags |= UPRES_EXCEED_PROBLEM_CONDITION;
- setupVdepUpresFlags(kflags, &upFlags);
-
- gset->kextra = &kextraTmp;
-
- genResultUpdateWithFlags(ctx, CLBLAS_TRSM, gset, upFlags,
- NULL, NULL, NULL);
- gset->kextra = kextra;
-
- kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
-
- if (isMatrixUpper(kflags)) {
- sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
- }
- else {
- sprintf(tmp, "currM += %lu;\n", subdims[0].y);
- }
- kgenAddStmt(ctx, tmp);
-
- if (!isTailPass) {
- kgenEndBranch(ctx, NULL); // loop over M
- }
-}
-
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra)
-{
- char tmp[1024];
- struct KgenContext *ctx;
- CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
- KernelExtraFlags kflags = kextra->flags;
- DataType dtype = kextra->dtype;
- BlasGenSettings gset;
- char updateResFnRev[FUNC_NAME_MAXLEN];
- char updateResGenericFnRev[FUNC_NAME_MAXLEN];
- CopyBufFuncs copyFuncs;
- ZeroFuncs zeroFuncs;
- UpdateResultFlags upFlags;
- const char *ptrName;
- bool b;
- ssize_t ret;
- unsigned int l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);
- bool tailMarker[2] = {false, true};
- int triang;
- int i;
-
- if (pgran->wgDim != 1) {
- return -EINVAL;
- }
-
- if (kflags & KEXTRA_TAILS_M) {
- kflags |= KEXTRA_TAILS_M_LOWER;
- }
- if (kflags & KEXTRA_TAILS_N) {
- kflags |= KEXTRA_TAILS_N_LOWER;
- }
- if (kflags & KEXTRA_TAILS_K) {
- kflags |= KEXTRA_TAILS_K_LOWER;
- }
- kextra->flags = kflags;
-
- ctx = createKgenContext(buf, buflen, true);
- if (ctx == NULL) {
- return -ENOMEM;
- }
-
- triang = isMatrixUpper(kflags);
-
- memset(&gset, 0, sizeof(gset));
- memcpy(gset.subdims, subdims, sizeof(gset.subdims));
- gset.kextra = kextra;
- gset.pgran = pgran;
-
- initKernelVarNames(&gset.varNames);
-
- b = isDoubleBasedType(dtype);
- kgenDeclareUptrs(ctx, b);
- if (isComplexType(dtype)) {
- genComplexMathOperators(ctx, dtype);
- }
-
- /*
- * For intermediate result after blocks modification.
- * Take into account tails adjusting
- */
- upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
- upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
-
- if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) &&
- (kflags & KEXTRA_TAILS_N)) {
-
- upFlags |= UPRES_TAIL_COL;
- }
-
- setupVdepUpresFlags(kflags, &upFlags);
- initTiles(&gset);
- genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
- updateResGenericFnRev);
-
- generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_A);
- generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
- ZF_MATRIX_A);
-
- //matrix inversion function
- genInvertingBlockFunc(ctx, subdims[0].bwidth, dtype, kflags);
- kgenAddBlankLine(ctx);
-
- // now, generate the kernel
- declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false,
- true);
- ret = kgenBeginFuncBody(ctx);
-
- declareLocalVariables(ctx, &gset);
- prepareTilesForMainLoop(&gset);
-
- sprintf(tmp, "currN = gid * %lu;\n", subdims[0].x);
- kgenAddStmt(ctx, tmp);
- genInitCurrM(ctx, subdims, kflags);
-
- if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
- kgenAddStmt(ctx, "A += offA;\n");
- }
- genTrxmBMatrShift(ctx, kflags, false);
-
- ptrName = dtypeUPtrField(dtype);
- sprintf(tmp, "uA.%s = A;\n"
- "uB.%s = B;\n\n",
- ptrName, ptrName);
- kgenAddStmt(ctx, tmp);
-
- /*
- * B matrix is divided on panels, each work group
- * multiply such a panel on the whole matrix A.
- */
-
- sprintf(tmp, "coordB = gid * %lu + lid %% %u * %lu;\n",
- subdims[0].x, l1Pans, subdims[1].x);
- kgenAddStmt(ctx, tmp);
-
- for (i = 0; i < 2; i++) {
- b = (i) ? tailMarker[1 - triang] : tailMarker[triang];
- if (!b || (kflags & KEXTRA_TAILS_M)) {
- genOneTrsmPass(ctx, &gset, updateResFnRev, updateResGenericFnRev,
- &copyFuncs, &zeroFuncs, b);
- }
- }
-
- kgenEndFuncBody(ctx);
- ret = kgenAddBlankLine(ctx);
-
- if (!ret) {
- ret = (ssize_t)kgenSourceSize(ctx) + 1;
- }
-
- destroyKgenContext(ctx);
-
- return (ret < 0) ? -EOVERFLOW : ret;
-}
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs)
-{
- cl_ulong sizeA, sizeC;
- const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
-
- /*
- * It's needed one block for matrix A,
- * and one block of size maximal of this one for
- * matrix A and matrix C
- */
-
- sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
- sizeC = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
- if (sizeA > sizeC) {
- sizeC = sizeA;
- }
-
- return ((sizeA + sizeC) * dtypeSize(dtype) <= ldsSize);
-}
-
-static SolverFlags
-solverFlags(void)
-{
- return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
-}
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra)
-{
- const CLBlasKargs *blasArgs = (CLBlasKargs*)params;
- KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
- int idx = 7;
-
- initSizeKarg(&args[0], blasArgs->M);
- initSizeKarg(&args[1], blasArgs->N);
- assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
- initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0);
- initSizeKarg(&args[4], blasArgs->lda.matrix);
- initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0);
- initSizeKarg(&args[6], blasArgs->ldb.matrix);
- if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
- initSizeKarg(&args[idx++], blasArgs->offA);
- }
- if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
- initSizeKarg(&args[idx++], blasArgs->offBX);
- }
-}
-
-static void
-fixupArgs(void *args, SubproblemDim *subdims, void *extra)
-{
- (void)extra;
- (void)subdims;
-
- fixupTrxmKargs((CLBlasKargs*)args);
-}
-
-void
-initTrsmCachedPattern(MemoryPattern *mempat)
-{
- mempat->name = "Cached global memory based block trsm";
- mempat->nrLevels = 2;
- mempat->cuLevel = 0;
- mempat->thLevel = 0;
- mempat->sops = &trsmSops;
-
- mpatExtra.aMset = CLMEM_LEVEL_L1;
- mpatExtra.mobjA = CLMEM_BUFFER;
- mpatExtra.mobjB = CLMEM_BUFFER;
- mempat->extra = &mpatExtra;
-}