summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/legacy/trsm_lds.c
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/legacy/trsm_lds.c')
-rw-r--r--external/clBLAS/src/library/blas/gens/legacy/trsm_lds.c649
1 files changed, 0 insertions, 649 deletions
diff --git a/external/clBLAS/src/library/blas/gens/legacy/trsm_lds.c b/external/clBLAS/src/library/blas/gens/legacy/trsm_lds.c
deleted file mode 100644
index c2dd87a4..00000000
--- a/external/clBLAS/src/library/blas/gens/legacy/trsm_lds.c
+++ /dev/null
@@ -1,649 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * LDS based trsm generator
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-
-#include <clBLAS.h>
-#include <blas_mempat.h>
-#include <clkern.h>
-#include <clblas-internal.h>
-#include <matrix_dims.h>
-
-#include "../init.h"
-#include "blas_kgen_legacy.h"
-#include "gen_helper_legacy.h"
-#include "trsm_kgen_legacy.h"
-#include "../trxm_common.h"
-#include "../trsm_kgen.h"
-
-static CLBLASMpatExtra mpatExtra;
-
-/*
- * template for memory object based trsm preparation part
- * for one dimensional work space
- */
-static const char *trsmPrep1D =
- "uint m0, k0;\n"
- "__local %s tempA[%lu];\n"
- "__local %s tempC[%lu];\n"
- "%s c[%u];\n"
- "int lid, gid;\n"
- "%s" // groups per Panel variable
- "uint currM, currN;\n"
- "uint x, y;\n"
- "uint2 coordA, coordB;\n"
- "\n"
- "lid = get_local_id(0);\n"
- "gid = get_global_id(0) / %u;\n" // group ID
- "\n";
-
-static const char *readSquareBlock =
- "y = (currM + %lu <= M) ? %lu : M - currM;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // just read with an optimized function
- " %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- " %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
- "}\n\n";
-
-static const char *readSquareBlockOpt =
- // just read with an optimized function
- "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";
-
-static const char *readSquareBlockTrans =
- "y = (currM + %lu <= M) ? %lu : M - currM;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // read and transpose with an optimized function
- " %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- // read and transpose with slow function
- " %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
- "}\n\n";
-
-static const char *readSquareBlockTransOpt =
- // read and transpose with an optimized function
- "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
-
-static const char *readRectBlock =
- "y = (currN + %lu <= N) ? %lu : N - currN;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // just read with an optimized function
- " %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- " %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n"
- "}\n\n";
-
-static const char *readRectBlockOpt =
- // just read with an optimized function
- "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n";
-
-static const char *readRectBlockTrans =
- "y = (currN + %lu <= N) ? %lu : N - currN;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // read and transpose with an optimized function
- " %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- // read and transpose with slow function
- " %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n"
- "}\n\n";
-
-static const char *readRectBlockTransOpt =
- // read and transpose with an optimized function
- "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n";
-
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra);
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra);
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs);
-
-static SolverFlags
-solverFlags(void);
-
-static SolverOps solverOps = {
- generator,
- assignKargs,
- isFitToLDS,
- NULL,
- NULL,
- NULL,
- NULL,
- solverFlags,
- NULL, //fixupArgs
- NULL, //getDefaultDecomp
- NULL, //getDecompList
- NULL,
- NULL
-};
-
-static void
-genZeroResult(
- struct KgenContext *ctx,
- DataType dtype,
- const SubproblemDim *dims)
-{
- unsigned int n;
- char tmp[1024];
- unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
-
- getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL);
-
- sprintf(tmp, "for (x = 0; x < %u; x++) {\n"
- " c[x] = 0;\n"
- "}\n\n", n);
-
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genPrepareSquareBlock(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- DataType dtype,
- const CopyBufFuncs *copyFuncs,
- const ZeroFuncs *zeroFuncs,
- KernelExtraFlags kflags,
- char c)
-{
- char tmp[1024];
- size_t pitch;
- const char *readBlock;
- bool tra;
-
- tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
- pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
-
- if (!(kflags & KEXTRA_TAILS_M)) {
- readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
- sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
- }
- else {
- readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
- sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth,
- dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
- zeroFuncs->names[MATRIX_A], c,
- copyFuncs->readGeneric[MATRIX_A], c, pitch);
- }
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genPrepareRectBlock(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- DataType dtype,
- const CopyBufFuncs *copyFuncs,
- const ZeroFuncs *zeroFuncs,
- KernelExtraFlags kflags,
- char c)
-{
- char tmp[1024];
- size_t pitch;
- const char *readBlock;
- bool trb;
-
- trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
- pitch = matrBlockPitch(dim, MATRIX_B, dtype, clblasLeft);
-
- if (!(kflags & (KEXTRA_TAILS_N | KEXTRA_TAILS_M))) {
- readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt;
- sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c);
- }
- else {
- readBlock = (trb) ? readRectBlockTrans : readRectBlock;
- sprintf(tmp, readBlock, dim->x, dim->x, dim->bwidth, dim->bwidth,
- dim->x, dim->bwidth, copyFuncs->read[MATRIX_B], c,
- zeroFuncs->names[MATRIX_B], c,
- copyFuncs->readGeneric[MATRIX_B], c, pitch);
- }
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genZeroBlockA(
- struct KgenContext *ctx,
- const ZeroFuncs *zeroFuncs)
-{
- char tmp[1024];
- sprintf(tmp, "%s((__local float4*)tempA);\n", zeroFuncs->names[MATRIX_A]);
- kgenAddStmt(ctx, tmp);
-}
-
-/*
- * Generate control block of the loop over K
- * Two kind of loops: without triangle block and only triangle block
- */
-static void
-genInternalLoopCtl(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- KernelExtraFlags kflags,
- bool triangPart)
-{
- char tmp[1024];
-
- (void)triangPart;
-
- if (isMatrixUpper(kflags)) {
- sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)",
- dim->bwidth, dim->bwidth);
- }
- else {
- sprintf(tmp, "for (k0 = 0; k0 < currM; k0 += %lu)",
- dim->bwidth);
- }
-
- kgenBeginBranch(ctx, tmp);
-}
-
-static void
-genInitCurrM(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- KernelExtraFlags kflags)
-{
- char tmp[1024];
-
- if (isMatrixUpper(kflags)) {
- /* start from the last block */
- sprintf(tmp, "currM = ((M - 1) / %lu) * %lu;\n", dim->y, dim->y);
- kgenAddStmt(ctx, tmp);
- }
- else {
- kgenAddStmt(ctx, "currM = 0;\n");
- }
-}
-
-static void
-initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
-{
- kvars->A = "A";
- kvars->B = "B";
-
- if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
- kvars->coordA = "coordA.x";
- }
- else {
- kvars->coordA = "coordA.y";
- }
- if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
- kvars->coordB = "coordB.x";
- }
- else {
- kvars->coordB = "coordB.y";
- }
- kvars->sizeM = "M";
- kvars->sizeN = "N";
- kvars->sizeK = "origM";
-}
-
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra)
-{
- struct KgenContext *ctx;
- CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
- KernelExtraFlags kflags = kextra->flags;
- char tmp[1024];
- char blkmul[FUNC_NAME_MAXLEN];
- char updateResFn[FUNC_NAME_MAXLEN];
- char updateResGenericFn[FUNC_NAME_MAXLEN];
- char updateResFnRev[FUNC_NAME_MAXLEN];
- char updateResGenericFnRev[FUNC_NAME_MAXLEN];
- char copyPLFn[FUNC_NAME_MAXLEN];
- char *s1 = "";
- const char *typeName;
- CopyBufFuncs copyFuncs;
- ZeroFuncs zeroFuncs;
- DataType dtype = kextra->dtype;
- ssize_t ret;
- BlasGenSettings gset;
- BlkMulOpts mulOpts;
- size_t pitchAB, pitchC;
- bool b;
- const char *outTypeName;
- unsigned int nrRegs;
- unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
- int tra, trb;
- unsigned int l1Pans;
- char vect[2] = {'y', 'x'};
- UpdateResultFlags upFlags;
-
- if (pgran->wgDim != 1) {
- return -EINVAL;
- }
-
- ctx = createKgenContext(buf, buflen, true);
- if (ctx == NULL) {
- return -ENOMEM;
- }
-
- // at first, generate needed declarations and auxiliary functions
-
- b = isDoubleBasedType(dtype);
- kgenDeclareUptrs(ctx, b);
-
- memset(&gset, 0, sizeof(gset));
- memcpy(gset.subdims, subdims, sizeof(gset.subdims));
- gset.kextra = kextra;
- gset.pgran = pgran;
-
- initKernelVarNames(&gset.varNames, kflags);
-
- if (isComplexType(dtype)) {
- genComplexMathOperators(ctx, dtype);
- }
-
- generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset,
- BCHF_MATRIX_A | BCHF_MATRIX_B | BCHF_WRITE_OUTPUT);
-
- generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
- ZF_MATRIX_A | ZF_MATRIX_B);
- getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
-
- // functions updating result
- // for the final result
- generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn,
- updateResGenericFn);
- // for intermediate result after blocks modification
- upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
- upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
- genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
- updateResGenericFnRev);
- // for heaping before multiplying on inverted block
- updateResultGenOld(ctx, &gset, UPRES_SET,
- UPRES_COLUMN_MAJOR | UPRES_USE_LDS, NULL);
- kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx);
- kgenAddBlankLine(ctx);
-
- // block multiplication function
- mulOpts.aMobj = CLMEM_BUFFER;
- mulOpts.bMobj = CLMEM_BUFFER;
- mulOpts.flags = BLKMUL_SKEW_COLUMN | BLKMUL_OUTPUT_PRIVATE;
- mulOpts.core = BLKMUL_SEPARATE_MULADD;
- ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
- if (ret) {
- destroyKgenContext(ctx);
-
- return -EOVERFLOW;
- }
-
- kgenAddBlankLine(ctx);
- kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
-
- //matrix inversion function
- genInvertingBlockFunc(ctx, subdims[0].bwidth, dtype, kflags);
-
- typeName = dtypeBuiltinType(dtype);
-
- // now, generate the kernel
- declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, NULL, false,
- false);
- ret = kgenBeginFuncBody(ctx);
-
- /*
- * Calculate local buffer pitches, and then insert the
- * preparative code
- */
- pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
- pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft);
- sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y,
- typeName,
- ((pitchC > pitchAB) ? pitchC : pitchAB) * subdims[0].y,
- outTypeName, nrRegs, s1, pgran->wgSize[0]);
- ret = kgenAddStmt(ctx, tmp);
-
- /*
- * B matrix is divided on panels, each work group
- * multiply such a panel on the whole matrix A.
- */
-
- sprintf(tmp, "currN = gid * %lu;\n", subdims[0].x);
- kgenAddStmt(ctx, tmp);
-
- genInitCurrM(ctx, subdims, kflags);
- if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
- kgenAddStmt(ctx, "A += offA;\n");
- }
- genTrxmBMatrShift(ctx, kflags, false);
-
- tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
- trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
-
- l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
-
- sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
- "coordB.%c = 0;\n\n",
- vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
- kgenAddStmt(ctx, tmp);
-
- // loop over M
- sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims->y);
- kgenBeginBranch(ctx, tmp);
-
- sprintf(tmp, "coordA.%c = currM + lid / %u * %lu;\n"
- "coordA.%c = 0;\n\n",
- vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
- kgenAddStmt(ctx, tmp);
-
- genZeroResult(ctx, dtype, subdims);
-
- genInternalLoopCtl(ctx, subdims, kflags, false); // loop over K
-
- genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
- kflags, 'A');
- genPrepareRectBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
- kflags, 'C');
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
- // multiplication for the step-by-step block updating
- sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), \n"
- " (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
- " (%s*)c, lid %% %lu);\n",
- blkmul, l1Pans, subdims[1].y, pitchAB,
- l1Pans, subdims[1].x, pitchAB, outTypeName, subdims[1].y);
- ret = kgenAddStmt(ctx, tmp);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
- genInternalLoopEnd(ctx); // loop over K
- kgenAddBlankLine(ctx);
-
- kgenAddStmt(ctx, "k0 = currM;\n");
- genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
- kflags, 'C');
- genZeroBlockA(ctx, &zeroFuncs);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
- if (kflags & KEXTRA_UNIT_DIAGONAL) {
- sprintf(tmp, "if (lid < %lu) {\n"
- " tempC[lid * %lu + lid] = %s;\n"
- "}\n",
- subdims[0].bwidth, pitchAB, strOne(dtype));
- kgenAddStmt(ctx, tmp);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenAddBlankLine(ctx);
- }
-
- sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth);
- kgenBeginBranch(ctx, tmp);
- sprintf(tmp, "invert(tempC, tempA, lid, (currM + %lu > M) ? "
- "M - currM : %lu);\n",
- subdims[0].y, subdims[0].y);
- kgenAddStmt(ctx, tmp);
- kgenEndBranch(ctx, NULL);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenAddBlankLine(ctx);
-
- genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev,
- updateResGenericFnRev, true);
-
- genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC");
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- genZeroResult(ctx, dtype, subdims);
-
- // multypling on an inverted block
- sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), \n"
- " (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
- " (%s*)c, lid %% %lu);\n\n",
- blkmul, l1Pans, subdims[1].y, pitchAB,
- l1Pans, subdims[1].x, pitchAB, outTypeName, subdims[1].y);
- ret = kgenAddStmt(ctx, tmp);
-
- // write back the tile evaluated
- upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
- upFlags |= UPRES_EXCEED_PROBLEM_CONDITION;
- genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn,
- updateResGenericFn, NULL);
-
- kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
-
- if (isMatrixUpper(kflags)) {
- sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
- }
- else {
- sprintf(tmp, "currM += %lu;\n", subdims[0].y);
- }
- kgenAddStmt(ctx, tmp);
-
- kgenEndBranch(ctx, NULL); // loop over M
-
- kgenEndFuncBody(ctx);
- ret = kgenAddBlankLine(ctx);
-
- if (!ret) {
- ret = (ssize_t)kgenSourceSize(ctx) + 1;
- }
-
- destroyKgenContext(ctx);
-
- return (ret < 0) ? -EOVERFLOW : ret;
-}
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra)
-{
- const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
- KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags;
- int idx = 7;
-
- initSizeKarg(&args[0], blasArgs->M);
- initSizeKarg(&args[1], blasArgs->N);
- assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
- initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0);
- initSizeKarg(&args[4], blasArgs->lda.matrix);
- initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0);
- initSizeKarg(&args[6], blasArgs->ldb.matrix);
- if (kflags & KEXTRA_STARTM_NOT_ZERO) {
- initSizeKarg(&args[idx++], blasArgs->offsetM);
- }
- if (kflags & KEXTRA_STARTN_NOT_ZERO) {
- initSizeKarg(&args[idx++], blasArgs->offsetN);
- }
- if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
- initSizeKarg(&args[idx++], blasArgs->offA);
- }
- if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
- initSizeKarg(&args[idx++], blasArgs->offBX);
- }
-}
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs)
-{
- cl_ulong sizeA, sizeB, size;
- const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
-
- /*
- * It's needed one block for each matrix A and B,
- * and one block of size maximal of this one for
- * matrix B and matrix C
- */
-
- sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
- sizeB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
- size = matrBlockSize(dim, MATRIX_C, dtype, kargs->side);
- if (sizeB > size) {
- size = sizeB;
- }
- size += sizeA + sizeB;
-
- return (size * dtypeSize(dtype) <= ldsSize);
-}
-
-static SolverFlags
-solverFlags(void)
-{
- return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
-}
-
-void
-initTrsmLdsPattern(MemoryPattern *mempat)
-{
- mempat->name = "LDS based block trsm";
- mempat->nrLevels = 2;
- mempat->cuLevel = 0;
- mempat->thLevel = 1;
- mempat->sops = &solverOps;
-
- mpatExtra.aMset = CLMEM_LEVEL_LDS;
- mpatExtra.bMset = CLMEM_LEVEL_LDS;
- mpatExtra.mobjA = CLMEM_BUFFER;
- mpatExtra.mobjB = CLMEM_BUFFER;
- mempat->extra = &mpatExtra;
-}