summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/legacy/trsm_img.c
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/legacy/trsm_img.c')
-rw-r--r--external/clBLAS/src/library/blas/gens/legacy/trsm_img.c1165
1 files changed, 0 insertions, 1165 deletions
diff --git a/external/clBLAS/src/library/blas/gens/legacy/trsm_img.c b/external/clBLAS/src/library/blas/gens/legacy/trsm_img.c
deleted file mode 100644
index 54127c7b..00000000
--- a/external/clBLAS/src/library/blas/gens/legacy/trsm_img.c
+++ /dev/null
@@ -1,1165 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * Image based trsm generator
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-
-#include <clBLAS.h>
-#include <blas_mempat.h>
-#include <clkern.h>
-#include <clblas-internal.h>
-
-#include <matrix_dims.h>
-
-#include "blas_kgen_legacy.h"
-#include "gen_helper_legacy.h"
-#include "trsm_kgen_legacy.h"
-#include "../gen_helper.h"
-#include "../trsm_kgen.h"
-#include <dis_warning.h>
-
-static const char *trsmImDecl =
- "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
- "void __kernel\n"
- "%ctrsmIm(\n"
- " uint %c,\n"
- " uint %c,\n"
- " %s alpha,\n"
- " __read_only image2d_t A,\n"
- " __global %s *B,\n"
- " uint ldb,\n"
- " uint startRow,\n"
- " uint finishRow,\n"
- " uint offB)\n";
-
-/*
- * template for memory object based trsm preparation part
- * for one dimensional work space
- */
-static const char *trsmImPrep1D =
- "uint m0, k0;\n"
- "__local %s tempC[%lu];\n"
- "%s c[%u];\n"
- "const int lid = get_local_id(0);\n"
- "const int skew = lid %% %lu;\n"
- "%s" // groups per Panel variable
- "uint blockN;\n"
- "uint x, y, imx, imy;\n"
- "uint2 coordA, coordB;\n"
- "\n"
- "const uint currN = get_global_id(0) / %u * %lu;\n" // group ID
- "\n";
-
-static const char *readRectBlock =
- "y = (currN + %lu <= N) ? %lu : N - currN;\n"
- "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // just read with an optimized function
- " %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- " %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n"
- "}\n\n";
-
-static const char *readRectBlockOpt =
- // just read with an optimized function
- "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n";
-
-static const char *readRectBlockTrans =
- "y = (currN + %lu <= N) ? %lu : N - currN;\n"
- "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // read and transpose with an optimized function
- " %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- // read and transpose with slow function
- " %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n"
- "}\n\n";
-
-static const char *readRectBlockTransOpt =
- // read and transpose with an optimized function
- "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n";
-
-static ssize_t
-wrapper(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra);
-
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra);
-
-static ssize_t
-prepGenerator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra);
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra);
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs);
-
-static void
-calcNrThreads(
- size_t threads[2],
- const SubproblemDim *dims,
- const PGranularity *pgran,
- const void *args,
- const void *extra);
-
-
-static void
-imgPackMode(
- const void *extra,
- const SubproblemDim *dims,
- int dataID,
- unsigned int *packRate,
- clblasOrder *packOrder);
-
-static SolverFlags
-solverFlags(void);
-
-static SolverOps solverOps = {
- wrapper,
- assignKargs,
- isFitToLDS,
- NULL,
- NULL,
- calcNrThreads,
- imgPackMode,
- solverFlags,
- NULL, //fixupArgs
- NULL, //getDefaultDecomp
- NULL, //getDecompList
- NULL,
- NULL
-};
-
-static CLBLASMpatExtra mpatExtra;
-
-/* Prepare A kernel begin */
-
-static const char *trsmPrepDecl =
- "void __kernel\n"
- "%ctrsmPrepare(\n"
- " uint %c,\n"
- " __global %s *A,\n"
- " uint lda,\n"
- " __write_only image2d_t imA,\n"
- " uint startRow,\n"
- " uint offA)\n";
-
-/*
- * template for memory object based trsm preparation part
- * for one dimensional work space
- */
-static const char *trsmPrep1D =
- "__local %s tempA[%lu];\n"
- "__local %s tempC[%lu];\n"
- "int lid, gid;\n"
- "uint currM, k0;\n"
- "uint x, y, imx, imy;\n"
- "\n"
- "lid = get_local_id(0);\n"
- "gid = get_global_id(0) / %u;\n" // group ID
- "A += offA;\n"
- "\n";
-
-static const char *readSquareBlock =
- "y = (currM + %lu <= M) ? %lu : M - currM;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // just read with an optimized function
- " %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- " %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
- "}\n\n";
-
-static const char *readSquareBlockOpt =
- // just read with an optimized function
- "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";
-
-static const char *readSquareBlockTrans =
- "y = (currM + %lu <= M) ? %lu : M - currM;\n"
- "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
- "if ((y == %lu) && (x == %lu)) {\n"
- // read and transpose with an optimized function
- " %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
- "}\n"
- "else {\n"
- " %s((__local float4*)temp%c);\n" // zeroing
- " barrier(CLK_LOCAL_MEM_FENCE);\n"
- // read and transpose with slow function
- " %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
- "}\n\n";
-
-static const char *readSquareBlockTransOpt =
- // read and transpose with an optimized function
- "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
-
-
-static bool
-useTransposedMul(const SubproblemDim *dims, DataType dtype, bool trb)
-{
- unsigned int vecLen;
-
- vecLen = sizeof(cl_float4) / dtypeSize(dtype);
-
- return (!(trb || isComplexType(dtype) || (dims[1].x % vecLen)));
-}
-
-static size_t
-calcPitchB(const SubproblemDim *dim, DataType dtype, bool transpMul)
-{
- size_t ret;
- size_t tsize;
-
- tsize = dtypeSize(dtype);
- ret = (transpMul) ? dim->x : dim->bwidth;
- ret = fl4RowWidth(ret, tsize) * sizeof(cl_float4) / tsize;
-
- return ret;
-}
-
-static void
-genPrepareSquareBlock(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- DataType dtype,
- const CopyBufFuncs *copyFuncs,
- const ZeroFuncs *zeroFuncs,
- bool tra,
- char c,
- bool opt)
-{
- char tmp[1024];
- size_t pitch;
- const char *readBlock;
-
- pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
- if (opt) {
- readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
- sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
- }
- else {
- readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
- sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth,
- dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
- zeroFuncs->names[MATRIX_A], c,
- copyFuncs->readGeneric[MATRIX_A], c, pitch);
- }
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genPrepZeroBlockC(
- struct KgenContext *ctx,
- const ZeroFuncs *zeroFuncs)
-{
- char tmp[1024];
- sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_A]);
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genWriteBlock(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- const CopyBufFuncs *copyFuncs)
-{
- char tmp[1024];
-
- sprintf(tmp, "%s(imA, imx, imy, (LPtr)tempC, %lu, %lu, %lu);\n",
- copyFuncs->write, dim[0].y, dim[0].y, dim[0].y);
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-getBufferPos(struct KgenContext *ctx, bool isU) //n -> x,y buffer
-{
- kgenDeclareFunction(ctx, "void\ngetBufferPos(uint n, uint startRow, "
- "uint width, uint *y, "
- "uint *x)\n");
- kgenBeginFuncBody(ctx);
- if (isU) {
- //n from beginning
- kgenAddStmt(ctx, "n += (2 * width - startRow + 1) * (startRow) / 2;\n");
- kgenAddStmt(ctx, "*y = trunc((2 * width + 1) - "
- "sqrt((2 * width + 1) *"
- "(2 * width + 1) - 8 * n)) / 2;\n");
- kgenAddStmt(ctx, "*x = *y + n - (2 * width - *y + 1) * (*y) / 2;\n");
- }
- else {
- //n from beginning
- kgenAddStmt(ctx, "n += startRow * (startRow + 1) / 2;\n");
- kgenAddStmt(ctx, "*y = trunc((-0.5 + sqrt(2.0 * n + 0.25)));\n");
- kgenAddStmt(ctx, "*x = n - (*y) * (*y + 1) / 2;\n");
- }
- kgenEndFuncBody(ctx);
-
- kgenAddBlankLine(ctx);
-}
-
-static void
-genGetImagePos(
- struct KgenContext *ctx,
- const SubproblemDim *subdims,
- DataType dtype,
- const char *blockName,
- bool tra) //n -> x,y image
-{
- char tmp[1024];
- const char *parName;
- const char *op[2] = {"/", "%"};
-
- parName = (tra) ? "bpc" : "bpr";
-
- sprintf(tmp, "imy = %s %s %s * %lu;\n"
- "imx = (%s %s %s) * %lu;\n",
- blockName, op[tra], parName, subdims[0].y,
- blockName, op[1 - tra], parName,
- subdims[0].y * dtypeSize(dtype) / sizeof(cl_float4));
- kgenAddStmt(ctx, tmp);
-}
-
-// global memory to image converter
-static ssize_t
-prepGenerator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra)
-{
- struct KgenContext *ctx;
- CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
- char tmp[1024];
- const char *typeName;
- CopyBufFuncs copyFuncs;
- ZeroFuncs zeroFuncs;
- char fpref;
- DataType dtype = kextra->dtype;
- KernelExtraFlags kflags = kextra->flags;
- ssize_t ret;
- size_t pitchAB;
- bool b;
- bool tra, trb, isU, transpMul;
- BlasGenSettings gset;
-
- if (pgran->wgDim != 1) {
- return -EINVAL;
- }
-
- ctx = createKgenContext(buf, buflen, true);
- if (ctx == NULL) {
- return -ENOMEM;
- }
-
- tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
- trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
- isU = isMatrixUpper(kflags);
-
- // at first, generate needed declarations and auxiliary functions
-
- b = isDoubleBasedType(dtype);
- kgenDeclareUptrs(ctx, b);
-
- if (isComplexType(dtype)) {
- genComplexMathOperators(ctx, dtype);
- }
-
- memset(&gset, 0, sizeof(gset));
- memcpy(gset.subdims, subdims, sizeof(gset.subdims));
- gset.kextra = kextra;
- gset.pgran = pgran;
-
- generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset,
- BCHF_MATRIX_A | BCHF_WRITE_OUTPUT | BCHF_IMAGE_WRITE);
- generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
- ZF_MATRIX_A);
-
- //matrix inversion function
- genInvertingBlockFunc(ctx, (unsigned int)subdims[0].bwidth, dtype, isU);
-
- //coordinates calculation
- getBufferPos(ctx, isU);
-
- typeName = dtypeBuiltinType(dtype);
- fpref = dtypeToBlasPrefix(dtype);
-
- // now, generate the kernel
-
- sprintf(tmp, trsmPrepDecl, fpref, 'M', typeName,
- typeName, typeName, typeName);
-
- kgenDeclareFunction(ctx, tmp);
- ret = kgenBeginFuncBody(ctx);
-
- transpMul = useTransposedMul(subdims, dtype, trb);
- if (!transpMul) {
- sprintf(tmp, "const int bpr = get_image_width(imA) / %lu;\n",
- subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype)));
- }
- else {
- sprintf(tmp, "const int bpc = get_image_height(imA) / %lu;\n",
- subdims[0].y);
- }
- kgenAddStmt(ctx, tmp);
-
- /*
- * Calculate local buffer pitches, and then insert the
- * preparative code
- */
- pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
- sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y,
- typeName, pitchAB * subdims[0].y, pgran->wgSize[0]);
- ret = kgenAddStmt(ctx, tmp);
-
- sprintf(tmp, "getBufferPos(gid, startRow / %lu, (M + %lu) / %lu, &currM, &k0);\n",
- subdims[0].y, subdims[0].y - 1, subdims[0].y);
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "currM *= %lu;\n"
- "k0 *= %lu;\n", subdims[0].y, subdims[0].y);
- kgenAddStmt(ctx, tmp);
-
- genGetImagePos(ctx, subdims, dtype, "gid", transpMul);
-
- kgenBeginBranch(ctx, "if (currM == k0)");
- genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
- tra, 'A', !(kextra->flags & KEXTRA_TAILS_M));
- genPrepZeroBlockC(ctx, &zeroFuncs);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
- if (kextra->flags & KEXTRA_UNIT_DIAGONAL) {
- sprintf(tmp, "if (lid < %lu) {\n"
- " tempA[lid * %lu + lid] = %s;\n"
- "}\n",
- subdims[0].bwidth, pitchAB, strOne(dtype));
- kgenAddStmt(ctx, tmp);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenAddBlankLine(ctx);
- }
-
- sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth);
- kgenBeginBranch(ctx, tmp);
- sprintf(tmp, "invert(tempA, tempC, lid, (currM + %lu > M) ? "
- "M - currM : %lu);\n",
- subdims[0].y, subdims[0].y);
- kgenAddStmt(ctx, tmp);
- kgenEndBranch(ctx, NULL);
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenEndBranch(ctx, NULL);
-
- kgenBeginBranch(ctx, "else");
- genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs, tra,
- 'C', !(kextra->flags & KEXTRA_TAILS_M));
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenEndBranch(ctx, NULL);
-
- genWriteBlock(ctx, subdims, &copyFuncs);
- kgenEndFuncBody(ctx);
- ret = kgenAddBlankLine(ctx);
-
- if (!ret) {
- ret = (ssize_t)kgenSourceSize(ctx) + 1;
- }
-
- destroyKgenContext(ctx);
-
- return (ret < 0) ? -EOVERFLOW : ret;
-}
-
-static void
-genZeroResult(
- struct KgenContext *ctx,
- DataType dtype,
- const SubproblemDim *dims)
-{
- unsigned int n;
- char tmp[1024];
- unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
-
- getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL);
-
- sprintf(tmp, "for (x = 0; x < %u; x++) {\n"
- " c[x] = 0;\n"
- "}\n\n", n);
-
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genPrepareRectBlock(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- DataType dtype,
- const CopyBufFuncs *copyFuncs,
- const ZeroFuncs *zeroFuncs,
- bool trb,
- char c,
- bool opt)
-{
- char tmp[1024];
- size_t pitch;
- const char *readBlock;
- size_t bsizes[2] = {dim->bwidth, dim->x};
-
- /*
- * NOTE: in case of accessing to B in the non transposed way
- * block multiplication is done with transposed block B
- */
- pitch = calcPitchB(dim, dtype, !trb);
- if (opt) {
- readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt;
- sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c);
- }
- else {
- readBlock = (trb) ? readRectBlockTrans : readRectBlock;
- sprintf(tmp, readBlock, bsizes[trb], bsizes[trb], bsizes[1 - trb],
- bsizes[1 - trb], bsizes[trb], bsizes[1 - trb],
- copyFuncs->read[MATRIX_B], c, zeroFuncs->names[MATRIX_B], c,
- copyFuncs->readGeneric[MATRIX_B], c, pitch);
- }
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-getNblock(struct KgenContext *ctx, bool isU) //x, y -> n
-{
- kgenDeclareFunction(ctx, "void\ngetNBlock(uint y, uint x, uint startRow, "
- "uint width, uint *n)\n");
- kgenBeginFuncBody(ctx);
- if (isU) {
- kgenAddStmt(ctx, "*n = ((2 * width - y + 1) * y - "
- "(2 * width - startRow + 1) * startRow) / 2 + x - y;\n");
- }
- else {
- kgenAddStmt(ctx, "*n = (y * (y + 1) - startRow * (startRow + 1)) / 2 + x;\n");
- }
- kgenEndFuncBody(ctx);
- kgenAddBlankLine(ctx);
-}
-
-static void
-genMultiplication(
- struct KgenContext *ctx,
- const SubproblemDim *dims,
- DataType dtype,
- const char *blkmulName,
- BlkMulFlags mulFlags)
-{
- char tmp[1024];
- size_t u;
- unsigned int l1Pans;
-
- l1Pans = (unsigned int)(dims[0].x / dims[1].x);
- if (mulFlags & BLKMUL_TRANSPOSED_B) {
- u = 1;
- }
- else {
- u = matrBlockPitch(dims, MATRIX_B, dtype, clblasLeft);
- }
-
- // find image position and invoke the multiplier
- sprintf(tmp, "getNBlock(m0 / %lu, k0 / %lu, startRow / %lu, "
- "(M + %lu) / %lu, &blockN);\n",
- dims[0].y, dims[0].y, dims[0].y, dims[0].y - 1, dims[0].y);
- kgenAddStmt(ctx, tmp);
- genGetImagePos(ctx, dims, dtype, "blockN", (mulFlags & BLKMUL_TRANSPOSED_B) != 0);
- sprintf(tmp, "%s(A, (int2)(imx, imy + lid / %u * %lu), \n"
- " (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
- " c, skew);\n",
- blkmulName, l1Pans, dims[1].y, l1Pans, dims[1].x, u);
- kgenAddStmt(ctx, tmp);
-}
-
-static void
-genReorderSolution(
- struct KgenContext *ctx,
- const SubproblemDim *subdims,
- const char *outTypeName,
- unsigned int colRegs)
-{
- char tmp[1024], tmp1[1024];
- char *p;
- unsigned i;
-
- sprintf(tmp, "void\n"
- "reorderResult(%s *c, int skew)",
- outTypeName);
- kgenDeclareFunction(ctx, tmp);
- kgenBeginFuncBody(ctx);
-
- sprintf(tmp, "%s tmp;\n"
- "int i, j;\n",
- outTypeName);
- kgenAddStmt(ctx, tmp);
-
- p = tmp1;
- for (i = 0; i < colRegs; i++) {
- unsigned int k = (unsigned int)(subdims[1].y - 1) * colRegs + i;
-
- sprintf(p, "\n"
- " tmp = c[%u];\n"
- " for (j = %lu; j >= 0; j--) {\n"
- " c[(j+1) * %u + %u] = c[j * %u + %u];\n"
- " }\n"
- " c[%u] = tmp;\n",
- k, subdims[1].y - 2, colRegs, i, colRegs, i, i);
- p += strlen(p);
- }
-
- sprintf(tmp, "\n"
- "for (i = 0; i < skew; i++) {\n"
- "%s"
- "}\n"
- "\n",
- tmp1);
- kgenAddStmt(ctx, tmp);
-
- kgenEndFuncBody(ctx);
- kgenAddBlankLine(ctx);
-}
-
-static void
-initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
-{
- kvars->A = "imgA";
- kvars->B = "B";
-
- if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
- kvars->coordA = "coordA.x";
- }
- else {
- kvars->coordA = "coordA.y";
- }
- if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
- kvars->coordB = "coordB.x";
- }
- else {
- kvars->coordB = "coordB.y";
- }
-
- kvars->sizeM = "M";
- kvars->sizeN = "N";
- kvars->sizeK = "origM";
-}
-
-// image based kernel generator
-static ssize_t
-generator(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra)
-{
- struct KgenContext *ctx;
- CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
- CLBLASKernExtra kextraTmp = *kextra;
- char tmp[1024], tmp1[1024];
- char blkmul[FUNC_NAME_MAXLEN];
- char updateResFn[FUNC_NAME_MAXLEN];
- char updateResGenericFn[FUNC_NAME_MAXLEN];
- char updateResFnRev[FUNC_NAME_MAXLEN];
- char updateResGenericFnRev[FUNC_NAME_MAXLEN];
- char copyPLFn[FUNC_NAME_MAXLEN];
- char *s1 = "";
- const char *typeName;
- CopyBufFuncs copyFuncs;
- ZeroFuncs zeroFuncs;
- char fpref;
- DataType dtype = kextra->dtype;
- ssize_t ret;
- BlasGenSettings gset;
- BlkMulOpts mulOpts;
- BlkMulFlags mulFlags;
- size_t pitchAB;
- size_t u;
- bool b;
- bool isU;
- bool areTails;
- const char *outTypeName;
- unsigned int nrRegs, colRegs;
- KernelExtraFlags kflags = kextra->flags;
- size_t tsize;
- unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
- UpdateResultFlags upFlags;
- int tra, trb;
- unsigned int l1Pans;
- char vect[2] = {'y', 'x'};
-
- if (pgran->wgDim != 1) {
- return -EINVAL;
- }
-
- ctx = createKgenContext(buf, buflen, true);
- if (ctx == NULL) {
- return -ENOMEM;
- }
-
- tsize = dtypeSize(dtype);
- areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
- isU = isMatrixUpper(kflags);
-
- tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
- trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
- l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
-
- /*
- * Force generation of the transposed version of the block
- * reading function with following multiplication with transposed
- * block B to decrease LDS bank conflicts without column skew using.
- * Reverse temporarily the flag of the column-major order for that
- */
- if (useTransposedMul(subdims, dtype, trb)) {
- if (kflags & KEXTRA_COLUMN_MAJOR) {
- kflags &= ~KEXTRA_COLUMN_MAJOR;
- }
- else {
- kflags |= KEXTRA_COLUMN_MAJOR;
- }
- mulFlags = BLKMUL_SKEW_ROW | BLKMUL_TRANSPOSED_B;
- u = subdims[1].y;
- }
- else {
- mulFlags = BLKMUL_SKEW_COLUMN;
- u = subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype));
- }
-
- ctx = createKgenContext(buf, buflen, true);
- if (ctx == NULL) {
- return -ENOMEM;
- }
-
- // at first, generate needed declarations and auxiliary functions
-
- b = isDoubleBasedType(dtype);
- kgenDeclareUptrs(ctx, b);
-
- kextraTmp.flags = kflags;
- memset(&gset, 0, sizeof(gset));
- memcpy(gset.subdims, subdims, sizeof(gset.subdims));
- gset.kextra = &kextraTmp;
- gset.pgran = pgran;
- initKernelVarNames(&gset.varNames, kextra->flags);
-
- if (isComplexType(dtype)) {
- genComplexMathOperators(ctx, dtype);
- }
-
- generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_B);
- /*
- * Temporary kernel extra has been needed to produce inverted block B read.
- * Restore the original one, and restore kflags as well
- */
- gset.kextra = kextra;
- kflags = kextra->flags;
-
- // functions updating result
- // for the final result
- generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn,
- updateResGenericFn);
- // for intermediate result after blocks modification
- upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
- upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
- genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
- updateResGenericFnRev);
- // for heaping before multiplying on inverted block
- upFlags = UPRES_USE_LDS;
- if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
- upFlags |= UPRES_COLUMN_MAJOR;
- }
- updateResultGenOld(ctx, &gset, UPRES_SET, upFlags, NULL);
- kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx);
- kgenAddBlankLine(ctx);
-
- generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
- ZF_MATRIX_B | ZF_MATRIX_C);
-
- // block multiplication function
- mulOpts.aMobj = CLMEM_IMAGE;
- mulOpts.bMobj = CLMEM_BUFFER;
- mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | mulFlags;
- if (isComplexType(dtype)) {
- mulOpts.core = BLKMUL_SEPARATE_MULADD;
- }
- else {
- mulOpts.core = BLKMUL_MAD;
- }
- ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
- if (ret) {
- destroyKgenContext(ctx);
-
- return -EOVERFLOW;
- }
-
- kgenAddBlankLine(ctx);
- kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
-
- typeName = dtypeBuiltinType(dtype);
- fpref = dtypeToBlasPrefix(dtype);
-
- // block number calculation
- getNblock(ctx, isU);
-
- getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
- if (isComplexType(dtype)) {
- colRegs = (unsigned int)subdims[1].x;
- }
- else {
- colRegs = (unsigned int)fl4RowWidth(subdims[1].x, tsize);
- }
-
- if (mulFlags & BLKMUL_SKEW_ROW) {
- genReorderSolution(ctx, subdims, outTypeName, colRegs);
- }
-
- // now, generate the kernel
-
- if (kflags & KEXTRA_SIDE_RIGHT) {
- sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
- fpref, 'N', 'M', typeName, typeName, typeName, typeName);
- }
- else {
- sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
- fpref, 'M', 'N', typeName, typeName, typeName, typeName);
- }
-
- kgenDeclareFunction(ctx, tmp);
- ret = kgenBeginFuncBody(ctx);
-
- if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
- sprintf(tmp, "const int bpr = get_image_width(A) / %lu;\n",
- subdims[0].y / (sizeof(cl_float4) / tsize));
- }
- else {
- sprintf(tmp, "const int bpc = get_image_height(A) / %lu;\n",
- subdims[0].y);
- }
- kgenAddStmt(ctx, tmp);
-
- /*
- * Calculate local buffer pitches, and then insert the
- * preparative code
- */
- pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
-
- sprintf(tmp, trsmImPrep1D, typeName, pitchAB * subdims[0].x,
- outTypeName, nrRegs, u, s1, pgran->wgSize[0], subdims[0].itemX);
- kgenAddStmt(ctx, tmp);
- kgenAddBlankLine(ctx);
-
- kgenAddStmt(ctx, "B += offB;\n");
- sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
- "coordB.%c = 0;\n\n",
- vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
- kgenAddStmt(ctx, tmp);
-
- /*
- * B matrix is divided on panels, each work group
- * multiply such a panel on the whole matrix A.
- */
-
- // top level loop over M
- if (isU) {
- sprintf(tmp1, "(((finishRow - 1) / %lu) * %lu)", subdims[0].y,
- subdims[0].y); //last block start
- sprintf(tmp, "for (m0 = %s; m0 + %lu != startRow; m0 -= %lu)",
- tmp1, subdims[0].y, subdims[0].y);
- ret = kgenBeginBranch(ctx, tmp);
- }
- else {
- sprintf(tmp, "for (m0 = startRow; m0 < finishRow; m0 += %lu)",
- subdims[0].y);
- ret = kgenBeginBranch(ctx, tmp);
- }
-
- sprintf(tmp, "coordA.%c = m0 + lid / %u * %lu;\n"
- "coordA.%c = 0;\n\n",
- vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
- kgenAddStmt(ctx, tmp);
-
- genZeroResult(ctx, dtype, subdims);
-
- // loop over K
- if (isU) {
- sprintf(tmp, "for (k0 = m0 + %lu; k0 < M; k0 += %lu)",
- subdims[0].bwidth, subdims[0].bwidth);
- }
- else {
- sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)",
- subdims[0].bwidth);
- }
- ret = kgenBeginBranch(ctx, tmp);
-
- genPrepareRectBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
- trb, 'C', !areTails);
-
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
- // multiplication in the adjusting loop
- genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
-
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- kgenEndBranch(ctx, NULL); // loop over K
- kgenAddBlankLine(ctx);
-
- if (mulFlags & BLKMUL_SKEW_ROW) {
- kgenAddStmt(ctx, "reorderResult(c, skew);\n");
- }
- kgenAddStmt(ctx, "k0 = m0;\n");
-
- genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev,
- updateResGenericFnRev, true);
-
- genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC");
- kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
- genZeroResult(ctx, dtype, subdims);
-
- // multiplication on the inverted block
- genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
- if (mulFlags & BLKMUL_SKEW_ROW) {
- kgenAddStmt(ctx, "reorderResult(c, skew);\n");
- }
-
- // write back the tile evaluated
- upFlags = UPRES_EXCEED_PROBLEM_CONDITION;
- if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_C)) {
- upFlags |= UPRES_COLUMN_MAJOR;
- }
- genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn,
- updateResGenericFn, NULL);
-
- kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
-
- // end external loops over panels of matrix A
- kgenEndBranch(ctx, NULL);
- kgenEndFuncBody(ctx);
- ret = kgenAddBlankLine(ctx);
-
- if (!ret) {
- ret = (ssize_t)kgenSourceSize(ctx) + 1;
- }
-
- destroyKgenContext(ctx);
-
- return (ret < 0) ? -EOVERFLOW : ret;
-}
-
-static ssize_t
-wrapper(
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra)
-{
- CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
- if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) {
- return generator(buf, buflen, subdims, pgran, extra);
- }
- else {
- return prepGenerator(buf, buflen, subdims, pgran, extra);
- }
-}
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra)
-{
- const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
-
- (void)extra;
-
- if (blasArgs->kernType == CLBLAS_COMPUTING_KERNEL) {
- if (blasArgs->side == clblasLeft) {
- initSizeKarg(&args[0], blasArgs->K);
- initSizeKarg(&args[1], blasArgs->N);
- }
- else {
- initSizeKarg(&args[0], blasArgs->M);
- initSizeKarg(&args[1], blasArgs->K);
- }
- assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
- initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
- initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0);
- initSizeKarg(&args[5], blasArgs->ldb.matrix);
- if (blasArgs->side == clblasLeft) {
- initSizeKarg(&args[6], blasArgs->offsetM);
- initSizeKarg(&args[7], blasArgs->M + blasArgs->offsetM);
- }
- else {
- initSizeKarg(&args[6], blasArgs->offsetN);
- initSizeKarg(&args[7], blasArgs->N + blasArgs->offsetN);
- }
- initSizeKarg(&args[8], blasArgs->offBX);
- }
- else {
- if (blasArgs->side == clblasLeft) {
- initSizeKarg(&args[0], blasArgs->M);
- }
- else {
- initSizeKarg(&args[0], blasArgs->N);
- }
- initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0);
- initSizeKarg(&args[2], blasArgs->lda.matrix);
- initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
- if (blasArgs->side == clblasLeft) {
- initSizeKarg(&args[4], blasArgs->offsetM);
- }
- else {
- initSizeKarg(&args[4], blasArgs->offsetN);
- }
- initSizeKarg(&args[5], blasArgs->offA);
- }
-}
-
-static bool
-isFitToLDS(
- SubproblemDim *dim,
- DataType dtype,
- cl_ulong ldsSize,
- const void *kernelArgs)
-{
- cl_ulong sizeA, sizeB, size;
- const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
-
- /*
- * For prepare kernel two square local blocks required.
- * For main kernel two rectangular blocks required.
- * Maximum of these two values checked.
- */
-
- sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
- sizeB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
- size = (sizeA > sizeB) ? sizeA : sizeB;
-
- return (2 * size * dtypeSize(dtype) <= ldsSize);
-}
-
-static void
-calcNrThreads(
- size_t threads[2],
- const SubproblemDim *dims,
- const PGranularity *pgran,
- const void *args,
- const void *extra)
-{
- SubproblemDim globDim, offDim;
- const CLBlasKargs *kargs = (const CLBlasKargs*)args;
- size_t width, startBlock, finishBlock;
- bool isU = (kargs->uplo == clblasUpper) ^
- (kargs->transA != clblasNoTrans) ^ (kargs->side == clblasRight);
-
- (void)extra;
-
- width = kargs->K;
- width = (width + dims[0].bwidth - 1) / dims[0].bwidth;
- kargsToProbDims(&globDim, CLBLAS_TRSM, kargs, false);
- kargsToProbDims(&offDim, CLBLAS_TRSM, kargs, true);
-
- startBlock = offDim.y / dims[0].bwidth;
- finishBlock = (globDim.y + offDim.y + dims[0].bwidth - 1) / dims[0].bwidth;
-
- if (kargs->kernType == CLBLAS_PREP_A_KERNEL) {
- if (isU) {
- threads[0] = ((2 * width - startBlock - finishBlock + 1) *
- (finishBlock - startBlock) / 2) * pgran->wgSize[0];
- }
- else {
- threads[0] = ((1 + finishBlock + startBlock) *
- (finishBlock - startBlock) / 2) * pgran->wgSize[0];
- }
- threads[1] = 0;
- }
- else {
- calcGlobalThreads(threads, dims, pgran, globDim.y, globDim.x);
- }
-}
-
-static void
-imgPackMode(
- const void *extra,
- const SubproblemDim *dims,
- int dataID,
- unsigned int *packRate,
- clblasOrder *packOrder)
-{
- bool trb;
- const CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
-
- (void)dataID;
-
- trb = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B);
- if (trb || isComplexType(kextra->dtype)) {
- *packOrder = clblasRowMajor;
- *packRate = (unsigned int)dims[0].y;
- }
- else {
- *packOrder = clblasColumnMajor;
- *packRate = (unsigned int)dims[0].y;
- }
-}
-
-static SolverFlags
-solverFlags(void)
-{
- return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
-}
-
-void
-initTrsmImgPattern(MemoryPattern *mempat)
-{
- mempat->name = "Image based block trsm";
- mempat->nrLevels = 2;
- mempat->cuLevel = 0;
- mempat->thLevel = 1;
- mempat->sops = &solverOps;
- mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
- mpatExtra.bMset = CLMEM_LEVEL_LDS;
- mpatExtra.mobjA = CLMEM_IMAGE;
- mpatExtra.mobjB = CLMEM_BUFFER;
- mempat->extra = &mpatExtra;
-}