1 files changed, 0 insertions, 1165 deletions
diff --git a/external/clBLAS/src/library/blas/gens/legacy/trsm_img.c b/external/clBLAS/src/library/blas/gens/legacy/trsm_img.c
deleted file mode 100644
index 54127c7b..00000000
--- a/external/clBLAS/src/library/blas/gens/legacy/trsm_img.c
+++ /dev/null
@@ -1,1165 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * Image based trsm generator
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-
-#include <clBLAS.h>
-#include <blas_mempat.h>
-#include <clkern.h>
-#include <clblas-internal.h>
-
-#include <matrix_dims.h>
-
-#include "blas_kgen_legacy.h"
-#include "gen_helper_legacy.h"
-#include "trsm_kgen_legacy.h"
-#include "../gen_helper.h"
-#include "../trsm_kgen.h"
-#include <dis_warning.h>
-
-static const char *trsmImDecl =
-    "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n"
-    "void __kernel\n"
-    "%ctrsmIm(\n"
-    "    uint %c,\n"
-    "    uint %c,\n"
-    "    %s alpha,\n"
-    "    __read_only image2d_t A,\n"
-    "    __global %s *B,\n"
-    "    uint ldb,\n"
-    "    uint startRow,\n"
-    "    uint finishRow,\n"
-    "    uint offB)\n";
-
-/*
- *  template for memory object based trsm preparation part
- *  for one dimensional work space
- */
-static const char *trsmImPrep1D =
-    "uint m0, k0;\n"
-    "__local %s tempC[%lu];\n"
-    "%s c[%u];\n"
-    "const int lid = get_local_id(0);\n"
-    "const int skew = lid %% %lu;\n"
-    "%s"                                    // groups per Panel variable
-    "uint blockN;\n"
-    "uint x, y, imx, imy;\n"
-    "uint2 coordA, coordB;\n"
-    "\n"
-    "const uint currN = get_global_id(0) / %u * %lu;\n"       // group ID
-    "\n";
-
-static const char *readRectBlock =
-    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
-    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
-    "if ((y == %lu) && (x == %lu)) {\n"
-    // just read with an optimized function
-    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"
-    "}\n"
-    "else {\n"
-    "    %s((__local float4*)temp%c);\n"           // zeroing
-    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
-    "    %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n"
-    "}\n\n";
-
-static const char *readRectBlockOpt =
-    // just read with an optimized function
-    "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n";
-
-static const char *readRectBlockTrans =
-    "y = (currN + %lu <= N) ? %lu : N - currN;\n"
-    "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n"
-    "if ((y == %lu) && (x == %lu)) {\n"
-    // read and transpose with an optimized function
-    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"
-    "}\n"
-    "else {\n"
-    "    %s((__local float4*)temp%c);\n"           // zeroing
-    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
-    // read and transpose with slow function
-    "    %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n"
-    "}\n\n";
-
-static const char *readRectBlockTransOpt =
-    // read and transpose with an optimized function
-    "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n";
-
-static ssize_t
-wrapper(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra);
-
-static ssize_t
-generator(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra);
-
-static ssize_t
-prepGenerator(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra);
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra);
-
-static bool
-isFitToLDS(
-    SubproblemDim *dim,
-    DataType dtype,
-    cl_ulong ldsSize,
-    const void *kernelArgs);
-
-static void
-calcNrThreads(
-    size_t threads[2],
-    const SubproblemDim *dims,
-    const PGranularity *pgran,
-    const void *args,
-    const void *extra);
-
-
-static void
-imgPackMode(
-    const void *extra,
-    const SubproblemDim *dims,
-    int dataID,
-    unsigned int *packRate,
-    clblasOrder *packOrder);
-
-static SolverFlags
-solverFlags(void);
-
-static SolverOps solverOps = {
-    wrapper,
-    assignKargs,
-    isFitToLDS,
-    NULL,
-    NULL,
-    calcNrThreads,
-    imgPackMode,
-    solverFlags,
-    NULL, //fixupArgs
-    NULL, //getDefaultDecomp
-   	NULL, //getDecompList
-   	NULL,
-   	NULL
-};
-
-static CLBLASMpatExtra mpatExtra;
-
-/* Prepare A kernel begin */
-
-static const char *trsmPrepDecl =
-    "void __kernel\n"
-    "%ctrsmPrepare(\n"
-    "    uint %c,\n"
-    "    __global %s *A,\n"
-    "    uint lda,\n"
-    "    __write_only image2d_t imA,\n"
-    "    uint startRow,\n"
-    "    uint offA)\n";
-
-/*
- * template for memory object based trsm preparation part
- * for one dimensional work space
- */
-static const char *trsmPrep1D =
-    "__local %s tempA[%lu];\n"
-    "__local %s tempC[%lu];\n"
-    "int lid, gid;\n"
-    "uint currM, k0;\n"
-    "uint x, y, imx, imy;\n"
-    "\n"
-    "lid = get_local_id(0);\n"
-    "gid = get_global_id(0) / %u;\n"      // group ID
-    "A += offA;\n"
-    "\n";
-
-static const char *readSquareBlock =
-    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
-    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
-    "if ((y == %lu) && (x == %lu)) {\n"
-    // just read with an optimized function
-    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"
-    "}\n"
-    "else {\n"
-    "    %s((__local float4*)temp%c);\n"          // zeroing
-    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
-    "    %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n"
-    "}\n\n";
-
-static const char *readSquareBlockOpt =
-    // just read with an optimized function
-    "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n";
-
-static const char *readSquareBlockTrans =
-    "y = (currM + %lu <= M) ? %lu : M - currM;\n"
-    "x = (k0 + %lu <= M) ? %lu : M - k0;\n"
-    "if ((y == %lu) && (x == %lu)) {\n"
-    // read and transpose with an optimized function
-    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"
-    "}\n"
-    "else {\n"
-    "    %s((__local float4*)temp%c);\n"          // zeroing
-    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
-    // read and transpose with slow function
-    "    %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n"
-    "}\n\n";
-
-static const char *readSquareBlockTransOpt =
-    // read and transpose with an optimized function
-    "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n";
-
-
-static bool
-useTransposedMul(const SubproblemDim *dims, DataType dtype, bool trb)
-{
-    unsigned int vecLen;
-
-    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
-
-    return (!(trb || isComplexType(dtype) || (dims[1].x % vecLen)));
-}
-
-static size_t
-calcPitchB(const SubproblemDim *dim, DataType dtype, bool transpMul)
-{
-    size_t ret;
-    size_t tsize;
-
-    tsize = dtypeSize(dtype);
-    ret = (transpMul) ? dim->x : dim->bwidth;
-    ret = fl4RowWidth(ret, tsize) * sizeof(cl_float4) / tsize;
-
-    return ret;
-}
-
-static void
-genPrepareSquareBlock(
-    struct KgenContext *ctx,
-    const SubproblemDim *dim,
-    DataType dtype,
-    const CopyBufFuncs *copyFuncs,
-    const ZeroFuncs *zeroFuncs,
-    bool tra,
-    char c,
-    bool opt)
-{
-    char tmp[1024];
-    size_t pitch;
-    const char *readBlock;
-
-    pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft);
-    if (opt) {
-        readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt;
-        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c);
-    }
-    else {
-        readBlock = (tra) ? readSquareBlockTrans : readSquareBlock;
-        sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth,
-                dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c,
-                zeroFuncs->names[MATRIX_A], c,
-                copyFuncs->readGeneric[MATRIX_A], c, pitch);
-    }
-    kgenAddStmt(ctx, tmp);
-}
-
-static void
-genPrepZeroBlockC(
-    struct KgenContext *ctx,
-    const ZeroFuncs *zeroFuncs)
-{
-    char tmp[1024];
-    sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_A]);
-    kgenAddStmt(ctx, tmp);
-}
-
-static void
-genWriteBlock(
-    struct KgenContext *ctx,
-    const SubproblemDim *dim,
-    const CopyBufFuncs *copyFuncs)
-{
-    char tmp[1024];
-
-    sprintf(tmp, "%s(imA, imx, imy, (LPtr)tempC, %lu, %lu, %lu);\n",
-        copyFuncs->write, dim[0].y, dim[0].y, dim[0].y);
-    kgenAddStmt(ctx, tmp);
-}
-
-static void
-getBufferPos(struct KgenContext *ctx, bool isU) //n -> x,y buffer
-{
-    kgenDeclareFunction(ctx, "void\ngetBufferPos(uint n, uint startRow, "
-                                                "uint width, uint *y, "
-                                                "uint *x)\n");
-    kgenBeginFuncBody(ctx);
-    if (isU) {
-        //n from beginning
-        kgenAddStmt(ctx, "n += (2 * width - startRow + 1) * (startRow) / 2;\n");
-        kgenAddStmt(ctx, "*y = trunc((2 * width + 1) - "
-                               "sqrt((2 * width + 1) *"
-                               "(2 * width + 1) - 8 * n)) / 2;\n");
-        kgenAddStmt(ctx, "*x = *y + n - (2 * width - *y + 1) * (*y) / 2;\n");
-    }
-    else {
-        //n from beginning
-        kgenAddStmt(ctx, "n += startRow * (startRow + 1) / 2;\n");
-        kgenAddStmt(ctx, "*y = trunc((-0.5 + sqrt(2.0 * n + 0.25)));\n");
-        kgenAddStmt(ctx, "*x = n - (*y) * (*y + 1) / 2;\n");
-    }
-    kgenEndFuncBody(ctx);
-
-    kgenAddBlankLine(ctx);
-}
-
-static void
-genGetImagePos(
-    struct KgenContext *ctx,
-    const SubproblemDim *subdims,
-    DataType dtype,
-    const char *blockName,
-    bool tra) //n -> x,y image
-{
-    char tmp[1024];
-    const char *parName;
-    const char *op[2] = {"/", "%"};
-
-    parName = (tra) ? "bpc" : "bpr";
-
-    sprintf(tmp, "imy = %s %s %s * %lu;\n"
-                 "imx = (%s %s %s) * %lu;\n",
-            blockName, op[tra], parName, subdims[0].y,
-            blockName, op[1 - tra], parName,
-            subdims[0].y * dtypeSize(dtype) / sizeof(cl_float4));
-    kgenAddStmt(ctx, tmp);
-}
-
-// global memory to image converter
-static ssize_t
-prepGenerator(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra)
-{
-    struct KgenContext *ctx;
-    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
-    char tmp[1024];
-    const char *typeName;
-    CopyBufFuncs copyFuncs;
-    ZeroFuncs zeroFuncs;
-    char fpref;
-    DataType dtype = kextra->dtype;
-    KernelExtraFlags kflags = kextra->flags;
-    ssize_t ret;
-    size_t pitchAB;
-    bool b;
-    bool tra, trb, isU, transpMul;
-    BlasGenSettings gset;
-
-    if (pgran->wgDim != 1) {
-        return -EINVAL;
-    }
-
-    ctx = createKgenContext(buf, buflen, true);
-    if (ctx == NULL) {
-        return -ENOMEM;
-    }
-
-    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
-    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
-    isU = isMatrixUpper(kflags);
-
-    // at first, generate needed declarations and auxiliary functions
-
-    b = isDoubleBasedType(dtype);
-    kgenDeclareUptrs(ctx, b);
-
-    if (isComplexType(dtype)) {
-        genComplexMathOperators(ctx, dtype);
-    }
-
-    memset(&gset, 0, sizeof(gset));
-    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
-    gset.kextra = kextra;
-    gset.pgran = pgran;
-
-    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset,
-                         BCHF_MATRIX_A | BCHF_WRITE_OUTPUT | BCHF_IMAGE_WRITE);
-    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
-                         ZF_MATRIX_A);
-
-    //matrix inversion function
-    genInvertingBlockFunc(ctx, (unsigned int)subdims[0].bwidth, dtype, isU);
-
-    //coordinates calculation
-    getBufferPos(ctx, isU);
-
-    typeName = dtypeBuiltinType(dtype);
-    fpref = dtypeToBlasPrefix(dtype);
-
-    // now, generate the kernel
-
-    sprintf(tmp, trsmPrepDecl, fpref, 'M', typeName,
-        typeName, typeName, typeName);
-
-    kgenDeclareFunction(ctx, tmp);
-    ret = kgenBeginFuncBody(ctx);
-
-    transpMul = useTransposedMul(subdims, dtype, trb);
-    if (!transpMul) {
-        sprintf(tmp, "const int bpr = get_image_width(imA) / %lu;\n",
-                subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype)));
-    }
-    else {
-        sprintf(tmp, "const int bpc = get_image_height(imA) / %lu;\n",
-                subdims[0].y);
-    }
-    kgenAddStmt(ctx, tmp);
-
-    /*
-     * Calculate local buffer pitches, and then insert the
-     * preparative code
-     */
-    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
-    sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y,
-            typeName, pitchAB * subdims[0].y, pgran->wgSize[0]);
-    ret = kgenAddStmt(ctx, tmp);
-
-    sprintf(tmp, "getBufferPos(gid, startRow / %lu, (M + %lu) / %lu, &currM, &k0);\n",
-            subdims[0].y, subdims[0].y - 1, subdims[0].y);
-    kgenAddStmt(ctx, tmp);
-    sprintf(tmp, "currM *= %lu;\n"
-            "k0 *= %lu;\n", subdims[0].y, subdims[0].y);
-    kgenAddStmt(ctx, tmp);
-
-    genGetImagePos(ctx, subdims, dtype, "gid", transpMul);
-
-    kgenBeginBranch(ctx, "if (currM == k0)");
-    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
-                          tra, 'A', !(kextra->flags & KEXTRA_TAILS_M));
-    genPrepZeroBlockC(ctx, &zeroFuncs);
-    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
-    if (kextra->flags & KEXTRA_UNIT_DIAGONAL) {
-        sprintf(tmp, "if (lid < %lu) {\n"
-                     "    tempA[lid * %lu + lid] = %s;\n"
-                     "}\n",
-                subdims[0].bwidth, pitchAB, strOne(dtype));
-        kgenAddStmt(ctx, tmp);
-        kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-        kgenAddBlankLine(ctx);
-    }
-
-    sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth);
-    kgenBeginBranch(ctx, tmp);
-    sprintf(tmp, "invert(tempA, tempC, lid, (currM + %lu > M) ? "
-                                            "M - currM : %lu);\n",
-            subdims[0].y, subdims[0].y);
-    kgenAddStmt(ctx, tmp);
-    kgenEndBranch(ctx, NULL);
-    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-    kgenEndBranch(ctx, NULL);
-
-    kgenBeginBranch(ctx, "else");
-    genPrepareSquareBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs, tra,
-                          'C', !(kextra->flags & KEXTRA_TAILS_M));
-    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-    kgenEndBranch(ctx, NULL);
-
-    genWriteBlock(ctx, subdims, &copyFuncs);
-    kgenEndFuncBody(ctx);
-    ret = kgenAddBlankLine(ctx);
-
-    if (!ret) {
-        ret = (ssize_t)kgenSourceSize(ctx) + 1;
-    }
-
-    destroyKgenContext(ctx);
-
-    return (ret < 0) ? -EOVERFLOW : ret;
-}
-
-static void
-genZeroResult(
-    struct KgenContext *ctx,
-    DataType dtype,
-    const SubproblemDim *dims)
-{
-    unsigned int n;
-    char tmp[1024];
-    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
-
-    getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL);
-
-    sprintf(tmp, "for (x = 0; x < %u; x++) {\n"
-                 "    c[x] = 0;\n"
-                 "}\n\n", n);
-
-    kgenAddStmt(ctx, tmp);
-}
-
-static void
-genPrepareRectBlock(
-    struct KgenContext *ctx,
-    const SubproblemDim *dim,
-    DataType dtype,
-    const CopyBufFuncs *copyFuncs,
-    const ZeroFuncs *zeroFuncs,
-    bool trb,
-    char c,
-    bool opt)
-{
-    char tmp[1024];
-    size_t pitch;
-    const char *readBlock;
-    size_t bsizes[2] = {dim->bwidth, dim->x};
-
-    /*
-     * NOTE: in case of accessing to B in the non transposed way
-     *       block multiplication is done with transposed block B
-     */
-    pitch = calcPitchB(dim, dtype, !trb);
-    if (opt) {
-        readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt;
-        sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c);
-    }
-    else {
-        readBlock = (trb) ? readRectBlockTrans : readRectBlock;
-        sprintf(tmp, readBlock, bsizes[trb], bsizes[trb], bsizes[1 - trb],
-                bsizes[1 - trb], bsizes[trb], bsizes[1 - trb],
-                copyFuncs->read[MATRIX_B], c, zeroFuncs->names[MATRIX_B], c,
-                copyFuncs->readGeneric[MATRIX_B], c, pitch);
-    }
-    kgenAddStmt(ctx, tmp);
-}
-
-static void
-getNblock(struct KgenContext *ctx, bool isU) //x, y -> n
-{
-    kgenDeclareFunction(ctx, "void\ngetNBlock(uint y, uint x, uint startRow, "
-        "uint width, uint *n)\n");
-    kgenBeginFuncBody(ctx);
-    if (isU) {
-        kgenAddStmt(ctx, "*n = ((2 * width - y + 1) * y - "
-            "(2 * width - startRow + 1) * startRow) / 2 + x - y;\n");
-    }
-    else {
-        kgenAddStmt(ctx, "*n = (y * (y + 1) - startRow * (startRow + 1)) / 2 + x;\n");
-    }
-    kgenEndFuncBody(ctx);
-    kgenAddBlankLine(ctx);
-}
-
-static void
-genMultiplication(
-    struct KgenContext *ctx,
-    const SubproblemDim *dims,
-    DataType dtype,
-    const char *blkmulName,
-    BlkMulFlags mulFlags)
-{
-    char tmp[1024];
-    size_t u;
-    unsigned int l1Pans;
-
-    l1Pans = (unsigned int)(dims[0].x / dims[1].x);
-    if (mulFlags & BLKMUL_TRANSPOSED_B) {
-        u = 1;
-    }
-    else {
-        u = matrBlockPitch(dims, MATRIX_B, dtype, clblasLeft);
-    }
-
-    // find image position and invoke the multiplier
-    sprintf(tmp, "getNBlock(m0 / %lu, k0 / %lu, startRow / %lu, "
-                           "(M + %lu) / %lu, &blockN);\n",
-            dims[0].y, dims[0].y, dims[0].y, dims[0].y - 1, dims[0].y);
-    kgenAddStmt(ctx, tmp);
-    genGetImagePos(ctx, dims, dtype, "blockN", (mulFlags & BLKMUL_TRANSPOSED_B) != 0);
-    sprintf(tmp, "%s(A, (int2)(imx, imy + lid / %u * %lu), \n"
-                  "   (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n"
-                  "   c, skew);\n",
-            blkmulName, l1Pans, dims[1].y, l1Pans, dims[1].x, u);
-    kgenAddStmt(ctx, tmp);
-}
-
-static void
-genReorderSolution(
-    struct KgenContext *ctx,
-    const SubproblemDim *subdims,
-    const char *outTypeName,
-    unsigned int colRegs)
-{
-    char tmp[1024], tmp1[1024];
-    char *p;
-    unsigned i;
-
-    sprintf(tmp, "void\n"
-                 "reorderResult(%s *c, int skew)",
-            outTypeName);
-    kgenDeclareFunction(ctx, tmp);
-    kgenBeginFuncBody(ctx);
-
-    sprintf(tmp, "%s tmp;\n"
-                 "int i, j;\n",
-           outTypeName);
-    kgenAddStmt(ctx, tmp);
-
-    p = tmp1;
-    for (i = 0; i < colRegs; i++) {
-        unsigned int k = (unsigned int)(subdims[1].y - 1) * colRegs + i;
-
-        sprintf(p,  "\n"
-                    "    tmp = c[%u];\n"
-                    "    for (j = %lu; j >= 0; j--) {\n"
-                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
-                    "    }\n"
-                    "    c[%u] = tmp;\n",
-                k, subdims[1].y - 2, colRegs, i, colRegs, i, i);
-        p += strlen(p);
-    }
-
-    sprintf(tmp, "\n"
-                 "for (i = 0; i < skew; i++) {\n"
-                 "%s"
-                 "}\n"
-                 "\n",
-            tmp1);
-    kgenAddStmt(ctx, tmp);
-
-    kgenEndFuncBody(ctx);
-    kgenAddBlankLine(ctx);
-}
-
-static void
-initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags)
-{
-    kvars->A = "imgA";
-    kvars->B = "B";
-
-    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
-        kvars->coordA = "coordA.x";
-    }
-    else {
-        kvars->coordA = "coordA.y";
-    }
-    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
-        kvars->coordB = "coordB.x";
-    }
-    else {
-        kvars->coordB = "coordB.y";
-    }
-
-    kvars->sizeM = "M";
-    kvars->sizeN = "N";
-    kvars->sizeK = "origM";
-}
-
-// image based kernel generator
-static ssize_t
-generator(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra)
-{
-    struct KgenContext *ctx;
-    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
-    CLBLASKernExtra kextraTmp = *kextra;
-    char tmp[1024], tmp1[1024];
-    char blkmul[FUNC_NAME_MAXLEN];
-    char updateResFn[FUNC_NAME_MAXLEN];
-    char updateResGenericFn[FUNC_NAME_MAXLEN];
-    char updateResFnRev[FUNC_NAME_MAXLEN];
-    char updateResGenericFnRev[FUNC_NAME_MAXLEN];
-    char copyPLFn[FUNC_NAME_MAXLEN];
-    char *s1 = "";
-    const char *typeName;
-    CopyBufFuncs copyFuncs;
-    ZeroFuncs zeroFuncs;
-    char fpref;
-    DataType dtype = kextra->dtype;
-    ssize_t ret;
-    BlasGenSettings gset;
-    BlkMulOpts mulOpts;
-    BlkMulFlags mulFlags;
-    size_t pitchAB;
-    size_t u;
-    bool b;
-    bool isU;
-    bool areTails;
-    const char *outTypeName;
-    unsigned int nrRegs, colRegs;
-    KernelExtraFlags kflags = kextra->flags;
-    size_t tsize;
-    unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype);
-    UpdateResultFlags upFlags;
-    int tra, trb;
-    unsigned int l1Pans;
-    char vect[2] = {'y', 'x'};
-
-    if (pgran->wgDim != 1) {
-        return -EINVAL;
-    }
-
-    ctx = createKgenContext(buf, buflen, true);
-    if (ctx == NULL) {
-        return -ENOMEM;
-    }
-
-    tsize = dtypeSize(dtype);
-    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
-    isU = isMatrixUpper(kflags);
-
-    tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
-    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);
-    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;
-
-    /*
-     * Force generation of the transposed version of the block
-     * reading function with following multiplication with transposed
-     * block B to decrease LDS bank conflicts without column skew using.
-     * Reverse temporarily the flag of the column-major order for that
-     */
-    if (useTransposedMul(subdims, dtype, trb)) {
-        if (kflags & KEXTRA_COLUMN_MAJOR) {
-            kflags &= ~KEXTRA_COLUMN_MAJOR;
-        }
-        else {
-            kflags |= KEXTRA_COLUMN_MAJOR;
-        }
-        mulFlags = BLKMUL_SKEW_ROW | BLKMUL_TRANSPOSED_B;
-        u = subdims[1].y;
-    }
-    else {
-        mulFlags = BLKMUL_SKEW_COLUMN;
-        u = subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype));
-    }
-
-    ctx = createKgenContext(buf, buflen, true);
-    if (ctx == NULL) {
-        return -ENOMEM;
-    }
-
-    // at first, generate needed declarations and auxiliary functions
-
-    b = isDoubleBasedType(dtype);
-    kgenDeclareUptrs(ctx, b);
-
-    kextraTmp.flags = kflags;
-    memset(&gset, 0, sizeof(gset));
-    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
-    gset.kextra = &kextraTmp;
-    gset.pgran = pgran;
-    initKernelVarNames(&gset.varNames, kextra->flags);
-
-    if (isComplexType(dtype)) {
-        genComplexMathOperators(ctx, dtype);
-    }
-
-    generateBufCopyFuncs(&copyFuncs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_B);
-    /*
-     * Temporary kernel extra has been needed to produce inverted block B read.
-     * Restore the original one, and restore kflags as well
-     */
-    gset.kextra = kextra;
-    kflags = kextra->flags;
-
-    // functions updating result
-    // for the final result
-    generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn,
-                       updateResGenericFn);
-    // for intermediate result after blocks modification
-    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
-    upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST;
-    genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev,
-                           updateResGenericFnRev);
-    // for heaping before multiplying on inverted block
-    upFlags = UPRES_USE_LDS;
-    if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
-        upFlags |= UPRES_COLUMN_MAJOR;
-    }
-    updateResultGenOld(ctx, &gset, UPRES_SET, upFlags, NULL);
-    kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx);
-    kgenAddBlankLine(ctx);
-
-    generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype,
-                         ZF_MATRIX_B | ZF_MATRIX_C);
-
-    // block multiplication function
-    mulOpts.aMobj = CLMEM_IMAGE;
-    mulOpts.bMobj = CLMEM_BUFFER;
-    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | mulFlags;
-    if (isComplexType(dtype)) {
-        mulOpts.core = BLKMUL_SEPARATE_MULADD;
-    }
-    else {
-        mulOpts.core = BLKMUL_MAD;
-    }
-    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
-    if (ret) {
-        destroyKgenContext(ctx);
-
-        return -EOVERFLOW;
-    }
-
-    kgenAddBlankLine(ctx);
-    kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx);
-
-    typeName = dtypeBuiltinType(dtype);
-    fpref = dtypeToBlasPrefix(dtype);
-
-    // block number calculation
-    getNblock(ctx, isU);
-
-    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);
-    if (isComplexType(dtype)) {
-        colRegs = (unsigned int)subdims[1].x;
-    }
-    else {
-        colRegs = (unsigned int)fl4RowWidth(subdims[1].x, tsize);
-    }
-
-    if (mulFlags & BLKMUL_SKEW_ROW) {
-        genReorderSolution(ctx, subdims, outTypeName, colRegs);
-    }
-
-    // now, generate the kernel
-
-    if (kflags & KEXTRA_SIDE_RIGHT) {
-        sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
-            fpref, 'N', 'M', typeName, typeName, typeName, typeName);
-    }
-    else {
-        sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1],
-            fpref, 'M', 'N', typeName, typeName, typeName, typeName);
-    }
-
-    kgenDeclareFunction(ctx, tmp);
-    ret = kgenBeginFuncBody(ctx);
-
-    if (!(mulFlags & BLKMUL_TRANSPOSED_B)) {
-        sprintf(tmp, "const int bpr = get_image_width(A) / %lu;\n",
-                subdims[0].y / (sizeof(cl_float4) / tsize));
-    }
-    else {
-        sprintf(tmp, "const int bpc = get_image_height(A) / %lu;\n",
-                subdims[0].y);
-    }
-    kgenAddStmt(ctx, tmp);
-
-    /*
-     * Calculate local buffer pitches, and then insert the
-     * preparative code
-     */
-    pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft);
-
-    sprintf(tmp, trsmImPrep1D, typeName, pitchAB * subdims[0].x,
-        outTypeName, nrRegs, u, s1, pgran->wgSize[0], subdims[0].itemX);
-    kgenAddStmt(ctx, tmp);
-    kgenAddBlankLine(ctx);
-
-    kgenAddStmt(ctx, "B += offB;\n");
-    sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n"
-                 "coordB.%c = 0;\n\n",
-            vect[trb], l1Pans, subdims[1].x, vect[1 - trb]);
-    kgenAddStmt(ctx, tmp);
-
-   /*
-    * B matrix is divided on panels, each work group
-    * multiply such a panel on the whole matrix A.
-    */
-
-    // top level loop over M
-    if (isU) {
-        sprintf(tmp1, "(((finishRow - 1) / %lu) * %lu)", subdims[0].y,
-                subdims[0].y); //last block start
-        sprintf(tmp, "for (m0 = %s; m0 + %lu != startRow; m0 -= %lu)",
-                tmp1, subdims[0].y, subdims[0].y);
-        ret = kgenBeginBranch(ctx, tmp);
-    }
-    else {
-        sprintf(tmp, "for (m0 = startRow; m0 < finishRow; m0 += %lu)",
-                subdims[0].y);
-        ret = kgenBeginBranch(ctx, tmp);
-    }
-
-    sprintf(tmp, "coordA.%c = m0 + lid / %u * %lu;\n"
-                 "coordA.%c = 0;\n\n",
-            vect[tra], l1Pans, subdims[1].y, vect[1 - tra]);
-    kgenAddStmt(ctx, tmp);
-
-    genZeroResult(ctx, dtype, subdims);
-
-    // loop over K
-    if (isU) {
-        sprintf(tmp, "for (k0 = m0 + %lu; k0 < M; k0 += %lu)",
-            subdims[0].bwidth, subdims[0].bwidth);
-    }
-    else {
-        sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)",
-            subdims[0].bwidth);
-    }
-    ret = kgenBeginBranch(ctx, tmp);
-
-    genPrepareRectBlock(ctx, subdims, dtype, &copyFuncs, &zeroFuncs,
-                        trb, 'C', !areTails);
-
-    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-
-    // multiplication in the adjusting loop
-    genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
-
-    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-    kgenEndBranch(ctx, NULL); // loop over K
-    kgenAddBlankLine(ctx);
-
-    if (mulFlags & BLKMUL_SKEW_ROW) {
-        kgenAddStmt(ctx, "reorderResult(c, skew);\n");
-    }
-    kgenAddStmt(ctx, "k0 = m0;\n");
-
-    genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev,
-                                  updateResGenericFnRev, true);
-
-    genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC");
-    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
-    genZeroResult(ctx, dtype, subdims);
-
-    // multiplication on the inverted block
-    genMultiplication(ctx, subdims, dtype, blkmul, mulFlags);
-    if (mulFlags & BLKMUL_SKEW_ROW) {
-        kgenAddStmt(ctx, "reorderResult(c, skew);\n");
-    }
-
-    // write back the tile evaluated
-    upFlags = UPRES_EXCEED_PROBLEM_CONDITION;
-    if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_C)) {
-        upFlags |= UPRES_COLUMN_MAJOR;
-    }
-    genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn,
-                                updateResGenericFn, NULL);
-
-    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
-
-    // end external loops over panels of matrix A
-    kgenEndBranch(ctx, NULL);
-    kgenEndFuncBody(ctx);
-    ret = kgenAddBlankLine(ctx);
-
-    if (!ret) {
-        ret = (ssize_t)kgenSourceSize(ctx) + 1;
-    }
-
-    destroyKgenContext(ctx);
-
-    return (ret < 0) ? -EOVERFLOW : ret;
-}
-
-static ssize_t
-wrapper(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra)
-{
-    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
-    if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) {
-        return generator(buf, buflen, subdims, pgran, extra);
-    }
-    else {
-        return prepGenerator(buf, buflen, subdims, pgran, extra);
-    }
-}
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra)
-{
-    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
-
-    (void)extra;
-
-    if (blasArgs->kernType == CLBLAS_COMPUTING_KERNEL) {
-        if (blasArgs->side == clblasLeft) {
-           initSizeKarg(&args[0], blasArgs->K);
-           initSizeKarg(&args[1], blasArgs->N);
-        }
-        else {
-           initSizeKarg(&args[0], blasArgs->M);
-           initSizeKarg(&args[1], blasArgs->K);
-        }
-        assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
-        initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
-        initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0);
-        initSizeKarg(&args[5], blasArgs->ldb.matrix);
-        if (blasArgs->side == clblasLeft) {
-            initSizeKarg(&args[6], blasArgs->offsetM);
-            initSizeKarg(&args[7], blasArgs->M + blasArgs->offsetM);
-        }
-        else {
-            initSizeKarg(&args[6], blasArgs->offsetN);
-            initSizeKarg(&args[7], blasArgs->N + blasArgs->offsetN);
-        }
-        initSizeKarg(&args[8], blasArgs->offBX);
-    }
-    else {
-        if (blasArgs->side == clblasLeft) {
-            initSizeKarg(&args[0], blasArgs->M);
-        }
-        else {
-            initSizeKarg(&args[0], blasArgs->N);
-        }
-        initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0);
-        initSizeKarg(&args[2], blasArgs->lda.matrix);
-        initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0);
-        if (blasArgs->side == clblasLeft) {
-            initSizeKarg(&args[4], blasArgs->offsetM);
-        }
-        else {
-            initSizeKarg(&args[4], blasArgs->offsetN);
-        }
-        initSizeKarg(&args[5], blasArgs->offA);
-    }
-}
-
-static bool
-isFitToLDS(
-    SubproblemDim *dim,
-    DataType dtype,
-    cl_ulong ldsSize,
-    const void *kernelArgs)
-{
-    cl_ulong sizeA, sizeB, size;
-    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
-
-    /*
-     * For prepare kernel two square local blocks required.
-     * For main kernel two rectangular blocks required.
-     * Maximum of these two values checked.
-     */
-
-    sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side);
-    sizeB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side);
-    size = (sizeA > sizeB) ? sizeA : sizeB;
-
-    return (2 * size * dtypeSize(dtype) <= ldsSize);
-}
-
-static void
-calcNrThreads(
-    size_t threads[2],
-    const SubproblemDim *dims,
-    const PGranularity *pgran,
-    const void *args,
-    const void *extra)
-{
-    SubproblemDim globDim, offDim;
-    const CLBlasKargs *kargs = (const CLBlasKargs*)args;
-    size_t width, startBlock, finishBlock;
-    bool isU = (kargs->uplo == clblasUpper) ^
-        (kargs->transA != clblasNoTrans) ^ (kargs->side == clblasRight);
-
-    (void)extra;
-
-    width = kargs->K;
-    width = (width + dims[0].bwidth - 1) / dims[0].bwidth;
-    kargsToProbDims(&globDim, CLBLAS_TRSM, kargs, false);
-    kargsToProbDims(&offDim, CLBLAS_TRSM, kargs, true);
-
-    startBlock = offDim.y / dims[0].bwidth;
-    finishBlock = (globDim.y + offDim.y + dims[0].bwidth - 1) / dims[0].bwidth;
-
-    if (kargs->kernType == CLBLAS_PREP_A_KERNEL) {
-        if (isU) {
-            threads[0] = ((2 * width - startBlock - finishBlock + 1) *
-                (finishBlock - startBlock) / 2) * pgran->wgSize[0];
-        }
-        else {
-            threads[0] = ((1 + finishBlock + startBlock) *
-                (finishBlock - startBlock) / 2) * pgran->wgSize[0];
-        }
-        threads[1] = 0;
-    }
-    else {
-        calcGlobalThreads(threads, dims, pgran, globDim.y, globDim.x);
-    }
-}
-
-static void
-imgPackMode(
-    const void *extra,
-    const SubproblemDim *dims,
-    int dataID,
-    unsigned int *packRate,
-    clblasOrder *packOrder)
-{
-    bool trb;
-    const CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
-
-    (void)dataID;
-
-    trb = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B);
-    if (trb || isComplexType(kextra->dtype)) {
-        *packOrder = clblasRowMajor;
-        *packRate = (unsigned int)dims[0].y;
-    }
-    else {
-        *packOrder = clblasColumnMajor;
-        *packRate = (unsigned int)dims[0].y;
-    }
-}
-
-static SolverFlags
-solverFlags(void)
-{
-    return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS);
-}
-
-void
-initTrsmImgPattern(MemoryPattern *mempat)
-{
-    mempat->name = "Image based block trsm";
-    mempat->nrLevels = 2;
-    mempat->cuLevel = 0;
-    mempat->thLevel = 1;
-    mempat->sops = &solverOps;
-    mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS;
-    mpatExtra.bMset = CLMEM_LEVEL_LDS;
-    mpatExtra.mobjA = CLMEM_IMAGE;
-    mpatExtra.mobjB = CLMEM_BUFFER;
-    mempat->extra = &mpatExtra;
-}