diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/legacy/trmm_lds.c')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/legacy/trmm_lds.c | 514 |
1 files changed, 0 insertions, 514 deletions
diff --git a/external/clBLAS/src/library/blas/gens/legacy/trmm_lds.c b/external/clBLAS/src/library/blas/gens/legacy/trmm_lds.c deleted file mode 100644 index d7fe8826..00000000 --- a/external/clBLAS/src/library/blas/gens/legacy/trmm_lds.c +++ /dev/null @@ -1,514 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - - -/* - * LDS based generator - */ - -#include <string.h> -#include <stdio.h> -#include <assert.h> - -#include <clBLAS.h> -#include <blas_mempat.h> -#include <clkern.h> -#include <clblas-internal.h> -#include <matrix_dims.h> -#include <dis_warning.h> - -#include "../init.h" -#include "blas_kgen_legacy.h" -#include "gen_helper_legacy.h" -#include "../gen_helper.h" -#include "../trxm_common.h" -#include "trxm_common_legacy.h" - -static CLBLASMpatExtra mpatExtra; - -static ssize_t -generator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra); - -static void -assignKargs(KernelArg *args, const void *params, const void *extra); - -static bool -isFitToLDS( - SubproblemDim *dim, - DataType dtype, - cl_ulong ldsSize, - const void *kernelArgs); - -static SolverFlags -solverFlags(void); - -static int -getPerf( unsigned int kflags, - const void *args); - -static SolverOps solverOps = { - generator, - assignKargs, - isFitToLDS, - getPerf, - NULL, - NULL, - NULL, - solverFlags, - NULL, //fixupKargs - NULL, //getDefaultDecomp - NULL, //getDecompList - NULL, - NULL -}; - -static void -genPrepareBlockC( - struct KgenContext *ctx, - const ZeroFuncs *zeroFuncs) -{ - char tmp[2048]; - - sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_C]); - kgenAddStmt(ctx, tmp); -} - -static void -genWriteBlockB( - struct KgenContext *ctx, - const SubproblemDim *dim, - DataType dtype, - const CopyBufFuncs *copyFuncs, - KernelExtraFlags kflags) -{ - char tmp[1024]; - size_t pitch; - const char *coordName[2] = {"currM", "currN"}; - int trb; - - trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_C); - pitch = matrBlockPitch(dim, MATRIX_C, dtype, clblasLeft); - - if (!(kflags & (KEXTRA_TAILS_N | KEXTRA_TAILS_M))) { - sprintf(tmp, "%s((GPtr)B, (LPtr)tempC, %s, %s, ldb);\n", - copyFuncs->write, coordName[trb], coordName[1 - trb]); - } - else { - sprintf(tmp, - "y = (currM + %lu <= M) ? %lu : M - currM;\n" - "x = (currN + %lu <= N) ? %lu : N - currN;\n" - "if ((y == %lu) && (x == %lu)) {\n" - // fast rwrite - " %s((GPtr)B, (LPtr)tempC, %s, %s, ldb);\n" - "}\n" - "else {\n" - // slow write - " %s((GPtr)B, (LPtr)tempC, %s, %s, y, x, ldb, %lu);\n" - "}\n\n", - dim->y, dim->y, dim->x, dim->x, dim->y, dim->x, - copyFuncs->write, coordName[trb], coordName[1 - trb], - copyFuncs->writeGeneric, coordName[trb], - coordName[1 - trb], pitch); - } - - kgenAddStmt(ctx, tmp); -} - -static void -genInitCurrM( - struct KgenContext *ctx, - const SubproblemDim *dim, - KernelExtraFlags kflags) -{ - char tmp[1024]; - - if (isMatrixUpper(kflags)) { - strcpy(tmp, "currM = 0;\n"); - } - else { - sprintf(tmp, "currM = (M - 1) / %lu * %lu;\n", dim->y, dim->y); - } - - kgenAddStmt(ctx, tmp); - kgenAddBlankLine(ctx); -} - -static void -genInternalLoopCtl( - struct KgenContext *ctx, - const SubproblemDim *dim, - KernelExtraFlags kflags) -{ - char tmp[1024]; - - if (isMatrixUpper(kflags)) { - if (!(kflags & KEXTRA_TAILS_M)) { - sprintf(tmp, "for (k0 = M - %lu; (k0 + %lu > currM) && (k0 < M); " - "k0 -= %lu)", - dim->bwidth, dim->bwidth, dim->bwidth); - } - else { - sprintf(tmp, "for (k0 = (M - 1) / %lu * %lu; k0 + %lu > currM; " - "k0 -= %lu)", - dim->bwidth, dim->bwidth, dim->bwidth, dim->bwidth); - } - } - else { - sprintf(tmp, "for (k0 = 0; (k0 < currM + %lu) && (k0 < M); " - "k0 += %lu)", - dim->y, dim->bwidth); - } - - kgenBeginBranch(ctx, tmp); -} - -static void -initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) -{ - kvars->A = "A"; - kvars->B = "B"; - if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) { - kvars->coordA = "coordA.x"; - } - else { - kvars->coordA = "coordA.y"; - } - if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B)) { - kvars->coordB = "coordB.x"; - } - else { - kvars->coordB = "coordB.y"; - } - kvars->sizeM = "M"; - kvars->sizeN = "N"; - kvars->sizeK = "origM"; -} - -static ssize_t -generator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra) -{ - struct KgenContext *ctx; - CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; - char tmp[2048]; - char blkmul[128]; - char updateResFn[FUNC_NAME_MAXLEN]; - char updateResGenericFn[FUNC_NAME_MAXLEN]; - CopyBufFuncs copyFuncs; - ZeroFuncs zeroFuncs; - DataType dtype = kextra->dtype; - ssize_t ret; - BlasGenSettings gset; - BlkMulOpts mulOpts; - size_t pitchAB, pitchC; - bool b; - KernelExtraFlags kflags = kextra->flags; - const char *outTypeName; - unsigned int nrRegs; - bool useLocalC; - unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype); - int tra, trb; - unsigned int l1Pans; - char vect[2] = {'y', 'x'}; - - if (pgran->wgDim != 1) { - return -EINVAL; - } - - ctx = createKgenContext(buf, buflen, true); - if (ctx == NULL) { - return -ENOMEM; - } - - /* Code that updates block of B matrix using local registers or use mad's - * doesn't work on some GPUs. As a workaround use buffer in local memory - * for unaligned matrix sizes */ - useLocalC = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); - - memset(&gset, 0, sizeof(gset)); - memcpy(gset.subdims, subdims, sizeof(gset.subdims)); - gset.pgran = pgran; - gset.kextra = kextra; - - initKernelVarNames(&gset.varNames, kflags); - - // at first, generate needed declarations and auxiliary functions - - b = isDoubleBasedType(dtype); - kgenDeclareUptrs(ctx, b); - generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRMM, &gset, - BCHF_MATRIX_A | BCHF_MATRIX_B | BCHF_WRITE_OUTPUT); - if (useLocalC) { - generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, - ZF_MATRIX_A | ZF_MATRIX_B | ZF_MATRIX_C); - } - else { - generateUpresFuncs(ctx, CLBLAS_TRMM, &gset, updateResFn, - updateResGenericFn); - generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, - ZF_MATRIX_A | ZF_MATRIX_B); - } - kgenAddBlankLine(ctx); - - // block multiplication function - mulOpts.aMobj = CLMEM_BUFFER; - mulOpts.bMobj = CLMEM_BUFFER; - - if (useLocalC) { - mulOpts.flags = BLKMUL_SKEW_COLUMN; - } - else { - mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_COLUMN; - } - // BLKMUL_MAD doesn't work here on all cards so use SEPARATE_MULADD always - // as a workaround - mulOpts.core = BLKMUL_SEPARATE_MULADD; - ret = blkMulGen(ctx, subdims, dtype, &mulOpts); - if (ret) { - destroyKgenContext(ctx); - return -EOVERFLOW; - } - - kgenAddBlankLine(ctx); - kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx); - - // now, generate the kernel - declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, NULL, false, - false); - ret = kgenBeginFuncBody(ctx); - - /* - * Calculate local buffer pitches, and then insert the - * preparative code - */ - pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft); - pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft); - - getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); - declareLdsBasedTrxmVariables(ctx, dtype, subdims, pgran, useLocalC); - - /* - * B matrix is divided on panels, each work group - * multiply such a panel on the whole matrix A. - */ - sprintf(tmp, "currN = gid * %lu;\n", subdims->x); - kgenAddStmt(ctx, tmp); - genInitCurrM(ctx, subdims, kflags); - if (((kflags & (KEXTRA_SIDE_RIGHT | KEXTRA_STARTM_NOT_ZERO)) == - KEXTRA_STARTM_NOT_ZERO) || - ((kflags & (KEXTRA_SIDE_RIGHT | KEXTRA_STARTN_NOT_ZERO)) == - (KEXTRA_SIDE_RIGHT | KEXTRA_STARTN_NOT_ZERO))) { - - kgenAddStmt(ctx, "A += lda * offsetM + offsetM;\n"); - } - if (kflags & KEXTRA_A_OFF_NOT_ZERO) { - kgenAddStmt(ctx, "A += offA;\n"); - } - genTrxmBMatrShift(ctx, kflags, false); - - tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A); - trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B); - l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; - - sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n" - "coordB.%c = 0;\n\n", - vect[trb], l1Pans, subdims[1].x, vect[1 - trb]); - kgenAddStmt(ctx, tmp); - - // loop over M - sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims->y); - kgenBeginBranch(ctx, tmp); - - sprintf(tmp, "coordA.%c = currM + lid / %u * %lu;\n" - "coordA.%c = 0;\n\n", - vect[tra], l1Pans, subdims[1].y, vect[1 - tra]); - kgenAddStmt(ctx, tmp); - - if (useLocalC) { - genPrepareBlockC(ctx, &zeroFuncs); - } - else { - // zero work item C block - sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n" - " c[k0] = 0;\n" - "}\n\n", nrRegs); - kgenAddStmt(ctx, tmp); - } - - /* - * In the first pass the part without triangle blocks is processed, - * and in the second one only triangle blocks are processed - */ - genInternalLoopCtl(ctx, subdims, kflags); - - genPrepareTrxmBlockA(ctx, subdims, dtype, ©Funcs, &zeroFuncs, - kflags, "M"); - genPrepareTrxmBlockB(ctx, subdims, dtype, ©Funcs, &zeroFuncs, - kflags); - kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); - kgenAddBlankLine(ctx); - - genTriangMatrBlock(ctx, subdims, dtype, kflags); - - // and eventually multiply the blocks and update the matrix C block - if (useLocalC) { - sprintf(tmp, "%s(alpha, (LPtr)(tempA + (lid / %u * %lu) * %lu), \n" - " (LPtr)(tempB + (lid %% %u * %lu) * %lu),\n" - " (LPtr)(tempC + (lid / %u * %lu) * %lu + \n" - " (lid %% %u * %lu)), lid);\n", - blkmul, l1Pans, subdims[1].y, pitchAB, - l1Pans, subdims[1].x, pitchAB, - l1Pans, subdims[1].y, pitchC, l1Pans, subdims[1].x); - } - else { - sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), " - "(LPtr)(tempB + (lid %% %u * %lu) * %lu), c, lid);\n", - blkmul, l1Pans, subdims[1].y, pitchAB, l1Pans, - subdims[1].x, pitchAB); - } - kgenAddStmt(ctx, tmp); - kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); - - genInternalLoopEnd(ctx); // loop over K - kgenAddBlankLine(ctx); - - // write back the block, it's evaluated - if (useLocalC) { - genWriteBlockB(ctx, subdims, dtype, ©Funcs, kflags); - kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); - } - else { - if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) { - sprintf(tmp, "if ((coordA.%c < M) && (coordB.%c < N))", - vect[tra], vect[trb]); - kgenBeginBranch(ctx, tmp); - } - - generateResultUpdateOld(ctx, CLBLAS_TRMM, &gset, updateResFn, - updateResGenericFn); - - if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) { - kgenEndBranch(ctx, tmp); - } - } - - if (isMatrixUpper(kflags)) { - sprintf(tmp, "currM += %lu;\n", subdims[0].y); - } - else { - sprintf(tmp, "currM -= %lu;\n", subdims[0].y); - } - kgenAddStmt(ctx, tmp); - - kgenEndBranch(ctx, NULL); // loop over M - - kgenEndFuncBody(ctx); - ret = kgenAddBlankLine(ctx); - - if (!ret) { - ret = (ssize_t)kgenSourceSize(ctx) + 1; - } - - destroyKgenContext(ctx); - - return (ret < 0) ? -EOVERFLOW : ret; -} - -static void -assignKargs(KernelArg *args, const void *params, const void *extra) -{ - const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; - KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; - int idx = 7; - - initSizeKarg(&args[0], blasArgs->M); - initSizeKarg(&args[1], blasArgs->N); - assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); - initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0); - initSizeKarg(&args[4], blasArgs->lda.matrix); - initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0); - initSizeKarg(&args[6], blasArgs->ldb.matrix); - if (kflags & KEXTRA_STARTM_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offsetM); - } - if (kflags & KEXTRA_STARTN_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offsetN); - } - if (kflags & KEXTRA_A_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offA); - } - if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offBX); - } -} - -static bool -isFitToLDS( - SubproblemDim *dim, - DataType dtype, - cl_ulong ldsSize, - const void *kernelArgs) -{ - const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; - cl_ulong size; - - size = matrBlockSize(dim, MATRIX_A, dtype, kargs->side); - size += matrBlockSize(dim, MATRIX_B, dtype, kargs->side); - size += matrBlockSize(dim, MATRIX_C, dtype, kargs->side); - - return (size * dtypeSize(dtype) <= ldsSize); -} - -static SolverFlags -solverFlags(void) -{ - return ((unsigned int)SF_WSPACE_1D); -} - -void -initTrmmLdsPattern(MemoryPattern *mempat) -{ - mempat->name = "LDS based block trmm"; - mempat->nrLevels = 2; - mempat->cuLevel = 0; - mempat->thLevel = 1; - mempat->sops = &solverOps; - - mpatExtra.aMset = CLMEM_LEVEL_LDS; - mpatExtra.bMset = CLMEM_LEVEL_LDS; - mpatExtra.mobjA = CLMEM_BUFFER; - mpatExtra.mobjB = CLMEM_BUFFER; - mempat->extra = &mpatExtra; -} - -static int -getPerf( unsigned int kflags, - const void *args) -{ - DUMMY_ARG_USAGE(kflags); - DUMMY_ARG_USAGE(args); - - return PPERF_POOR; -} |