diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/gemm.c')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/gemm.c | 1454 |
1 files changed, 0 insertions, 1454 deletions
diff --git a/external/clBLAS/src/library/blas/gens/gemm.c b/external/clBLAS/src/library/blas/gens/gemm.c deleted file mode 100644 index efa2375f..00000000 --- a/external/clBLAS/src/library/blas/gens/gemm.c +++ /dev/null @@ -1,1454 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - - -/* - * Cached global buffers based gemm generator - */ - -#include <string.h> -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> - -#include <clblas_stddef.h> -#include <clBLAS.h> -#include <blas_mempat.h> -#include <clkern.h> -#include <clblas-internal.h> - -#include "blas_kgen.h" -#include "blas_subgroup.h" -#include "gen_helper.h" - -typedef struct { - size_t staggered; -} MAY_ALIAS extraData_t; - -static CLBLASMpatExtra mpatExtra; - -static ssize_t -blockGen( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra); - -static ssize_t -subgGen( - char *pBuf, - size_t buflen, - const struct SubproblemDim *pSubDims, - const struct PGranularity *pPGran, - void *pExtra ); - -static void -assignBlockKargs( - KernelArg *args, - const void *params, - const void *extra); - -static bool -blockCheckCalcDecomp( - PGranularity *pgran, - SubproblemDim *subdims, - unsigned int subdimsNum, - DataType dtype, - int check); - -static int -blockGetPerf( - unsigned int kflags, - const void *args); - -static void -assignSubgKargs( - KernelArg *args, - const void *params, - const void *extra); - -static SolverFlags -solverFlags(void); - -static DecompositionAxis -innerDecompositionAxis(const void *args); - -static int -gemmSubgGetDefaultDecomp( - PGranularity *pgran, - SubproblemDim *subdims, - unsigned int subdimsNum, - void * pArgs); - -static bool -subgCheckCalcDecomp( - PGranularity *pgran, - SubproblemDim *subdims, - unsigned int subdimsNum, - DataType dtype, - int check); - -static void -subgCalcGlobalThreads( - size_t threads[2], - const SubproblemDim *subdims, - const PGranularity *pgran, - const void *args, - const void *extra -); - -static int -subgGetPerf( - unsigned int kflags, - const void *args); - -static void -fixupArgs(void *args, SubproblemDim *subdims, void *extra); - -static SolverOps blockSOps = { - blockGen, - assignBlockKargs, - NULL, - blockGetPerf, - innerDecompositionAxis, - NULL, - NULL, - solverFlags, - NULL,// fixup kargs - NULL, //blockGetDefaultDecomp, - blockCheckCalcDecomp, - NULL, - NULL -}; - -static SolverOps subgSOps = { - subgGen, - assignSubgKargs, - NULL, - subgGetPerf, - innerDecompositionAxis, - subgCalcGlobalThreads, - NULL, - solverFlags, - fixupArgs,// fixup kargs - gemmSubgGetDefaultDecomp, - subgCheckCalcDecomp, - NULL, - NULL -}; - -//***************************************************************************** -//----------------------------------------------------------------------------- - -static void -genSetupItemPtr( - struct KgenContext *ctx, - const BlasGenSettings *gset, - MatrixRole mrole) -{ - char tmp[1024]; - unsigned int vecLen; - char ldv[64]; - int shift; - char ptrLit; - char shiftMul[128]; - size_t tileWidth; - int widx; - KernelExtraFlags kflags = gset->kextra->flags; - - /* - * The matrix was made B inner if every thread should accesses their - * elements with a large stride but accesses elements of the matrix A - * sequentially to provide more coalesced memory accesses. - * Otherwise, the matrix A was made inner. - */ - widx = (!isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A) && - isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) ? 1 : 0; - - vecLen = getVecLen(gset, CLBLAS_GEMM, mrole); - shift = findHighestSetBit(vecLen); - if (mrole == MATRIX_A) { - tileWidth = gset->subdims[1].y; - ptrLit = 'A'; - if ((shift > 0) && !(gset->flags & BGF_LD_IN_VECTORS)) { - sprintf(ldv, "(lda >> %d)", shift); - } - else { - strcpy(ldv, "lda"); - } - } - else { - tileWidth = gset->subdims[1].x; - ptrLit = 'B'; - if ((shift > 0) && !(gset->flags & BGF_LD_IN_VECTORS)) { - sprintf(ldv, "(ldb >> %d)", shift); - } - else { - strcpy(ldv, "ldb"); - } - widx = 1 - widx; - } - - if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, mrole)) { - if (tileWidth / vecLen > 1) { - sprintf(shiftMul, " * %lu", tileWidth / vecLen); - } - else { - shiftMul[0] = '\0'; - } - // Alternative calculate global thead id to eliminate Channel Conflicts. - if (mrole == MATRIX_B) { - int bankSize = 2048; - int dataSize = 0; - int grShift; - - DataType dtype = gset->kextra->dtype; - switch (dtype) { - case TYPE_FLOAT: dataSize = 4; break; - case TYPE_COMPLEX_DOUBLE: dataSize = 16; break; - default: dataSize = 8; break; - } - - grShift = bankSize/ dataSize; - - sprintf(tmp, - "get_group_id_%d = (get_group_id(0) + get_group_id(1))" - "%% get_num_groups(%d);\n", widx, widx); - kgenAddStmt(ctx, tmp); - - sprintf(tmp, - "get_global_id_%d = get_group_id_%d * get_local_size(%d) " - "+ get_local_id(%d);\n",widx, widx, widx, widx); - kgenAddStmt(ctx, tmp); - - - sprintf(tmp, - "kif = (N %% %d != 0);\n" - "get_global_id_%d = (kif*(uint)get_global_id(%d)) + " - "((1-kif)*get_global_id_%d);\n",grShift, widx, widx, widx); - kgenAddStmt(ctx, tmp); - - sprintf(tmp, - "%c += get_global_id_%d%s;", - ptrLit, widx, shiftMul); - } - else { - sprintf(tmp, "%c += (uint)get_global_id(%d)%s;\n", - ptrLit, widx, shiftMul); - } - - } - else { - sprintf(tmp, "%c += %luu * (uint)get_global_id(%d) * %s;\n", - ptrLit, tileWidth, widx, ldv); - } - kgenAddStmt(ctx, tmp); -} - -static void -genShiftPointers( - struct KgenContext *ctx, - const BlasGenSettings *gset, - KernelExtraFlags kflags, - bool vectorizedPtrs) -{ - char tmp[1024]; - unsigned int flags[3] = {KEXTRA_A_OFF_NOT_ZERO, KEXTRA_BX_OFF_NOT_ZERO, - KEXTRA_CY_OFF_NOT_ZERO}; - char ptrNames[3] = {'A', 'B', 'C'}; - const char *offNames[3] = {"offA", "offB", "offC"}; - MatrixRole mroles[3] = {MATRIX_A, MATRIX_B, MATRIX_C}; - int i; - - for (i = 0; i < 3; i++) { - if (kflags & flags[i]) { - unsigned int vecLen; - - vecLen = getVecLen(gset, CLBLAS_GEMM, mroles[i]); - - if( vectorizedPtrs && (vecLen > 1) ) { - sprintf(tmp, "%c += %s / %u;\n", - ptrNames[i], offNames[i], vecLen); - } - else { - sprintf(tmp, "%c += %s;\n", ptrNames[i], offNames[i]); - } - kgenAddStmt(ctx, tmp); - } - } -} - -//----------------------------------------------------------------------------- - -static void -sprintfOffABC( - char *str, - KernelExtraFlags kflags) -{ - str[0] = '\0'; - if (kflags & KEXTRA_A_OFF_NOT_ZERO) { - str += sprintf(str, ",\n const uint offA"); - } - if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { - str += sprintf(str, ",\n const uint offB"); - } - if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { - str += sprintf(str, ",\n const uint offC"); - } -} - -static void -declareKernel( - struct KgenContext *ctx, - const BlasGenSettings *gset, - const char *nameSuffix) -{ - char tmp[4096]; - char offABC[1024]; - char fpref; - char *tnameA, *tnameB; - const char *tnameC; - const char *rawType; - DataType dtype = gset->kextra->dtype; - unsigned int vecLen; - const PGranularity *pgran = gset->pgran; - - fpref = dtypeToBlasPrefix(dtype); - rawType = dtypeBuiltinType(dtype); - vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_A); - getVectorTypeName(dtype, vecLen, (const char **)&tnameA, NULL); - vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_B); - getVectorTypeName(dtype, vecLen, (const char **)&tnameB, NULL); - - // FIXME - take into account flag BGF_LD_IN_VECTORS - //sprintf( tnameC, "%s", rawType ); - getVectorTypeName( dtype, - getVecLen( gset, 0, MATRIX_C ), - &tnameC, - NULL ); - - sprintfOffABC(offABC, gset->kextra->flags); - - sprintf(tmp, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n" - "void __kernel\n" - "%cgemm%s(\n" - " uint M,\n" - " uint N,\n" - " uint K,\n" - " const %s alpha,\n" - " const %s beta,\n" - " const __global %s *restrict A,\n" - " const __global %s *restrict B,\n" - " __global %s *C,\n" - " uint lda,\n" - " uint ldb,\n" - " uint ldc%s)\n", - pgran->wgSize[0], pgran->wgSize[1], fpref, nameSuffix, - rawType, rawType, tnameA, tnameB, tnameC, offABC); - - kgenDeclareFunction(ctx, tmp); -} - -//----------------------------------------------------------------------------- - -static void -genHitMatrixCheck( - struct KgenContext *ctx, - KernelExtraFlags kflags) -{ - /* tails of upper level blocks */ - bool tailsM = kflags & KEXTRA_TAILS_M; - bool tailsN = kflags & KEXTRA_TAILS_N; - - if (tailsM) { - if (tailsN) { - kgenAddStmt(ctx, "if ((coord.y >= M) || (coord.x >= N)) {\n"); - } - else { - kgenAddStmt(ctx, "if (coord.y >= M) {\n"); - } - } - else { - if (tailsN) { - kgenAddStmt(ctx, "if (coord.x >= N) {\n"); - } - } - - if (tailsM || tailsN) { - kgenAddStmt(ctx, " return;\n}\n\n"); - } -} - -//----------------------------------------------------------------------------- - -static ssize_t -blockGen( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra) -{ - struct KgenContext *ctx; - CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; - KernelExtraFlags kflags = kextra->flags; - bool isRelA, isRelB; - bool tailsK = ((kflags & KEXTRA_TAILS_K_LOWER) != 0); - DataType dtype = kextra->dtype; - char tmp[2048]; - bool doubleBased = isDoubleBasedType(dtype); - BlasGenSettings gset; - KernelVarNames *vnames = &gset.varNames; - TileMulOpts mulOpts; - ssize_t ret; - char globalIdB[64]; - const char *alignedK; - FetchAddrMode addrMode, addrMask = 0; - FetchOpts fopts; - TilePostFetchPrivate pfPriv; - TailStatus tailStatus; - UpdateResultFlags upFlags; - unsigned int i; - unsigned int vecLen; - int isColMajA; - int isColMajB; - - memset(&gset, 0, sizeof(gset)); - memset(&mulOpts, 0, sizeof(mulOpts)); - memset(&pfPriv, 0, sizeof(pfPriv)); - memset(&fopts, 0, sizeof(fopts)); - - memcpy(gset.subdims, subdims, sizeof(gset.subdims)); - gset.flags = BGF_DISTINCT_VECLEN | BGF_LD_IN_VECTORS; - - // FIXME: throw the explicit constant away - switch (dtype) { - case TYPE_FLOAT: -// i = 12; - i = 16; - break; - case TYPE_COMPLEX_DOUBLE: - i = 6; - break; - default: - i = 8; - break; - } - - if (subdims[1].y + subdims[1].x <= i) { - gset.flags |= BGF_WHOLE_A; - } - gset.kextra = kextra; - gset.pgran = pgran; - //avoid [0].bw loop - gset.subdims[0].bwidth = gset.subdims[1].bwidth; - - mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) && - (dtype != TYPE_COMPLEX_FLOAT)) ? TILEMUL_MAD - : TILEMUL_MULADD; - mulOpts.memA = CLMEM_GLOBAL_MEMORY; - mulOpts.memB = CLMEM_GLOBAL_MEMORY; - mulOpts.fctx = createFetchContext(); - if (mulOpts.fctx == NULL) { - return -ENOMEM; - } - - ctx = createKgenContext(buf, buflen, true); - if (ctx == NULL) { - destroyFetchContext(mulOpts.fctx); - return -ENOMEM; - } - - isColMajA = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A); - isColMajB = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B); - - alignedK = (tailsK) ? "Kbase" : "K"; - - // setup kernel variables - vnames->A = "A"; - vnames->B = "B"; - vnames->C = "C"; - vnames->coordA = "coord.y"; - vnames->coordB = "coord.x"; - vnames->k = "coord.z"; - vnames->sizeK = alignedK; - vnames->sizeM = "M"; - vnames->sizeN = "N"; - vnames->lda = "lda"; - vnames->ldb = "ldb"; - vnames->ldc = "ldc"; - vnames->alpha = "alpha"; - vnames->beta = "beta"; - - // at first, generate needed declarations - ret = kgenDeclareUptrs(ctx, doubleBased); - - declareKernel(ctx, &gset, "Block"); - ret = kgenBeginFuncBody(ctx); - - if (tailsK) { - sprintf(tmp, "const uint Ktail = K %% %lu;\n" - "const uint Kbase = K - Ktail;\n", - subdims[1].bwidth); - kgenAddStmt(ctx, tmp); - alignedK = "Kbase"; - } - else { - alignedK = "K"; - } - - initDefaultTiles(&gset, CLBLAS_GEMM, 0, PRIV_STORAGE_VARIABLE_SET); - declareTileStorages(ctx, &gset); - kgenAddStmt(ctx, "uint4 coord = 0u; /* contains coordB, coordA, k */\n"); - kgenAddBlankLine(ctx); - - vecLen = getVecLen(&gset, CLBLAS_GEMM, MATRIX_A); - if (vecLen > 1) { - kgenPrintf(ctx, "lda /= %u;\n", vecLen); - } - vecLen = getVecLen(&gset, CLBLAS_GEMM, MATRIX_B); - if (vecLen > 1) { - kgenPrintf(ctx, "ldb /= %u;\n", vecLen); - } - - /* - * The matrix was made B inner if every thread should accesses their - * elements with a large stride but accesses elements of the matrix A - * sequentially to provide more coalesced memory accesses. - * Otherwise, the matrix A was made inner. - */ - i = (!isColMajA && isColMajB) ? 1 : 0; - - tailStatus = checkGenAdjustTailCoords(NULL, CLBLAS_GEMM, &gset, NULL); - - if (tailStatus & TAIL_A_RAISED) { - addrMask |= FETCH_ADDR_A_RELATIVE; - } - if (tailStatus & TAIL_B_RAISED) { - addrMask |= FETCH_ADDR_B_RELATIVE; - } - - enableFetchOptLevels(mulOpts.fctx, FOPTLEV_MERGE_FETCHES); - addrMode = setDefaultFetchAddrMode(mulOpts.fctx, &gset, addrMask, - tailStatus, false); - isRelA = ((addrMode & FETCH_ADDR_A_RELATIVE) != 0); - isRelB = ((addrMode & FETCH_ADDR_B_RELATIVE) != 0); - - // Alternative calculate global thead id to eliminate Channel conflicts - if (isRelB && - isMatrixAccessColMaj(CLBLAS_GEMM, gset.kextra->flags, MATRIX_B)) { - - sprintf(globalIdB, "get_global_id_%d", 1-i); - sprintf(tmp, - "uint kif;\n" - "uint get_group_id_%d;\n" - "uint get_global_id_%d;\n",1-i, 1-i); - kgenAddStmt(ctx, tmp); - } - else { - sprintf(globalIdB, "(uint)get_global_id(%d)", 1-i); - } - - if (!(isColMajA || isColMajB)) { - size_t tsize; - - tsize = dtypeSize(dtype); - sprintf(tmp, "coord.z = (get_local_id(0) %% 2 * %lu) %% %s;\n", - sizeof(cl_float8) / tsize, alignedK); - kgenAddStmt(ctx, tmp); - - /* - * Adjust fetch addressing mode. It is used staggered access. That - * means there is a starting offset along K and hence addressing - * in this dimension should be cycled. - */ - addrMode &= ~FETCH_ADDR_K_RELATIVE; - addrMode |= FETCH_ADDR_K_CYCLICAL; - setFetchAddrMode(mulOpts.fctx, addrMode & ~addrMask); - } - - if (isRelA) { - genSetupItemPtr(ctx, &gset, MATRIX_A); - } - if (isRelB) { - genSetupItemPtr(ctx, &gset, MATRIX_B); - } - - /* - * Setup coordinates and check if they don't exceed matrix - */ - - sprintf(tmp, "\n" - "coord.y = %luu * (uint)get_global_id(%d);\n" - "coord.x = %luu * (uint)%s;\n", - subdims[1].y, i, subdims[1].x, globalIdB); - kgenAddStmt(ctx, tmp); - - genHitMatrixCheck(ctx, kflags); - genShiftPointers(ctx, &gset, kflags, true); - genZeroTile(ctx, &gset.tileCY); - - tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_GEMM, &gset, NULL); - - mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) - ? TILEMUL_MAD - : TILEMUL_MULADD; - - mulOpts.flags |= TILEMUL_EXTERN_RDECL; - mulOpts.flags |= kextraToTilemulFlags(CLBLAS_GEMM, kflags); - - sprintf(tmp, "for (uint k1 = 0; k1 < %s; k1 += %lu)", - alignedK, subdims[1].bwidth); - - prepareFetchLoop(ctx, mulOpts.fctx, &gset, CLMEM_GLOBAL_MEMORY, - CLMEM_GLOBAL_MEMORY); - - kgenBeginBranch(ctx, tmp); - ret = tileMulGen(ctx, &gset, &mulOpts); - if (ret != 0) { - goto out; - } - kgenEndBranch(ctx, NULL); // 0..K loop - kgenAddBlankLine(ctx); - - //Optionally handle tails along K - if (tailsK) { - setDefaultFetchAddrMode(mulOpts.fctx, &gset, addrMask, - tailStatus, true); - - vnames->sizeK = "K"; - pfPriv.fetchNumA = 0; - pfPriv.wholeA = 0; - pfPriv.funcID = CLBLAS_GEMM; - pfPriv.gset = &gset; - mulOpts.postFetch = defaultTilePostFetch; - mulOpts.postFetchPriv = &pfPriv; - - if (!(isColMajA || isColMajB)) { - kgenAddStmt(ctx, "coord.z = Kbase;\n"); - } - - sprintf(tmp, "for (uint k1 = 0u; k1 < Ktail; k1 += %luu)", - subdims[1].bwidth); - kgenBeginBranch(ctx, tmp); - ret = tileMulGen(ctx, &gset, &mulOpts); - if (ret != 0) { - goto out; - } - kgenEndBranch(ctx, NULL); // 0..Ktail loop - kgenAddBlankLine(ctx); - } - - gset.kextra = kextra; - checkGenRestoreTailCoords(ctx, &gset, tailStatus); - - upFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags); - upFlags |= tailStatusToUpresFlags(tailStatus); - upFlags |= UPRES_INDEXING_WITH_CONSTANTS; - genResultUpdateWithFlags(ctx, CLBLAS_GEMM, &gset, upFlags, - NULL, NULL, NULL); - - kgenEndFuncBody(ctx); - ret = kgenAddBlankLine(ctx); - - if (!ret) { - ret = (ssize_t)kgenSourceSize(ctx) + 1; - } - -out: - destroyFetchContext(mulOpts.fctx); - destroyKgenContext(ctx); - - return (ret < 0) ? -EOVERFLOW : ret; -} - -//----------------------------------------------------------------------------- - -/* the generator for subgroup access pattern - (used when A and B matrices are accessed row-major)*/ -static ssize_t -subgGen( - char *pBuf, - size_t buflen, - const struct SubproblemDim *pSubDims, - const struct PGranularity *pPGran, - void *pExtra ) -{ - struct KgenContext *pCtx; - CLBLASKernExtra *pKExtra = (CLBLASKernExtra*)pExtra; - KernelExtraFlags kflags = pKExtra->flags; - DataType dtype = pKExtra->dtype; - size_t staggered = ((extraData_t*)&pKExtra->solverPriv)->staggered; - char tmp[2048]; - BlasGenSettings gset; - TileMulOpts mulOpts; - ssize_t ret; - FetchOpts fopts; - TilePostFetchPrivate pfPriv; - UpdateResultFlags upResFlags = 0; - TailStatus tailStatus; - FetchAddrMode addrMode; - Kstring exprK; - SubgVarNames subVNames; - - KernelVarNames *vnames = NULL; - const char *alignedK; - - unsigned int vecLenA; - - bool isDoubleBased = isDoubleBasedType(dtype); - - bool tailsLowerK = ( (kflags & KEXTRA_TAILS_K_LOWER) != 0 ); - bool tailsM = ( (kflags & KEXTRA_TAILS_M) != 0 ); - bool tailsN = ( (kflags & KEXTRA_TAILS_N) != 0 ); - bool tailsLowerM = ( (kflags & KEXTRA_TAILS_M_LOWER) != 0 ); - bool tailsLowerN = ( (kflags & KEXTRA_TAILS_N_LOWER) != 0 ); - - unsigned int subgroupsA = 0; - unsigned int subgroupsB = 0; - - memset(&gset, 0, sizeof(gset)); - memset(&mulOpts, 0, sizeof(mulOpts)); - memset(&pfPriv, 0, sizeof(pfPriv)); - memset(&fopts, 0, sizeof(fopts)); - - memcpy( gset.subdims, pSubDims, sizeof(gset.subdims) ); - gset.pgran = pPGran; - gset.flags = BGF_DISTINCT_VECLEN | BGF_WHOLE_A | BGF_LD_IN_VECTORS; - gset.kextra = pKExtra; - - vnames = &gset.varNames; - // setting the basic names for kernel variables - vnames->A = "A"; - vnames->B = "B"; - vnames->C = "C"; - vnames->LDS = "scratch"; - vnames->sizeM = "M"; - vnames->sizeN = "N"; - vnames->lda = "lda"; - vnames->ldb = "ldb"; - vnames->ldc = "ldc"; - - vnames->alpha = "alpha"; - vnames->beta = "beta"; - - vnames->vectCoordA = "vca"; - vnames->vectCoordB = "vcb"; - vnames->k = exprK.buf; - - subgroupsA = (unsigned int)(gset.subdims[0].y/gset.subdims[1].y); - subgroupsB = (unsigned int)(gset.subdims[0].x/gset.subdims[1].x); - - initDefaultTiles(&gset, CLBLAS_GEMM, 0, PRIV_STORAGE_VARIABLE_SET); - - vecLenA = gset.tileA.vecLen; - - // channel offset based coordinate - ksprintf(&exprK, "( (uint)(get_group_id(0))*%lu + k )", staggered/vecLenA*vecLenA); - - // starting code generation-------------------------------------------------- - pCtx = createKgenContext(pBuf, buflen, true); - if ( pCtx == NULL) { - return -ENOMEM; - } - - //define required macros - /* B_BLK_H should be one of common vector sizes, - as matrix C is accessed by vectors of this length*/ - sprintf(tmp,"#define A_BLK_H %lu\n",gset.subdims[1].y); - kgenAddStmt(pCtx,tmp); - sprintf(tmp,"#define B_BLK_H %lu\n",gset.subdims[1].x); - kgenAddStmt(pCtx,tmp); - sprintf(tmp,"#define SUBG_ITEMS %d\n",pPGran->wgSize[0]); - kgenAddStmt(pCtx,tmp); - - sprintf(tmp,"#define SUBG_A %d\n",subgroupsA); - kgenAddStmt(pCtx,tmp); - sprintf(tmp,"#define SUBG_B %d\n",subgroupsB); - kgenAddStmt(pCtx,tmp); - - kgenAddBlankLine(pCtx); - - kgenAddStmt(pCtx,tmp); - sprintf( - tmp, - "#define K_VLEN_A %u\n" - "#define K_VLEN_B %u\n", - getVecLen(&gset, CLBLAS_GEMM, MATRIX_A), - getVecLen(&gset, CLBLAS_GEMM, MATRIX_B)); - - kgenAddStmt(pCtx,tmp); - kgenAddBlankLine(pCtx); - - // Declare pointer unions - kgenDeclareUptrs(pCtx, isDoubleBased); - kgenAddBlankLine(pCtx); - - // declaring kernel function - declareKernel( pCtx, &gset, "Subgroup" ); - ret = kgenBeginFuncBody( pCtx ); - // kernel generation steps: - - // register variables declarations----------------------------------------- - - // K tail - // if postfetch should be engaged, generate tail code for - // whole subgroup, otherwise tail is handled by main cycle. - if( tailsLowerK ){ - sprintf(tmp, - "uint Ktail = K %% %lu;\n" - "uint Kbase = K - Ktail;\n", - pSubDims[0].bwidth); - - kgenAddStmt(pCtx, tmp); - alignedK = "Kbase"; - } - else { - alignedK = "K"; - } - vnames->sizeK = alignedK; - - declareTileStorages(pCtx, &gset); - - // scaling leading dims - // If lower-K tails need to be handled, vectorized access is disabled - // scaling is performed by factor 1 - sprintf(tmp, "%s /= K_VLEN_A;\n", vnames->lda); - kgenAddStmt(pCtx, tmp); - sprintf(tmp, "%s /= K_VLEN_B;\n", vnames->ldb); - kgenAddStmt(pCtx, tmp); - - //declare variables for subgroup mode - subVNames.itemId = "itemId"; - - kgenAddBlankLine( pCtx ); - - kgenPrintf( pCtx, "int2 %s;\n", subVNames.itemId ); - - // item id - kgenPrintf( pCtx, - "%s.x = get_local_id(0);\n", - subVNames.itemId ); - - // subgroup id - kgenPrintf( pCtx, - "%s.y = get_local_id(1);\n", - subVNames.itemId ); - - kgenAddBlankLine( pCtx ); - - // coordinate variables - vnames->coordA = "coordY"; - vnames->coordB = "coordX"; - - // generate offsets - genShiftPointers( pCtx, &gset, kflags, true ); - - // FIXME add new subgroup variables support - sprintf(tmp, "int %s = " - "A_BLK_H*( " - "get_group_id(1)*SUBG_A + " - "get_local_id(1)/SUBG_B );\n", - vnames->coordA); - kgenAddStmt(pCtx, tmp); - - sprintf(tmp, "int %s = " - "B_BLK_H*( " - "get_group_id(0)*SUBG_B + " - "get_local_id(1)%%SUBG_B );\n", - vnames->coordB); - - kgenAddStmt(pCtx, tmp); - kgenAddBlankLine(pCtx); - - // Block M N tails. Drop excess blocks ------------------------------------ - kgenAddStmt(pCtx,"uint skipTileMul = 0;\n"); - //M - if( tailsM ){ - - kgenAddStmt(pCtx,"//M block tail\n"); - - sprintf(tmp, - "if( %s >= %s )", - vnames->coordA, - vnames->sizeM); - - kgenBeginBranch( pCtx,tmp ); - kgenAddStmt(pCtx,"skipTileMul = 1;\n"); - kgenEndBranch(pCtx,NULL); - - } - - //N - if( tailsN ){ - - kgenAddStmt(pCtx,"//N block tail\n"); - - sprintf(tmp, - "if( %s >= %s )", - vnames->coordB, - vnames->sizeN); - - kgenBeginBranch( pCtx,tmp ); - kgenAddStmt(pCtx,"skipTileMul = 1;\n"); - kgenEndBranch(pCtx,NULL); - - } - kgenAddBlankLine(pCtx); - - //"Lower" tails - if( tailsLowerM || tailsLowerN ){ - kgenAddStmt(pCtx, "//Raising \"Lower\" M N tails\n"); - } - tailStatus = checkGenAdjustTailCoords(pCtx, CLBLAS_GEMM, &gset, NULL); - - // A, B pointers----------------------------------------------------------- - - sprintf(tmp, - "A += %s*%s;\n", - vnames->lda, - vnames->coordA); - - kgenAddStmt(pCtx, tmp); - - sprintf(tmp, - "B += %s*%s;\n", - vnames->ldb, - vnames->coordB); - - kgenAddStmt(pCtx, tmp); - - // calculated in vectors, C access is aligned to. - // if row of C-block is splitted into smaller vectors - - // multiply offset by number of these vectors - - kgenAddBlankLine(pCtx); - - genZeroTile( pCtx, &gset.tileCY ); - - kgenAddBlankLine(pCtx); - kgenAddBlankLine(pCtx); - - mulOpts.fctx = createFetchContext(); - if (mulOpts.fctx == NULL) { - destroyKgenContext(pCtx); - return -ENOMEM; - } - - enableFetchOptLevels(mulOpts.fctx, - FOPTLEV_CAN_SHARE_TMP_AB); - - addrMode = setDefaultFetchAddrMode(mulOpts.fctx, - &gset, - FETCH_ADDR_K_RELATIVE, - tailStatus, - false); - - addrMode |= FETCH_ADDR_A_RELATIVE | - FETCH_ADDR_B_RELATIVE | - FETCH_ADDR_K_CYCLICAL; - - setFetchAddrMode(mulOpts.fctx, addrMode); - prepareFetchLoop(pCtx, - mulOpts.fctx, - &gset, - CLMEM_GLOBAL_MEMORY, - CLMEM_GLOBAL_MEMORY); - - if( tailsM || tailsN ){ - kgenBeginBranch(pCtx,"if( !skipTileMul )"); - } - - sprintf(tmp, - "for(int k = %u*get_local_id(0); k < %s; k += %u*SUBG_ITEMS)", - vecLenA, - alignedK, - vecLenA); - kgenBeginBranch( pCtx, tmp ); - - // tiles multiplier-------------------------------------------------------- - - mulOpts.memA = CLMEM_GLOBAL_MEMORY; - mulOpts.memB = CLMEM_GLOBAL_MEMORY; - - mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? TILEMUL_MAD : - TILEMUL_MULADD; - - mulOpts.flags = kextraToTilemulFlags( CLBLAS_GEMM, kflags ); - mulOpts.flags |= TILEMUL_EXTERN_RDECL; - mulOpts.flags |= TILEMUL_NOT_INC_K; - mulOpts.flags |= TILEMUL_BW_STRIDE; - /* both matrices are accessed row - major */ - mulOpts.flags |= TILEMUL_TRB; - - ret = tileMulGen( pCtx, &gset, &mulOpts ); - if (ret != 0) { - goto out; - } - - kgenEndBranch(pCtx, NULL); - kgenAddBlankLine(pCtx); - - // K - Tail - if ( tailsLowerK ) { - setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_TAILK_PADD); - - vnames->sizeK = "K"; - vnames->k = "k"; - - kgenPrintf(pCtx, - "uint %s = %s + get_local_id(0)*%u;\n", - vnames->k, - alignedK, - vecLenA); - - pfPriv.fetchNumA = 0; - pfPriv.wholeA = 0; - pfPriv.funcID = CLBLAS_GEMM; - pfPriv.gset = &gset; - mulOpts.postFetch = defaultTilePostFetch; - mulOpts.postFetchPriv = &pfPriv; - - kgenBeginBranch(pCtx, NULL); - ret = tileMulGen(pCtx, &gset, &mulOpts); - if (ret != 0) { - goto out; - } - kgenEndBranch(pCtx, NULL); - } - - if( tailsM || tailsN ){ - kgenEndBranch(pCtx, NULL); // skip tilemul condition - } - kgenAddBlankLine(pCtx); - - upResFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags) | - tailStatusToUpresFlags(tailStatus); - // restore coordinates, if tail was raised - checkGenRestoreTailCoords(pCtx, &gset, tailStatus); - // merge and update result - mergeUpdateResult( pCtx, - CLBLAS_GEMM, - &gset, - &subVNames, - upResFlags | - UPRES_EXCEED_PROBLEM_CONDITION | - UPRES_INDEXING_WITH_CONSTANTS, - (UpresProcPtr)genResultUpdateWithFlags ); - kgenEndFuncBody(pCtx); - - if (!ret) { - ret = (ssize_t)kgenSourceSize(pCtx) + 1; - } - -out: - destroyFetchContext(mulOpts.fctx); - destroyKgenContext(pCtx); - - return (ret < 0) ? -EOVERFLOW : ret; -} - -//----------------------------------------------------------------------------- - -static void -assignBlockKargs(KernelArg *args, const void *params, const void *extra) -{ - CLBlasKargs *blasArgs = (CLBlasKargs*)params; - KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; - int idx; - (void)extra; - - initSizeKarg(&args[0], blasArgs->M); - initSizeKarg(&args[1], blasArgs->N); - initSizeKarg(&args[2], blasArgs->K); - assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype); - assignScalarKarg(&args[4], &(blasArgs->beta), blasArgs->dtype); - INIT_KARG(&args[5], blasArgs->A); - INIT_KARG(&args[6], blasArgs->B); - INIT_KARG(&args[7], blasArgs->C); - initSizeKarg(&args[8], blasArgs->lda.matrix); - initSizeKarg(&args[9], blasArgs->ldb.matrix); - initSizeKarg(&args[10], blasArgs->ldc.matrix); - idx = 11; - if (kflags & KEXTRA_A_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offA); - } - if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offBX); - } - if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offCY); - } -} - -static bool -blockCheckCalcDecomp( - PGranularity *pgran, - SubproblemDim *subdims, - unsigned int subdimsNum, - DataType dtype, - int check) -{ - bool ret = true; - bool ret_multiple = false; - int i; - - DUMMY_ARG_USAGE(subdimsNum); - - if (check == PGRAN_CHECK) { - unsigned int minSize, maxSize; - - maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8; - minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 1 : 2; - ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true); - ret = ret && (subdims[0].bwidth == subdims[1].bwidth); - for(i = 0; i < ( (pgran->maxWorkGroupSize) / (pgran->wfSize) ); i++) - { - // returns true if wgSize[0] * wgSize[1] is multiples of the 64 but not bigger than maxWorkGroupSize - ret_multiple = ret_multiple || ( pgran->wgSize[0] * pgran->wgSize[1] == pgran->wfSize * (i + 1) ); - } - ret = ret && ret_multiple; - } - else { - calcPgranDedicated(pgran, subdims, 1, 3); - } - - return ret; -} - -//----------------------------------------------------------------------------- - -static void -assignSubgKargs(KernelArg *args, const void *params, const void *extra) -{ - CLBlasKargs *blasArgs = (CLBlasKargs*)params; - KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; - int idx = 0; - (void)extra; - - initSizeKarg(&args[0], blasArgs->M); - initSizeKarg(&args[1], blasArgs->N); - initSizeKarg(&args[2], blasArgs->K); - assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype); - assignScalarKarg(&args[4], &(blasArgs->beta), blasArgs->dtype); - INIT_KARG(&args[5], blasArgs->A); - INIT_KARG(&args[6], blasArgs->B); - INIT_KARG(&args[7], blasArgs->C); - initSizeKarg(&args[8], blasArgs->lda.matrix); - initSizeKarg(&args[9], blasArgs->ldb.matrix); - initSizeKarg(&args[10], blasArgs->ldc.matrix); - idx = 11; - if (kflags & KEXTRA_A_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offA); - } - if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offBX); - } - if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { - initSizeKarg(&args[idx++], blasArgs->offCY); - } - - return; -} - -//----------------------------------------------------------------------------- - -static DecompositionAxis -innerDecompositionAxis(const void *args) -{ - const CLBlasKargs *kargs = args; - int tra, trb; - - tra = (kargs->order == clblasColumnMajor) ^ - (kargs->transA != clblasNoTrans); - trb = (kargs->order == clblasRowMajor) ^ - (kargs->transB != clblasNoTrans); - - /* - * Make the matrix B inner if every thread should access their elements - * with a large stride but accesses elements of the matrix A sequentially - * to provide more coalesced memory accesses. - */ - return (!tra && trb) ? DECOMP_AXIS_X : DECOMP_AXIS_Y; -} - -//----------------------------------------------------------------------------- - -static SolverFlags -solverFlags(void) -{ - return (SF_WSPACE_2D); -} - -//----------------------------------------------------------------------------- - -static void -fixupArgs(void *args, SubproblemDim *subdims, void *extra) -{ - CLBlasKargs *kargs = (CLBlasKargs*)args; - extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv; - - const size_t nChans = 8; // !!!DEVICE DEPENDED!!! - const size_t wideChans = 64; // !!!DEVICE DEPENDED!!! - const size_t sizeType[] = {1,2,2,4}; - - size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype]; - size_t off = kargs->K % sizeBlock; - if (off == 0) { - extraData->staggered = roundUp(subdims[1].bwidth * sizeType[kargs->dtype] - , wideChans / sizeType[kargs->dtype]); - } - else { - extraData->staggered = 0; - } -} - -//----------------------------------------------------------------------------- - -void -InitGEMMCachedBlockPattern(MemoryPattern *mempat) -{ - mempat->name = "Cached global memory based block gemm"; - mempat->nrLevels = 2; - mempat->cuLevel = 0; - mempat->thLevel = 1; - mempat->sops = &blockSOps; - - mpatExtra.aMset = CLMEM_LEVEL_L1; - mpatExtra.bMset = CLMEM_LEVEL_L1; - mpatExtra.mobjA = CLMEM_BUFFER; - mpatExtra.mobjB = CLMEM_BUFFER; - mempat->extra = &mpatExtra; -} - -//----------------------------------------------------------------------------- - -static int -blockGetPerf( - unsigned int kflags, - const void *args) -{ - (void)args; - - if( !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_A ) && - !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_B ) ){ - - return PPERF_AVERAGE; - } - - return PPERF_GOOD; -} - -//----------------------------------------------------------------------------- - -void -InitGEMMCachedSubgroupPattern(MemoryPattern *mempat) -{ - mempat->name = "Cached global memory based subgroup gemm"; - mempat->nrLevels = 2; - mempat->cuLevel = 0; - mempat->thLevel = 1; - mempat->sops = &subgSOps; - - mpatExtra.aMset = CLMEM_LEVEL_L1; - mpatExtra.bMset = CLMEM_LEVEL_L1; - mpatExtra.mobjA = CLMEM_BUFFER; - mpatExtra.mobjB = CLMEM_BUFFER; - mempat->extra = &mpatExtra; -} - -//----------------------------------------------------------------------------- - -static int -gemmSubgGetDefaultDecomp( PGranularity *pgran, - SubproblemDim *subdims, - unsigned int subdimsNum, - void *pArgs ) -{ - DUMMY_ARG_USAGE(subdimsNum); - pgran->wgDim = 2; - return subgGetDefaultDecomp( pgran, subdims, pArgs ); -} - -//----------------------------------------------------------------------------- - -static bool -subgCheckCalcDecomp( - PGranularity *pgran, - SubproblemDim *subdims, - unsigned int subdimsNum, - DataType dtype, - int check) -{ - unsigned int subgroupsA = 0; - unsigned int subgroupsB = 0; - unsigned int itemsPerSubg = 0; - unsigned int regUse = 0; - - //EINVAL - if( (subdimsNum<2)|| - (NULL==pgran)|| - (NULL==subdims) ){ - - return false; - } - - if( 0 == subdims[0].x || - 0 == subdims[0].y || - 0 == subdims[0].bwidth || - 0 == subdims[1].x || - 0 == subdims[1].y || - 0 == subdims[1].bwidth ){ - - return false; - } - - if( subdims[1].x != subdims[1].itemX || - subdims[1].y != subdims[1].itemY ){ - - return false; - } - - // the group block must consist of integer number of subgroup blocks - if( subdims[0].x % subdims[1].x || - subdims[0].y % subdims[1].y || - subdims[0].bwidth % subdims[1].bwidth ){ - - return false; - } - - if( !(isDoubleBasedType(dtype) && isComplexType(dtype) ) ){ - - if ( subdims[1].x < 2 || subdims[1].y < 2 || subdims[1].bwidth < 2 ) { - - return false; - } - } - - // check dimensions - if( subdims[1].bwidth > 8 || - subdims[1].x > 8 || - subdims[1].y > 8 ){ - - return false; - } - - // estimate register usage, drop - // inevitably slowed decompositions - regUse = - ( subdims[1].bwidth * subdims[1].x + - subdims[1].bwidth * subdims[1].y + - subdims[1].x * subdims[1].y ) * - dtypeSize(dtype); - - regUse /= 16; // 16 bytes per register - - if( regUse >= 50 ){ - return false; - } - - // validate the subgroup decomposition - itemsPerSubg = subdims[0].bwidth/subdims[1].bwidth; - - subgroupsA = subdims[0].y/subdims[1].y; - subgroupsB = subdims[0].x/subdims[1].x; - - // passed PGranularity should be checked - if( PGRAN_CHECK == check ){ - - if( pgran->wgSize[0] != itemsPerSubg || - pgran->wgSize[1] != subgroupsA*subgroupsB ){ - - return false; - } - - //filter subgroup numbers with poor performance - //(less than 2 items in subgroup) - if( pgran->wgSize[0] < 2 ){ - return false; - } - - // drop groups consisting of number of items other than 64 - if( pgran->wgSize[0] * pgran->wgSize[1] != 64 ){ - return false; - } - } - // PGranularity should be calculated - else{ - pgran->wgSize[0] = itemsPerSubg; - pgran->wgSize[1] = subgroupsA*subgroupsB; - } - - pgran->wgDim = 2; - - /*Debug out for Tune*/ - - return true; -} - -//----------------------------------------------------------------------------- - -static void -subgCalcGlobalThreads( - size_t threads[2], - const SubproblemDim *subdims, - const PGranularity *pgran, - const void *args, - const void *extra -) -{ - CLBlasKargs *pArgs; - - //EINVAL - if( NULL == subdims || - NULL == pgran || - NULL == args || - NULL == extra) - { - return; - } - pArgs = (CLBlasKargs*)args; - - threads[0] = (pArgs->N/subdims[0].x)*pgran->wgSize[0]; - threads[1] = (pArgs->M/subdims[0].y)*pgran->wgSize[1]; - - // N tail group - if( pArgs->N%subdims[0].x ){ - threads[0] += pgran->wgSize[0]; - } - // M tail group - if( pArgs->M%subdims[0].y ){ - threads[1] += pgran->wgSize[1]; - } -} - -//----------------------------------------------------------------------------- -static int -subgGetPerf( - unsigned int kflags, - const void *args) -{ - DUMMY_ARG_USAGE(args); - - if( !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_A ) && - !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_B ) ){ - - return PPERF_GOOD; - } - - return PPERF_NOT_SUPPORTED; -} |