diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/legacy/gemm_img.c')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/legacy/gemm_img.c | 758 |
1 files changed, 0 insertions, 758 deletions
diff --git a/external/clBLAS/src/library/blas/gens/legacy/gemm_img.c b/external/clBLAS/src/library/blas/gens/legacy/gemm_img.c deleted file mode 100644 index 9fa19c15..00000000 --- a/external/clBLAS/src/library/blas/gens/legacy/gemm_img.c +++ /dev/null @@ -1,758 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - - -/* - * gemm image based generators - */ - -#include <string.h> -#include <stdio.h> -#include <math.h> -#include <clBLAS.h> -#include <matrix_dims.h> -#include <blas_mempat.h> -#include <clkern.h> -#include <clblas-internal.h> -#include <dis_warning.h> - -#include "blas_kgen_legacy.h" -#include "../gen_helper.h" -#include "gen_helper_legacy.h" - -static CLBLASMpatExtra mpatExtra; - -static const char *prepareImagesGemmDeclA = - "void __kernel\n" - "%cprepareImageA(\n" - " clblasOrder order,\n" - " clblasTranspose transA,\n" - " uint M,\n" - " uint K,\n" - " __global %s *A,\n" - " uint lda,\n" - " __write_only image2d_t imgA,\n" - " uint offsetA)\n"; - -static const char *prepareImagesGemmDeclB = - "void __kernel\n" - "%cprepareImageB(\n" - " clblasOrder order,\n" - " clblasTranspose transB,\n" - " uint N,\n" - " uint K,\n" - " __global %s *B,\n" - " uint ldb,\n" - " __write_only image2d_t imgB,\n" - " uint offsetB)\n"; - - -static const char *imgGemmDecl = - "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n" - "void __kernel\n" - "%cgemmImg(\n" - " const uint M,\n" - " const uint N,\n" - " const uint K,\n" - " const %s alpha,\n" - " const __read_only image2d_t A,\n" - " const __read_only image2d_t B,\n" - " const %s beta,\n" - " __global %s *C,\n" - " const uint ldc,\n" - " const uint offsetC)\n"; - -static ssize_t -generator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra); - -static ssize_t -preparator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra); - -static ssize_t -genWrapper( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra) -{ - CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; - if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) { - return generator(buf, buflen, subdims, pgran, extra); - } - else { - return preparator(buf, buflen, subdims, pgran, extra); - } -} - -static void -assignKargs(KernelArg *args, const void *params, const void *extra); - -static bool -isFitToLDS( - SubproblemDim *dim, - DataType dtype, - cl_ulong ldsSize, - const void *kernelArgs); - -static SolverFlags -solverFlags(void); - -static void -calcNrThreads( - size_t threads[2], - const SubproblemDim *subdims, - const PGranularity *pgran, - const void *args, - const void *extra); - -static int -imgGetPerf( - unsigned int kflags, - const void *args); - -static SolverOps imgSops = { - genWrapper, - assignKargs, - isFitToLDS, - imgGetPerf, - NULL, - calcNrThreads, - NULL, - solverFlags, - NULL, //fixupKargs - NULL, //getDefaultDecomp - NULL, //getDecompList - NULL, - NULL -}; - -// Preparation function for images based kernel generator -static ssize_t -preparator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra) -{ - struct KgenContext *ctx; - char tmp[4096], conjStr[1024]; - CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; - CopyImgFuncs copyImgFuncs; - DataType dtype = kextra->dtype; - BlasGenSettings gset; - unsigned int vecLen; - unsigned int tsize; - const char *typeName; - char fpref; - bool b; - size_t localBufSize; - ssize_t ret; - const char *conjCond; - - const char *functionHeadA = - "int tra, aligned;\n" - "const uint bpr = (K + %lu) / %lu;\n" - "uint m = (gid / bpr) * %lu;\n" - "uint k = (gid %% bpr) * %lu;\n" - "uint x, y;\n" - "__local %s temp[%lu];\n" - "\n" - "A += offsetA;\n" - "tra = (!transA && order == clblasColumnMajor) ||\n" - " (transA && order == clblasRowMajor);\n" - "if (m >= M) {\n" - " return;\n" - "}\n"; - - const char *functionHeadB = - "int trb, aligned;\n" - "const uint bpr = (K + %lu) / %lu;\n" - "const uint n = (gid / bpr) * %lu;\n" - "const uint k = (gid %% bpr) * %lu;\n" - "uint x, y;\n" - "__local %s temp[%lu];\n" - "\n" - "B += offsetB;\n" - "trb = (!transB && order == clblasRowMajor) ||\n" - " (transB && order == clblasColumnMajor);\n" - "if (n >= N) {\n" - " return;\n" - "}\n"; - - // Distribute blocks across compute units and copy matrix A to image. - // Transposition and filling with zeros in unaligned cases is made using - // buffer in local memory. - const char *copyToImageA = - "//copy matrix A block\n" - "y = m + %u <= M ? %u : M - m;\n" - "x = k + %u <= K ? %u : K - k;\n" - "aligned = (x == %u) && (y == %u) && %d;\n" - "int atcase = aligned * 10 + tra;\n" - "%s" // conjugated check - "if (atcase != 10) {\n" - " %s((__local float4*)temp);\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "}\n" - "switch(atcase) {\n" - "case 10: //aligned, not transposed\n" - " %s(imgA, k / %u, m, (GPtr)A, m, k, lda);\n" - " break;\n" - "%s" // conjugated case - "case 1: //not aligned, transposed\n" - " // generic transposed global to local\n" - " %s((LPtr)temp, (GPtr)A, k, m, x, y, %u, lda);\n" - " break;\n" - "case 0: //not aligned, not transposed\n" - " // generic global to local\n" - " %s((LPtr) temp, (GPtr)A, m, k, y, x, %u, lda);\n" - " break;\n" - "case 11: //aligned, transposed\n" - " // optimized transposed global to local\n" - " %s((LPtr) temp, (GPtr)A, k, m, lda);\n" - " break;\n" - "}\n" - "if (atcase != 10) {\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - " %s(imgA, k / %u, m, (LPtr) temp);\n" - "}\n" - "\n"; - - const char *copyToImageB = - "//copy matrix B block\n" - "y = n + %u <= N ? %u : N - n;\n" - "x = k + %u <= K ? %u : K - k;\n" - "aligned = (x == %u) && (y == %u) && %d;\n" - "int atcase = aligned * 10 + trb;\n" - "%s" // conjugated check - "if (atcase != 10) {\n" - " %s((__local float4*)temp);\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "}\n" - "switch (atcase) {\n" - "case 10: //aligned, not transposed\n" - " %s(imgB, k / %u, n, (GPtr)B, n, k, ldb);\n" - " break;\n" - "%s" // conjugated case - "case 1: //not aligned, transposed\n" - " // generic transposed global to local\n" - " %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n" - " break;\n" - "case 0: //not aligned, not transposed\n" - " // generic global to local\n" - " %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n" - " break;\n" - "case 11: //transposed, aligned\n" - " // optimized transposed global to local\n" - " %s((LPtr)temp, (GPtr)B, k, n, ldb);\n" - " break;\n" - "}\n" - "if (atcase != 10) {\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - " %s(imgB, k / %u, n, (LPtr)temp);\n" - "}\n" - "\n"; - - memset(©ImgFuncs, 0, sizeof(copyImgFuncs)); - memset(&gset, 0, sizeof(gset)); - - ctx = createKgenContext(buf, buflen, true); - if (ctx == NULL) { - return -ENOMEM; - } - - tsize = dtypeSize(dtype); - - b = isDoubleBasedType(dtype); - kgenDeclareUptrs(ctx, b); - declareBlasEnums(ctx); - - memcpy(gset.subdims, subdims, sizeof(gset.subdims)); - gset.kextra = kextra; - gset.pgran = pgran; - - // generate necessary memory to image copying functions - generateImageCopyFuncs(©ImgFuncs, ctx, CLBLAS_GEMM, &gset); - - kgenAddBlankLine(ctx); - vecLen = sizeof(cl_float4) / dtypeSize(dtype); - typeName = dtypeBuiltinType(dtype); - fpref = dtypeToBlasPrefix(dtype); - - if (kextra->kernType == CLBLAS_PREP_A_KERNEL) { - sprintf(tmp, prepareImagesGemmDeclA, fpref, typeName, typeName); - kgenDeclareFunction(ctx, tmp); - ret = kgenBeginFuncBody(ctx); - - // same local buffer is used for both matrix A and matrix B blocks - localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize); - localBufSize *= vecLen; - - kgenDeclareGroupID(ctx, "gid", pgran); - sprintf(tmp, functionHeadA, - subdims[1].bwidth - 1, subdims[1].bwidth, - subdims[1].y, subdims[1].bwidth, - typeName, localBufSize); - kgenAddStmt(ctx, tmp); - - if (isComplexType(dtype)) { - conjCond = "atcase += ((atcase == 10) && " - "(transA == clblasConjTrans)) ? 100 : 0;\n"; - sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n" - " %s((LPtr)temp, (GPtr)A, m, k, lda);\n" - " break;\n", - copyImgFuncs.globalToLocal[MATRIX_A]); - } - else { - conjCond = ""; - strcpy(conjStr, ""); - } - - sprintf(tmp, copyToImageA, - subdims[1].y, subdims[1].y, // y = m + dy <= M ?... - subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?... - subdims[1].bwidth, subdims[1].y, // aligned = (x==bw1)&&(y==dy1) - (kextra->flags & KEXTRA_NO_COPY_VEC_A) == 0, - conjCond, - copyImgFuncs.zeroBlock[MATRIX_A], - copyImgFuncs.globalToImage[MATRIX_A], - vecLen, - conjStr, - copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_A], - subdims[1].bwidth, - copyImgFuncs.globalToLocalGeneric[MATRIX_A], - subdims[1].bwidth, - copyImgFuncs.globalToLocalTransposed[MATRIX_A], - copyImgFuncs.localToImage[MATRIX_A], - vecLen); - kgenAddStmt(ctx, tmp); - } - else { // PREP_B - sprintf(tmp, prepareImagesGemmDeclB, fpref, typeName, typeName); - kgenDeclareFunction(ctx, tmp); - ret = kgenBeginFuncBody(ctx); - - // same local buffer is used for both matrix A and matrix B blocks - localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize); - localBufSize *= vecLen; - - kgenDeclareGroupID(ctx, "gid", pgran); - sprintf(tmp, functionHeadB, - subdims[1].bwidth - 1, subdims[1].bwidth, - subdims[1].x, subdims[1].bwidth, - typeName, localBufSize); - kgenAddStmt(ctx, tmp); - - if (isComplexType(dtype)) { - conjCond = "atcase += ((atcase == 10) && " - "(transB == clblasConjTrans)) ? 100 : 0;\n"; - sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n" - " %s((LPtr)temp, (GPtr)B, n, k, ldb);\n" - " break;\n", - copyImgFuncs.globalToLocal[MATRIX_B]); - } - else { - conjCond = ""; - strcpy(conjStr, ""); - } - - sprintf(tmp, copyToImageB, - subdims[1].x, subdims[1].x, // y = n + dy <= N ?... - subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?... - subdims[1].bwidth, subdims[1].x, // aligned = (x==bw1)&&(y==dx1) - (kextra->flags & KEXTRA_NO_COPY_VEC_B) == 0, - conjCond, - copyImgFuncs.zeroBlock[MATRIX_B], - copyImgFuncs.globalToImage[MATRIX_B], - vecLen, - conjStr, - copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_B], - subdims[1].bwidth, - copyImgFuncs.globalToLocalGeneric[MATRIX_B], - subdims[1].bwidth, - copyImgFuncs.globalToLocalTransposed[MATRIX_B], - copyImgFuncs.localToImage[MATRIX_B], - vecLen); - kgenAddStmt(ctx, tmp); - } - - kgenEndFuncBody(ctx); - - ret = kgenAddBlankLine(ctx); - - if (!ret) { - ret = (ssize_t)kgenSourceSize(ctx) + 1; - } - destroyKgenContext(ctx); - - return (ret < 0) ? -EOVERFLOW : ret; -} - -static void -initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) -{ - kvars->A = "imgA"; - kvars->B = "imgB"; - if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A)) { - kvars->coordA = "coordA.x"; - } - else { - kvars->coordA = "coordA.y"; - } - if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) { - kvars->coordB = "coordB.x"; - } - else { - kvars->coordB = "coordB.y"; - } - kvars->sizeM = "M"; - kvars->sizeN = "N"; - kvars->sizeK = "K"; -} - -// global memory based kernel generator -static ssize_t -generator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra) -{ - struct KgenContext *ctx; - CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; - char tmp[4096], tmp1[4096]; - char *p; - // is the iteration over N, N at the top level - const char *typeName; - char fpref; - DataType dtype = kextra->dtype; - ssize_t ret; - BlasGenSettings gset; - BlkMulOpts mulOpts; - unsigned int tsize; - unsigned int vecLen, outVecLen; - bool b; - const char *outTypeName; - unsigned int i; - unsigned int nrRegs, regPitch; - int tra, trb; - char vect[2] = {'y', 'x'}; - - const char *coordConstants = - "const uint workItemM = get_global_id(0) * %lu;\n" - "const uint workItemN = get_global_id(1) * %lu;\n" - "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n" - "uint vectK = (K + %u) / %u;\n"; - - /* - * template for image based gemm preparation part - * for two dimensional work space - */ - const char *localVariables = - "uint k0;\n" - "int2 coordA = (int2)(0, workItemM);\n" - "int2 coordB = (int2)(0, workItemN);\n" - "%s c[%u];\n\n"; - - tsize = dtypeSize(dtype); - vecLen = sizeof(cl_float4) / dtypeSize(dtype); - if (isComplexType(dtype)) { - regPitch = (unsigned int)subdims[1].x; - } - else { - regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) * - sizeof(cl_float4) / tsize; - } - - memset(&gset, 0, sizeof(gset)); - memcpy(gset.subdims, subdims, sizeof(gset.subdims)); - gset.kextra = kextra; - gset.pgran = pgran; - initKernelVarNames(&gset.varNames, kextra->flags); - - ctx = createKgenContext(buf, buflen, true); - if (ctx == NULL) { - return -ENOMEM; - } - - // at first, generate needed declarations and auxiliary functions - b = isDoubleBasedType(dtype); - kgenDeclareUptrs(ctx, b); - - typeName = dtypeBuiltinType(dtype); - fpref = dtypeToBlasPrefix(dtype); - - // now, generate the kernel - - sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref, - typeName, typeName, typeName); - kgenDeclareFunction(ctx, tmp); - ret = kgenBeginFuncBody(ctx); - - // constants - sprintf(tmp, coordConstants, - subdims[1].y, subdims[1].x, subdims[1].y, - vecLen - 1, vecLen); - kgenAddStmt(ctx, tmp); - - /* - * Calculate local buffer pitches, and then declare local - * variables - */ - getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); - - sprintf(tmp, localVariables, outTypeName, nrRegs); - kgenAddStmt(ctx, tmp); - - // check if offset exceeds matrix - kgenAddStmt(ctx, "if ((workItemM >= M) ||" - "(workItemN >= N)) {\n" - " return;\n" - "}\n"); - - kgenAddStmt(ctx, "C += offsetC;\n"); - - // zero C block - sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n" - " c[k0] = 0;\n" - "}\n\n", - nrRegs); - kgenAddStmt(ctx, tmp); - - // block multiplication inlined function - sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)", - subdims[1].bwidth / vecLen); - kgenBeginBranch(ctx, tmp); - - mulOpts.aMobj = CLMEM_IMAGE; - mulOpts.bMobj = CLMEM_IMAGE; - mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE; - if (isComplexType(dtype)) { - mulOpts.core = BLKMUL_SEPARATE_MULADD; - } - else { - mulOpts.core = BLKMUL_MAD; - } - mulOpts.argNames.coordA = "coordA"; - mulOpts.argNames.coordB = "coordB"; - mulOpts.argNames.skewCol = "skewCol"; - mulOpts.argNames.skewRow = "skewRow"; - mulOpts.argNames.k = "k0"; - mulOpts.argNames.vectBoundK = "vectK"; - ret = blkMulGen(ctx, subdims, dtype, &mulOpts); - if (ret) { - destroyKgenContext(ctx); - return -EOVERFLOW; - } - - // update image coordinates - sprintf(tmp, "\ncoordA.x += %lu;\n" - "coordB.x += %lu;\n", - subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen); - kgenAddStmt(ctx, tmp); - - kgenEndBranch(ctx, NULL); - - // reorder the given solution - outVecLen = isComplexType(dtype) ? 1 : vecLen; - p = tmp1; - for (i = 0; i < regPitch / outVecLen; i++) { - unsigned int k = (unsigned int)(subdims[1].y - 1) * - regPitch / outVecLen + i; - - sprintf(p, "\n" - " tmp = c[%u];\n" - " for (j = %lu; j >= 0; j--) {\n" - " c[(j+1) * %u + %u] = c[j * %u + %u];\n" - " }\n" - " c[%u] = tmp;\n", - k, subdims[1].y - 2, regPitch / outVecLen, - i, regPitch / outVecLen, i, i); - p += strlen(p); - } - sprintf(tmp, "\n" - "for (k0 = 0; k0 < skewRow.y; k0++) {\n" - " int j;\n" - " %s tmp;\n" - "%s" - "}\n" - "\n", - outTypeName, tmp1); - kgenAddStmt(ctx, tmp); - - tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A); - trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B); - sprintf(tmp, "coordA.%c = workItemM;\n" - "coordB.%c = workItemN;\n\n", - vect[tra], vect[trb]); - kgenAddStmt(ctx, tmp); - - // write back the tile evaluated - generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL); - - kgenEndFuncBody(ctx); - ret = kgenAddBlankLine(ctx); - - if (!ret) { - ret = (ssize_t)kgenSourceSize(ctx) + 1; - } - - destroyKgenContext(ctx); - - return (ret < 0) ? -EOVERFLOW : ret; -} - -static void -assignKargs(KernelArg *args, const void *params, const void *extra) -{ - const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; - - (void)extra; - - switch (blasArgs->kernType) { - case CLBLAS_COMPUTING_KERNEL: - // arguments for computational kernel - initSizeKarg(&args[0], blasArgs->M); - initSizeKarg(&args[1], blasArgs->N); - initSizeKarg(&args[2], blasArgs->K); - assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype); - INIT_KARG(&args[4], blasArgs->scimage[0]); - INIT_KARG(&args[5], blasArgs->scimage[1]); - assignScalarKarg(&args[6], &(blasArgs->beta), blasArgs->dtype); - initMemobjKarg(&args[7], blasArgs->C, NULL, 0, 0); - initSizeKarg(&args[8], blasArgs->ldc.matrix); - initSizeKarg(&args[9], blasArgs->offCY); - break; - case CLBLAS_PREP_A_KERNEL: - INIT_KARG(&args[0], blasArgs->order); - INIT_KARG(&args[1], blasArgs->transA); - initSizeKarg(&args[2], blasArgs->M); - initSizeKarg(&args[3], blasArgs->K); - initMemobjKarg(&args[4], blasArgs->A, NULL, 0, 0); - initSizeKarg(&args[5], blasArgs->lda.matrix); - INIT_KARG(&args[6], blasArgs->scimage[0]); - initSizeKarg(&args[7], blasArgs->offA); - break; - case CLBLAS_PREP_B_KERNEL: - INIT_KARG(&args[0], blasArgs->order); - INIT_KARG(&args[1], blasArgs->transB); - initSizeKarg(&args[2], blasArgs->N); - initSizeKarg(&args[3], blasArgs->K); - initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0); - initSizeKarg(&args[5], blasArgs->ldb.matrix); - INIT_KARG(&args[6], blasArgs->scimage[1]); - initSizeKarg(&args[7], blasArgs->offBX); - break; - default: - //this should not happen - break; - } -} - -static bool -isFitToLDS( - SubproblemDim *dim, - DataType dtype, - cl_ulong ldsSize, - const void *kernelArgs) -{ - cl_ulong size; - const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; - size = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side); - return (size * dtypeSize(dtype) <= ldsSize); -} - -static void -calcNrThreads( - size_t threads[2], - const SubproblemDim *subdims, - const PGranularity *pgran, - const void *args, - const void *extra) -{ - const CLBlasKargs *kargs = args; - (void)extra; - - if (kargs->kernType != CLBLAS_COMPUTING_KERNEL) { - const size_t *whole, *part; - size_t nrGroups; - - // each thread gets one block - - if (kargs->kernType == CLBLAS_PREP_A_KERNEL) { - whole = &kargs->M; - part = &subdims[0].itemY; - } - else { - whole = &kargs->N; - part = &subdims[0].itemX; - } - - nrGroups = *whole / *part + (*whole % *part != 0); - nrGroups *= (kargs->K / subdims[0].bwidth + - (kargs->K % subdims[0].bwidth != 0)); - threads[0] = pgran->wgSize[0] * nrGroups; - threads[1] = pgran->wgSize[1]; - } - else { - calcGlobalThreads(threads, &subdims[0], pgran, kargs->M, kargs->N); - } -} - -static SolverFlags -solverFlags(void) -{ - return (SF_WSPACE_2D); -} - -void -initGemmImgPattern(MemoryPattern *mempat) -{ - mempat->name = "Image based block gemm"; - mempat->nrLevels = 2; - mempat->cuLevel = 0; - mempat->thLevel = 1; - mempat->sops = &imgSops; - - mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; - mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; - mpatExtra.mobjA = CLMEM_IMAGE; - mpatExtra.mobjB = CLMEM_IMAGE; - mempat->extra = &mpatExtra; -} - -static int -imgGetPerf( - unsigned int kflags, - const void *args) -{ - (void)args; - (void)kflags; - - return PPERF_POOR; -} |