summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/gemm_tail_cached.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/gemm_tail_cached.cpp')
-rw-r--r--external/clBLAS/src/library/blas/gens/gemm_tail_cached.cpp462
1 files changed, 0 insertions, 462 deletions
diff --git a/external/clBLAS/src/library/blas/gens/gemm_tail_cached.cpp b/external/clBLAS/src/library/blas/gens/gemm_tail_cached.cpp
deleted file mode 100644
index ff144af9..00000000
--- a/external/clBLAS/src/library/blas/gens/gemm_tail_cached.cpp
+++ /dev/null
@@ -1,462 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-/*
- * Cached global buffers based gemm generator
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-#include <clblas_stddef.h>
-#include <clBLAS.h>
-#include <blas_mempat.h>
-#include <clkern.h>
-#include <clblas-internal.h>
-#include <kprintf.hpp>
-#include <gemm.clT>
-#include <symm_helper.clT>
-#include <solution_seq.h>
-
-extern "C" int
-gemmHasNTail(size_t N, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB);
-
-extern "C" int
-gemmHasMTail(size_t M, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB);
-
-
-//#define DEBUG_GEMM_TAIL
-static CLBLASMpatExtra mpatExtra; // filled in and attached to the pattern by initGemmV2TailCachedPattern()
-
-static char Prefix[4]; // one letter per data type (S/D/C/Z), indexed by the TYPE_* constants; set in initGemmV2TailCachedPattern()
-
-static ssize_t
-generator( // produces the kernel source text; with buf == NULL it only reports the required size
- char *buf,
- size_t buflen,
- const struct SubproblemDim *subdims,
- const struct PGranularity *pgran,
- void *extra);
-
-static void
-assignKargs(KernelArg *args, const void *params, const void *extra); // fills args[0..15] from a CLBlasKargs
-
-static SolverFlags
-solverFlags(void); // reports SF_WSPACE_1D
-
-static void
-setBuildOpts( // appends -D options derived from a SolutionStep to buildOptStr
- char * buildOptStr,
- const void *args);
-
-static void
-calcNrThreads( // computes the 1D global work size into threads[0]; threads[1] is set to 1
- size_t threads[2],
- const SubproblemDim *subdims,
- const PGranularity *pgran,
- const void *args,
- const void *extra);
-
-static SolverOps gemmSops = { // callback table published via mempat->sops; initializer is positional, so slot order matters
- generator,
- assignKargs,
- NULL, // NULL slots: callbacks this solver does not supply (presumably optional - confirm against the SolverOps declaration)
- NULL,
- NULL,
- calcNrThreads,
- NULL,
- solverFlags,
- NULL,
- NULL,
- NULL,
- setBuildOpts,
- NULL
-};
-
-/*
- * Append the compiler defines needed by the tail kernel to 'buildOptStr'.
- *
- * 'args' points to the SolutionStep being built. Its kernel arguments supply
- * the data type, the calling BLAS function (pigFuncID), side, uplo and
- * order; its extra flags supply the conjugation requests. Both tail markers
- * (M and N) are always defined.
- */
-static void
-setBuildOpts(
-    char * buildOptStr,
-    const void *args)
-{
-    const SolutionStep *solStep = (const SolutionStep *)args;
-    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
-    KernelExtraFlags extraFlags = solStep->extraFlags;
-
-    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT");
-
-    // Precision: both double-based types need the double precision define.
-    switch (blasArgs->dtype) {
-    case TYPE_DOUBLE:
-    case TYPE_COMPLEX_DOUBLE:
-        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
-        #ifdef DEBUG_GEMM_TAIL
-        printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
-        #endif
-        break;
-    default:
-        break;
-    }
-
-    if (isComplexType(blasArgs->dtype)) {
-        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
-    }
-
-    // Conjugation requests travel in the step's extra flags.
-    if (extraFlags & KEXTRA_CONJUGATE_A) {
-        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A");
-    }
-    if (extraFlags & KEXTRA_CONJUGATE_B) {
-        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B");
-    }
-
-    // Defines that depend on which BLAS routine is reusing this kernel.
-    switch (blasArgs->pigFuncID) {
-    case CLBLAS_GEMM2:
-    case CLBLAS_GEMM_TAIL:
-        // Plain GEMM needs nothing beyond the common options.
-        break;
-
-    case CLBLAS_HERK:
-        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK");
-        switch (blasArgs->uplo) {
-        case clblasLower:
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE");
-            break;
-        case clblasUpper:
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE");
-            break;
-        default:
-            break;
-        }
-        break;
-
-    case CLBLAS_HEMM:
-    case CLBLAS_SYMM_DIAGONAL:
-    case CLBLAS_HEMM_DIAGONAL:
-    case CLBLAS_SYMM:
-        #ifdef DEBUG_GEMM_2
-        printf("GEMM2: setBuildOpts: Setting options for SYMM\n");
-        #endif
-        switch (blasArgs->side) {
-        case clblasLeft:
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
-            break;
-        case clblasRight:
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
-            break;
-        default:
-            break;
-        }
-        switch (blasArgs->uplo) {
-        case clblasLower:
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
-            break;
-        case clblasUpper:
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
-            break;
-        default:
-            break;
-        }
-        // The storage order is still defined for the sake of legacy code.
-        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN,
-                     (blasArgs->order == clblasColumnMajor) ? "-D__SYMM_COLMAJOR__"
-                                                            : "-D__SYMM_ROWMAJOR__");
-        if (blasArgs->pigFuncID == CLBLAS_SYMM_DIAGONAL ||
-            blasArgs->pigFuncID == CLBLAS_HEMM_DIAGONAL) {
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__");
-        }
-        if (blasArgs->pigFuncID == CLBLAS_HEMM_DIAGONAL) {
-            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__");
-        }
-        break;
-
-    default:
-        printf("GEMM TAIL: Unknown pigFuncID\n");
-        break;
-    }
-    #ifdef DEBUG_GEMM_TAIL
-    printf("GEMMTAIL: Build options = %s\n", buildOptStr);
-    #endif
-}
-
-/*
- * Ceiling division for work-group counts.
- *
- * Returns 0 for an empty extent. The unsigned form "(n - 1)/d + 1" wraps
- * around when n == 0 and only produces a sane global size by accident of
- * later multiplications. 'd' must be non-zero.
- */
-static size_t
-gemmTailCeilDiv(size_t n, size_t d)
-{
-    return (n == 0) ? 0 : ((n - 1) / d + 1);
-}
-
-/*
- * Compute the 1D global work size for the tail kernel.
- *
- * threads  - output; threads[0] receives (number of work-groups * work-group
- *            size) and threads[1] is set to 1.
- * subdims  - subdims->y gives the number of rows (Y) covered by one
- *            work-group; the number of columns is X = work-group size / Y.
- * pgran    - pgran->wgSize[0] is the 1D work-group size.
- * args     - CLBlasKargs carrying M, N, tailStartM and tailStartN.
- * extra    - CLBLASKernExtra carrying the kernel flags and vector length.
- *
- * Two panels are covered: the rows [tailStartM, M) across columns
- * [0, tailStartN) ("A" panel) and the columns [tailStartN, N) across all M
- * rows ("B" panel).
- *
- * On an unsupported configuration (row-major, vector length != 1, or a
- * work-group geometry that would divide by zero) a message is printed and
- * 'threads' is left untouched, as the function has no way to report failure.
- */
-static void
-calcNrThreads(
-    size_t threads[2],
-    const SubproblemDim *subdims,
-    const PGranularity *pgran,
-    const void *args,
-    const void *extra)
-{
-    const size_t BLOCKSIZE = pgran->wgSize[0]; // 1D Block
-    size_t tailM, tailN, M, N;
-    size_t Y, X;
-    // Zero-initialised so the debug trace below never reads garbage when a
-    // panel is absent.
-    size_t nWorkGroupsAY = 0, nWorkGroupsAX = 0, nWorkGroupsA = 0;
-    size_t nWorkGroupsBY = 0, nWorkGroupsBX = 0, nWorkGroupsB = 0;
-    size_t totalWorkGroups;
-    #ifdef DEBUG_GEMM_TAIL
-    printf("calcNrThreads called from gemm_tail.cpp\n");
-    #endif
-    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
-    const CLBLASKernExtra *kextra = (const CLBLASKernExtra *)extra;
-    KernelExtraFlags kflags = kextra->flags;
-
-    //
-    // RowMajor GEMM can be expressed in terms of Column Major GEMM
-    //
-    if ((kflags & KEXTRA_COLUMN_MAJOR) == 0)
-    {
-        printf("calcNrThreads: FIXME: RowMajor is NOT supported \n");
-        return;
-    }
-
-    if (kextra->vecLenA != 1)
-    {
-        printf("GEMM_TAIL: calcNrThreads(): Vector Length must be 1 for TAIL. Non-one Vector Length Requested\n");
-        return;
-    }
-
-    tailM = kargs->tailStartM;
-    tailN = kargs->tailStartN;
-    M = kargs->M;
-    N = kargs->N;
-
-    // A tail work-group processes a Y x X panel.
-    Y = subdims->y;
-    if (Y == 0 || BLOCKSIZE < Y)
-    {
-        // Either division below would be a division by zero.
-        printf("GEMM_TAIL: calcNrThreads(): subdims->y is incompatible with the work-group size\n");
-        return;
-    }
-    X = BLOCKSIZE / Y;
-
-    /*
-        ______________
-        |           |  |
-        |           |  |
-        |           |  | B Tail panel
-        |___________|  |
-        |___________|__|
-        <--- A -->
-    */
-    if (tailM < M)
-    {
-        #ifdef DEBUG_GEMM_TAIL
-        printf("GEMM_TAIL: M has TAIL\n");
-        #endif
-        nWorkGroupsAY = gemmTailCeilDiv(M - tailM, Y);
-        // tailN may be 0 when all of N is tail; the A panel is then empty.
-        nWorkGroupsAX = gemmTailCeilDiv(tailN, X);
-        nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX;
-    }
-
-    if (tailN < N)
-    {
-        #ifdef DEBUG_GEMM_TAIL
-        printf("GEMM_TAIL: N has TAIL\n");
-        #endif
-        nWorkGroupsBY = gemmTailCeilDiv(M, Y);
-        nWorkGroupsBX = gemmTailCeilDiv(N - tailN, X);
-        nWorkGroupsB = nWorkGroupsBY * nWorkGroupsBX;
-    }
-
-    totalWorkGroups = nWorkGroupsA + nWorkGroupsB;
-
-    threads[0] = totalWorkGroups * BLOCKSIZE;
-    threads[1] = 1;
-    #ifdef DEBUG_GEMM_TAIL
-    // size_t is cast explicitly: "%lu" only matches it on LP64 targets.
-    printf("GEMM_TAIL: calcNrThreads(): vlen:%d, <tailM:%lu, M:%lu>, <tailN:%lu, N:%lu, nWorkGroupsA<%lu,%lu>, nWorkGroupsB<%lu,%lu>\n",
-           (int)kextra->vecLenA, (unsigned long)tailM, (unsigned long)M, (unsigned long)tailN, (unsigned long)N,
-           (unsigned long)nWorkGroupsAY, (unsigned long)nWorkGroupsAX,
-           (unsigned long)nWorkGroupsBY, (unsigned long)nWorkGroupsBX);
-    printf("GEMM_TAIL: calcNrThreads(): globalThreads0=%lu, globalThreads1=%lu\n",
-           (unsigned long)threads[0], (unsigned long)threads[1]);
-    #endif
-}
-
-/*
- * Generate the OpenCL source of the column-major GEMM tail kernel.
- *
- * buf     - destination buffer, or NULL to query the required size.
- * buflen  - capacity of 'buf' in bytes.
- * subdims - subdims->y and subdims->x give the tile handled per work-group.
- * pgran   - pgran->wgSize[0] is the 1D work-group size.
- * extra   - CLBLASKernExtra with the kernel flags, data type and vector
- *           length. Not dereferenced for a size query.
- *
- * Returns the fixed source size (32 KiB) on success and for size queries,
- * 0 when the row-major layout is requested (unsupported), and -1 when the
- * buffer is too small or the parameters would lead to an out-of-range
- * access or a division by zero. On success the unused remainder of 'buf' is
- * zero-filled.
- *
- * NOTE(review): kobj.spit() receives no buffer length, so this function
- * cannot bound the expanded kernel text itself - confirm that the expanded
- * templates stay below 32 KiB.
- */
-static ssize_t
-generator(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra)
-{
-    const size_t kernelSrcSize = 32*1024;
-
-    if (buf == NULL)
-    {
-        return (ssize_t)kernelSrcSize;
-    }
-
-    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
-    KernelExtraFlags kflags = kextra->flags;
-    DataType dtype = kextra->dtype;
-    char tempTemplate[kernelSrcSize];
-    // 24 characters hold any 64-bit decimal value plus the terminator.
-    char itemx[24], itemy[24], width[24], itemy_by_width[24], itemx_by_width[24];
-    size_t Y, X, BLOCKSIZE, ITEMX, ITEMY;
-    const char *kernelTemplate;
-
-    //
-    // PENDING: Add Support for Row Major at the xAPI.c level
-    // Row major calcs can be expressed in terms of column major
-    //
-    if ((kflags & KEXTRA_COLUMN_MAJOR) == 0)
-    {
-        return 0;
-    }
-
-    // The padding at the end writes up to kernelSrcSize bytes into 'buf'.
-    if (buflen < kernelSrcSize)
-    {
-        return -1;
-    }
-
-    if ((size_t)dtype >= sizeof(Prefix) / sizeof(Prefix[0]))
-    {
-        printf("GEMM TAIL - generator: unsupported data type\n");
-        return -1;
-    }
-
-    kprintf kobj(Prefix[dtype], 1, false, false); // Only Scalar Access
-
-    BLOCKSIZE = pgran->wgSize[0];
-    #ifdef DEBUG_GEMM_TAIL
-    printf("GEMM- generator(): Blocksize passed = %lu, subdimy = %lu, subdimx = %lu, veclen = %d \n",
-           (unsigned long)BLOCKSIZE, (unsigned long)subdims->y, (unsigned long)subdims->x, (int)kextra->vecLenA);
-    #endif
-
-    // One work-group covers Y rows and X = BLOCKSIZE / Y columns.
-    Y = subdims->y;
-    if (Y == 0 || BLOCKSIZE < Y || kextra->vecLenA == 0)
-    {
-        // Every one of these would end in a division by zero below.
-        printf("GEMM TAIL - generator: invalid work-group geometry or vector length\n");
-        return -1;
-    }
-    X = BLOCKSIZE/Y;
-    ITEMY = (subdims->y) / Y;
-    ITEMX = (subdims->x) / X;
-    if (ITEMX == 0)
-    {
-        ITEMX = 1;
-    }
-
-    if ((BLOCKSIZE % Y) || ((subdims->y) % Y) || ((subdims->x)%X) || (ITEMY % kextra->vecLenA) || ((X*ITEMX) % kextra->vecLenA))
-    {
-        printf("WARNING: GEMM TAIL - generator: subdim and blocksize in-compatible. This code should never execute!\n");
-    }
-
-    // size_t is cast explicitly: "%lu" only matches it on LP64 targets.
-    sprintf(width, "%lu", (unsigned long)Y);
-    sprintf(itemy, "%lu", (unsigned long)ITEMY);
-    sprintf(itemx, "%lu", (unsigned long)ITEMX);
-    sprintf(itemy_by_width, "%lu", (unsigned long)(ITEMY/kextra->vecLenA));
-    sprintf(itemx_by_width, "%lu", (unsigned long)(ITEMX/kextra->vecLenA));
-
-    kobj.put("%WIDTH", width);
-    kobj.put("%ITEMX", itemx);
-    kobj.put("%ITEMY", itemy);
-    kobj.put("%ITEMY_BY_V", itemy_by_width);
-    kobj.put("%ITEMX_BY_V", itemx_by_width);
-    kobj.put("%PANEL", "1");
-    kobj.put("%PANEL_BY_V", "1");
-    #ifdef DEBUG_GEMM_TAIL
-    printf("ColMajor GEMM - WIDTH = %s, ITEMX = %s, ITEMY = %s\n", width, itemx, itemy);
-    #endif
-
-    if ((kflags & KEXTRA_TRANS_A) != 0)
-    {
-        //
-        // NOTE: A^T * B Never leaves any tails. This should NEVER be called.
-        // PENDING: A^T * B^T support is PENDING
-        kernelTemplate = NULL;
-    }
-    else if (kflags & KEXTRA_TRANS_B)
-    {
-        #ifdef DEBUG_GEMM_TAIL
-        printf("GEMM_TAIL: Using GEMM_NT_KERNEL\n");
-        #endif
-        kernelTemplate = GEMM_NT_KERNEL;
-    }
-    else
-    {
-        #ifdef DEBUG_GEMM_TAIL
-        printf("GEMM_TAIL: Using GEMM_NN_KERNEL\n");
-        #endif
-        kernelTemplate = GEMM_NN_KERNEL;
-    }
-
-    if (kernelTemplate == NULL)
-    {
-        // Transposed A: an empty template is expanded, as before.
-        tempTemplate[0] = 0;
-    }
-    else
-    {
-        // Helper + kernel + terminator must fit the stack template buffer.
-        if (strlen(SYMM_HEMM_HELPER) + strlen(kernelTemplate) >= sizeof(tempTemplate))
-        {
-            printf("GEMM TAIL - generator: kernel template exceeds the template buffer\n");
-            return -1;
-        }
-        strcpy(tempTemplate, SYMM_HEMM_HELPER);
-        strcat(tempTemplate, kernelTemplate);
-    }
-
-    kobj.spit(buf, tempTemplate);
-    //#ifdef DEBUG_GEMM_TAIL
-    //printf("Kernel = \n%s\n", buf);
-    //#endif
-
-    // Zero the unused remainder so the whole fixed-size buffer is defined.
-    size_t tail = strlen(buf) + 1;
-    if (tail < kernelSrcSize)
-    {
-        memset(buf + tail, 0, kernelSrcSize - tail);
-    }
-    return (ssize_t)kernelSrcSize;
-}
-
-/*
- * Fill the kernel argument array from the BLAS call parameters.
- *
- * args   - array receiving 16 entries (indices 0..15).
- * params - CLBlasKargs describing the call.
- * The third parameter is unused.
- */
-static void
-assignKargs(KernelArg *args, const void *params, const void*)
-{
-    CLBlasKargs *callArgs = (CLBlasKargs*)params;
-
-    #ifdef DEBUG_GEMM_TAIL
-    printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n",
-           callArgs->alpha.argFloat, callArgs->alpha.argDouble, CREAL(callArgs->alpha.argFloatComplex), CIMAG(callArgs->alpha.argFloatComplex),
-           CREAL(callArgs->alpha.argDoubleComplex) , CIMAG(callArgs->alpha.argDoubleComplex));
-    printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n",
-           callArgs->beta.argFloat, callArgs->beta.argDouble, CREAL(callArgs->beta.argFloatComplex), CIMAG(callArgs->beta.argFloatComplex),
-           CREAL(callArgs->beta.argDoubleComplex) , CIMAG(callArgs->beta.argDoubleComplex));
-    printf("TailStartM = %lu, TailStartN = %lu\n", callArgs->tailStartM, callArgs->tailStartN);
-    #endif
-
-    // Matrix objects.
-    INIT_KARG(&args[0], callArgs->A);
-    INIT_KARG(&args[1], callArgs->B);
-    INIT_KARG(&args[2], callArgs->C);
-
-    // Problem sizes.
-    initSizeKarg(&args[3], callArgs->M);
-    initSizeKarg(&args[4], callArgs->N);
-    initSizeKarg(&args[5], callArgs->K);
-
-    // Leading dimensions.
-    initSizeKarg(&args[6], callArgs->lda.matrix);
-    initSizeKarg(&args[7], callArgs->ldb.matrix);
-    initSizeKarg(&args[8], callArgs->ldc.matrix);
-
-    // Offsets; the B and C offsets are carried in the offBX / offCY fields.
-    initSizeKarg(&args[9], callArgs->offA);
-    initSizeKarg(&args[10], callArgs->offBX);
-    initSizeKarg(&args[11], callArgs->offCY);
-
-    // Scalars, passed in the precision selected by dtype.
-    assignScalarKarg(&args[12], &(callArgs->alpha), callArgs->dtype);
-    assignScalarKarg(&args[13], &(callArgs->beta), callArgs->dtype);
-
-    // Where the tail region starts along M and N.
-    initSizeKarg(&args[14], callArgs->tailStartM);
-    initSizeKarg(&args[15], callArgs->tailStartN);
-}
-
-/*
- * Report the solver flags of this pattern: SF_WSPACE_1D only.
- */
-static SolverFlags
-solverFlags(void)
-{
-    const SolverFlags flags = SF_WSPACE_1D;
-
-    return flags;
-}
-
-/*
- * Register the cached-global-memory GEMM tail pattern in 'mempat'.
- *
- * Fills the file-level type-prefix table and pattern extra data, then
- * stores the pattern name, level layout, solver callbacks and extra data
- * pointer into 'mempat'.
- */
-extern "C"
-void
-initGemmV2TailCachedPattern(MemoryPattern *mempat)
-{
-    // Letter passed to kprintf for each data type.
-    Prefix[TYPE_FLOAT] = 'S';
-    Prefix[TYPE_DOUBLE] = 'D';
-    Prefix[TYPE_COMPLEX_FLOAT] = 'C';
-    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
-
-    // A and B are both plain buffers marked with the L1 memory level.
-    mpatExtra.mobjA = CLMEM_BUFFER;
-    mpatExtra.mobjB = CLMEM_BUFFER;
-    mpatExtra.aMset = CLMEM_LEVEL_L1;
-    mpatExtra.bMset = CLMEM_LEVEL_L1;
-
-    mempat->name = "Cached global memory based gemm tail";
-    mempat->nrLevels = 2;
-    mempat->cuLevel = 0;
-    mempat->thLevel = 1;
-    mempat->sops = &gemmSops;
-    mempat->extra = &mpatExtra;
-}
-