diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/trsv_gemv.cpp')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/trsv_gemv.cpp | 553 |
1 files changed, 0 insertions, 553 deletions
diff --git a/external/clBLAS/src/library/blas/gens/trsv_gemv.cpp b/external/clBLAS/src/library/blas/gens/trsv_gemv.cpp deleted file mode 100644 index ca73fbe5..00000000 --- a/external/clBLAS/src/library/blas/gens/trsv_gemv.cpp +++ /dev/null @@ -1,553 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - -/* - * trsv gemv generator - - * - * This generator generates code for the GEMV portion of TRSV. - * The idea is to call this routine after solving a subset of coefficients. - * This generator will help to update the RHS of remaining equations using the - * currently solved variables. - * The current clBLAS implementation of GEMV does not have support complex types. - * Hence, Need to write this kludge. - * One day, this should go away and be completely replaced by existing GEMV - * - * NOTE: - * This generator is highly tied to TRSV and is not a replacement for GEMV. - * In some cases, this generator generates code not only for updating the RHS - * but also for solving the next triangle (trtri based solve) as well. - * We have seen marginal performance increases (1GB/s) by doing so. - * If this is not important, one can replace this with GEMV when GEMV becomes - * feature-complete. - */ - -#include <string.h> -#include <stdio.h> -#include <assert.h> -#include <clblas_stddef.h> -#include <clBLAS.h> -#include <blas_mempat.h> -#include <clkern.h> -#include <clblas-internal.h> -#include <trsv_gemv.clT> -#include <kprintf.hpp> -#include <solution_seq.h> - -//#define DEBUG_TRSV_GEMV - -extern "C" -unsigned int dtypeSize(DataType type); - -static char Prefix[4]; // PENDING: Magic "4" == Number of data types supported (float, double, cl_float2, cl_double2) - -static SolverFlags -solverFlags(void) -{ - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV solverFlags(): solverFlags called......\n"); - #endif - - return (SF_WSPACE_1D); -} - -static bool isTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t &TARGETHEIGHT); - -static bool isNoTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, - size_t & TARGETROWS, size_t & TARGETWIDTH, size_t &NLOOPS); - -static void -calcNrThreads( - size_t threads[2], - const SubproblemDim *subdims, - const PGranularity *pgran, - const void *args, - const void *extra); - -static ssize_t -generator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra); - - -static void -assignKargs(KernelArg *args, const void *params, const void*); - -extern "C" -void initTrsvGemvDefaultPattern(MemoryPattern *mempat); - -static void -setBuildOpts( - char * buildOptStr, - const void *kArgs); - -static bool -isFitToLDS( - SubproblemDim *dim, - DataType dtype, - cl_ulong ldsSize, - const void *kernelArgs); - -static SolverOps trsvGemvOps = { - generator, - assignKargs, - isFitToLDS, - NULL, // Prepare Translate Dims - NULL, // Inner Decomposition Axis - calcNrThreads, - NULL, - solverFlags, - NULL, - NULL, - NULL, - setBuildOpts, - NULL -}; - -static void -setBuildOpts( - char * buildOptStr, - const void *args) -{ - const SolutionStep *step = (const SolutionStep *)args; - const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); - if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) - { - addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: Setting build options ... Double... for DOUBLE PRECISION support\n"); - #endif - } - if( kargs->pigFuncID == CLBLAS_TPSV) - { - addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); - #ifdef DEBUG_TRSV_GEMV - printf("TPSV GEMV: Setting build options ... PACKED\n"); - #endif - } - return; -} - -static CLBLASMpatExtra mpatExtra; - -extern "C" -void initTrsvGemvDefaultPattern(MemoryPattern *mempat) -{ - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: initTrsvGemvDefaultPattern called with mempat = 0x%p\n", (void*)mempat); - #endif - - mempat->name = "TRSV - GEMV Update Kernel"; - mempat->nrLevels = 2; - mempat->cuLevel = 0; - mempat->thLevel = 1; - mempat->sops = &trsvGemvOps; - - mpatExtra.aMset = CLMEM_LEVEL_L2; - mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; - mpatExtra.mobjA = CLMEM_BUFFER; // == No images - mpatExtra.mobjB = CLMEM_BUFFER; // == No images - mempat->extra = &mpatExtra; - - Prefix[TYPE_FLOAT] = 'S'; - Prefix[TYPE_DOUBLE] = 'D'; - Prefix[TYPE_COMPLEX_FLOAT] = 'C'; - Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; -} - -/* - * Helper function that helps in calculating the "TARGET WIDTH" of - * a block with Block Size needed for the case where - * "theight" number of variables have been solved. - * This is applicable only to NON-TRANSPOSE cases. - */ -static cl_ulong getTargetWidth(size_t theight, size_t blk_size, size_t vwidth) -{ - cl_ulong nLoops_v, nLoops; - // - // NOTE: This function should be called only for Non-Transpose cases - // NOTE: Does not check if the block size is suitable for our purposes - // NOTE: - nLoops_v = (theight * theight) / blk_size; - nLoops = nLoops_v / vwidth; - if (nLoops == 0) - { - return 0; - } - return theight/nLoops; -} - -static void -calcNrThreads( - size_t threads[2], - const SubproblemDim *subdims, - const PGranularity *pgran, - const void *args, - const void *_extra) -{ - size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block - CLBlasKargs *kargs = (CLBlasKargs *)args; - CLBLASKernExtra *extra = (CLBLASKernExtra*) _extra; - size_t blocks; - size_t vecLenA = extra->vecLenA; - - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: calcNrThreads() called \n"); - #endif - - if (((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) || - ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans))) - { - size_t rowsLeft, TARGETROWS; - - //CL, CU - TARGETROWS = subdims->y; - rowsLeft = kargs->endRow; - blocks = ((rowsLeft-1)/TARGETROWS) + 1; - } else { - size_t TARGETHEIGHT; - if (isTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETHEIGHT) == false) - { - threads[0] =0; threads[1] = 0; - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: calcNrThreads() WARNING: Returning 0\n"); - #endif - return; - } - if ( - ((kargs->uplo == clblasUpper) && (kargs->order == clblasColumnMajor)) || - ((kargs->uplo == clblasLower) && (kargs->order == clblasRowMajor)) - ) - { - blocks = ((kargs->N - kargs->endRow -1) / (BLOCKSIZE / TARGETHEIGHT)) + 1; - } else { - blocks = (kargs->startRow)/(BLOCKSIZE/TARGETHEIGHT) + 1; - } - } - - #ifdef DEBUG_TRSV_GEMV - printf("blocks : %lu\n", blocks); - #endif - threads[0] = blocks * BLOCKSIZE; - threads[1] = 1; - #ifdef DEBUG_TRSV_GEMV - printf("pgran-wgSize[0] : %d, globalthreads[0] : %lu\n", pgran->wgSize[0], threads[0]); - #endif - return; -} - -static bool isTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t &TARGETHEIGHT) -{ - size_t maxHeight; - - if (triangle % vecLen) - { - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isTransposeFeasible(): triangle not multiple of vectorLength\n"); - #endif - return false; - } - maxHeight = triangle/vecLen; - while (blockSize % maxHeight) - { - maxHeight--; - } - // maxHeight at minimum will be 1 - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isTransposeFeasible(): Target Height chosen = %lu\n", maxHeight); - #endif - TARGETHEIGHT = maxHeight; - return true; -} - -/* - * NOTE: - * No-Transpose case - The code iterates along the X direction. Vectoring is along Y Direction. - * Since we dont iterate on Y direction (triangle height), this fixes the "blocky" component of the blocksize. - * The blockSize then determines how much width the block has on X direction and thus the number of loops - * can be calculated from that information. - */ -static bool isNoTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, - size_t & TARGETROWS, size_t & TARGETWIDTH, size_t &NLOOPS) -{ - size_t blockx, blocky, nLoops; - - if ( ((triangle*triangle) % blockSize) != 0) - { - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isNoTransposeFeasible(): triangle*triangle not multiple of blockSize\n"); - #endif - return false; - } - - if (triangle % vecLen) - { - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isNoTransposeFeasible(): triangle not multiple of vectorLength\n"); - #endif - return false; - } - - blocky = triangle/vecLen; - if (blockSize % blocky) - { - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isNoTransposeFeasible(): blockSize not multiple of blocky\n"); - #endif - return false; - } - blockx = blockSize / blocky; - if (triangle % blockx) - { - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isNoTransposeFeasible(): blockSize not multiple of blocky\n"); - #endif - return false; - } - nLoops = triangle/blockx; - - TARGETROWS = triangle; - TARGETWIDTH = blockx; - NLOOPS = nLoops; - return true; -} - -// -// FIXME: Report correct return value when "buf" is NULL - Needs change in KPRINTF -// FIXME: Return correct return value when "buf" is NON NULL - Needs change in KPRINTF -// FIXME: "buflen" check needs to be more accurate. Relies on above changes to KPRINTF -// -static ssize_t -generator( - char *buf, - size_t buflen, - const struct SubproblemDim *subdims, - const struct PGranularity *pgran, - void *extra) -{ - CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; - unsigned int vecLenA = extraFlags->vecLenA; - char tempTemplate[32*1024]; - char TARGETROWS_S[10], NLOOPS_S[10], TARGETWIDTH_S[10]; - size_t TARGETROWS, NLOOPS, TARGETWIDTH; - char TARGETHEIGHT_S[10], BLOCKSIZE_S[10], TRIANGLE_HEIGHT_S[10]; - size_t TARGETHEIGHT; - bool doVLOAD = false; - int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // [1] will always be 1 since we are a 1D implementation - - if (buf == NULL) // PENDING: Return correct buffer size - { - return (32 * 1024 * sizeof(char)); - } - if (buflen > 32*1024) - { - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: generator(): WARNING: Returning 0 as buflen is > 32K\n"); - #endif - return 0; - } - - if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) - { - doVLOAD = true; - #ifdef DEBUG_TRSV_GEMV - printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); - #endif - } - else - { - #ifdef DEBUG_TRSV_GEMV - printf("Using Aligned Data Pointer .........................\n"); - #endif - } - kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD); - - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV GENERATOR called....\n"); - #endif - - clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; - clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; - clblasTranspose trans = - (extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans); - bool unit = (((extraFlags->flags) & KEXTRA_UNIT_DIAGONAL) != 0); - - // unity and doConj handled in setKernelArgs - if ( order == clblasRowMajor ) - { - order = clblasColumnMajor; - if ( trans == clblasNoTrans) - { - trans = clblasTrans; - } - else if ( trans == clblasTrans ) - { - trans = clblasNoTrans; - } - else // clblasConjTrans - { - trans = clblasNoTrans; - } - uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper; - } - - // - // Check Feasibility and then generate the code. - // - if ( trans != clblasNoTrans) - { - if (isTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETHEIGHT) == false) - { - return 0; - } - sprintf( TARGETHEIGHT_S, "%" SPREFIX "u", TARGETHEIGHT ); - sprintf( BLOCKSIZE_S, "%d", BLOCKSIZE ); - sprintf( TRIANGLE_HEIGHT_S, "%" SPREFIX "u", subdims->y ); - - kobj.put("%TARGET_HEIGHT", TARGETHEIGHT_S); - kobj.put("%BLOCKSIZE", BLOCKSIZE_S); - kobj.put("%TRIANGLE_HEIGHT", TRIANGLE_HEIGHT_S); - ( uplo == clblasLower )? - (strcpy(tempTemplate, (char*)trsv_CLT_ComputeRectangle_kernel)) : - (strcpy(tempTemplate, (char*)trsv_CUT_ComputeRectangle_kernel)); - - } - else // No-Transpose cases... - { - if (isNoTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETROWS, TARGETWIDTH, NLOOPS) == false) - { - return 0; - } - sprintf( TARGETROWS_S, "%" SPREFIX "u", TARGETROWS ); - sprintf( TARGETWIDTH_S, "%" SPREFIX "u", TARGETWIDTH ); - sprintf( NLOOPS_S, "%" SPREFIX "u", NLOOPS ); - kobj.put("%TARGET_ROWS", TARGETROWS_S); - kobj.put("%TARGET_WIDTH", TARGETWIDTH_S); - kobj.put("%NLOOPS", NLOOPS_S); - if (unit) - { - ( uplo == clblasLower )? - (strcpy(tempTemplate, (char*)trsv_CL_ComputeRectangle_kernel)) : (strcpy(tempTemplate, (char*)trsv_CU_ComputeRectangle_kernel)); - } else { - ( uplo == clblasLower )? - (strcpy(tempTemplate, (char*)trsv_CL_ComputeRectangle_NonUnity_kernel)) : (strcpy(tempTemplate, (char*)trsv_CU_ComputeRectangle_NonUnity_kernel)); - } - } - - #ifdef DEBUG_TRSV_GEMV - printf("dataType : %c\n", Prefix[extraFlags->dtype]); - #endif - - // FIXME: VECTORSIZE HARD CODED - // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint - - #ifdef DEBUG_TRSV_GEMV - printf("Vector length used : %d\n\n", vecLenA); - #endif - - kobj.spit((char*)buf, tempTemplate); - return (32 * 1024 * sizeof(char)); -} - -static void -assignKargs(KernelArg *args, const void *params, const void*) -{ - CLBlasKargs *blasArgs = (CLBlasKargs*)params; - cl_int inc; - cl_int unity, doConj; - - INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument - INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument - initSizeKarg(&args[2], blasArgs->N); - inc = blasArgs->ldb.vector; - INIT_KARG(&args[3], inc); - unity = (blasArgs->diag == clblasUnit); - INIT_KARG(&args[4], unity); - initSizeKarg(&args[5], blasArgs->lda.matrix); - doConj = (blasArgs->transA == clblasConjTrans); - #ifdef DEBUG_TRSV_GEMV - printf("TRMV GEMV: assignKargs: doConj is : %d, unity is : %d, incx is : %d\n", doConj, unity, inc); - printf("TRMV GEMV: startRow, startCol set to %d, %d\n", blasArgs->startRow, blasArgs->endRow); - #endif - INIT_KARG(&args[6], doConj); - INIT_KARG(&args[7], blasArgs->startRow); - INIT_KARG(&args[8], blasArgs->endRow); - initSizeKarg(&args[9], blasArgs->offa); - initSizeKarg(&args[10], blasArgs->offBX); - return; -} - -/* - * isFitToLDS() - * - * 1. We will assume "dim[0].y" as the TRIANGLE_HEIGHT oiow - The number of variables solved - * by the corresponding TRTRI kernel - * - * NOTE: - * 1. It is Possible that this function can cause "dim[0].y" to change from what was used in - * the "trtri" counterpart. - * In such a case, we will detect this in "xtrsv.c" and abort the TRSV call. - * 2. We may need to mellow down the bloated numbers we are returning down here. - */ -static bool -isFitToLDS( - SubproblemDim *dim, - DataType dtype, - cl_ulong ldsSize, - const void *kernelArgs) -{ - CLBlasKargs *blasArgs = (CLBlasKargs *)kernelArgs; - size_t MAXBLOCKSIZE = 256; - cl_ulong maxSize; - - if ( - ((blasArgs->transA == clblasNoTrans) && (blasArgs->order == clblasColumnMajor)) || - ((blasArgs->transA != clblasNoTrans) && (blasArgs->order == clblasRowMajor)) - ) - { - // - // Estimate worst case Local Memory needed - Vector Width of 4 irrespective of data-type? - // - cl_ulong tw; - - tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4); - if (tw == 0) - { - do { - MAXBLOCKSIZE /= 2; - tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4); - } while((MAXBLOCKSIZE > 1) && (tw == 0)); - } - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isFitLDS() tw = %lu\n", tw); - #endif - maxSize = (1+4+tw)*dtypeSize(dtype) + MAXBLOCKSIZE*dtypeSize(dtype)*4; - #ifdef DEBUG_TRSV_GEMV - printf("TRSV GEMV: isFitLDS() maxSize = %lu, ldsSize = %lu, Y = %lu\n", maxSize, ldsSize, dim[0].y); - #endif - return (maxSize < ldsSize); - } - - // - // The remaining kernels use "TriangleWidth" amount of local memory for storing the RHS. - // We will assume "dim[0].y" to be the "TriangleWidth" - // - MAXBLOCKSIZE = (dim[0].y)*(dim[0].y) > 256 ? 256 : dim[0].y*dim[0].y; - maxSize = (dim[0].y + MAXBLOCKSIZE)*dtypeSize(dtype); - return (maxSize < ldsSize); -} |