diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/blas_subgroup.c')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/blas_subgroup.c | 528 |
1 files changed, 0 insertions, 528 deletions
diff --git a/external/clBLAS/src/library/blas/gens/blas_subgroup.c b/external/clBLAS/src/library/blas/gens/blas_subgroup.c deleted file mode 100644 index 9c87d53d..00000000 --- a/external/clBLAS/src/library/blas/gens/blas_subgroup.c +++ /dev/null @@ -1,528 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - -#include "blas_subgroup.h" -#include <stdio.h> -#include <clblas_stddef.h> - -#include <matrix_props.h> -#include <matrix_dims.h> -#include <dis_warning.h> - -#include "blas_kgen.h" -#include "gen_helper.h" -#include "tile_iter.h" -#include "kerngen.h" - -static int -calcMergeStepSubgrN( - const BlasGenSettings* pGSet, - DataType dtype); - -static int declareSubgrLDS( - struct KgenContext* pCtx, - const BlasGenSettings* pGSet, - DataType dtype); - -//----------------------------------------------------------------------------- -// calculates best number of subgroups to be engaged in each merge step -// simultaneously -// Calculation is based on the register usage estimation -// in order not to limit -// the number of workgroups scheduled on the SIMD engine -static int -calcMergeStepSubgrN( - const BlasGenSettings* pGSet, - DataType dtype) -{ - // hardware-specific options - const int deviceLDS = 32768; - const unsigned int gprsPerUnit = 240; - - int vecLenA = 0; - int vecLenB = 0; - int vecLenC = 0; - - int vecNumA = 0; - int vecNumB = 0; - int vecNumC = 0; - - int subgPerStep = 0; - int bestLDS = 0; - int gprsUsed = 0; - int subgNum = 0; - - int itemsPerSubgroup = 0; - - if( NULL == pGSet || NULL == pGSet->pgran ){ - return -EINVAL; - } - - itemsPerSubgroup = pGSet->subdims[0].bwidth/ - pGSet->subdims[1].bwidth; - - subgNum = (pGSet->subdims[0].x/pGSet->subdims[1].x)* - (pGSet->subdims[0].y/pGSet->subdims[1].y); - - vecLenA = pGSet->tileA.vecLen; - vecLenB = pGSet->tileBX.vecLen; - vecLenC = pGSet->tileCY.vecLen; - - vecNumA = tileVectorsNum( &pGSet->tileA ); - vecNumB = tileVectorsNum( &pGSet->tileBX ); - vecNumC = tileVectorsNum( &pGSet->tileCY ); - - // registers hold 4-vectors of 32-bit floats or 2-vectors of doubles - switch(dtype){ - - case TYPE_FLOAT: - - // each register holds 4 4-byte float values - // 10 registers are used address, etc - gprsUsed = vecNumA * (vecLenA/4) + - vecNumB * (vecLenB/4) + - vecNumC * (vecLenC/4) + 10; - - bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); - - subgPerStep = bestLDS/(itemsPerSubgroup * - vecNumC * - vecLenC * 4 );//4-byte floats - break; - - case TYPE_DOUBLE: - - // each register can hold 2 double values - // 10 registers are used for address, etc - gprsUsed = vecNumA * (vecLenA/2) + - vecNumB * (vecLenB/2) + - vecNumC * (vecLenC/2) + 10; - - bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); - - subgPerStep = bestLDS/(itemsPerSubgroup * - vecNumC * - vecLenC * 8 );//8-byte doubles - break; - - case TYPE_COMPLEX_FLOAT: - - // each register holds 2 4-byte float-based complex values - // 10 registers are used address, etc - gprsUsed = vecNumA * (vecLenA/2) + - vecNumB * (vecLenB/2) + - vecNumC * (vecLenC/2) + 10; - - bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); - - subgPerStep = bestLDS/(itemsPerSubgroup * - vecNumC * - vecLenC * 8 );//2x4-byte floats - break; - - case TYPE_COMPLEX_DOUBLE: - - // each register can hold 1 double-based complex value - // 10 registers are used for address, etc - gprsUsed = vecNumA * (vecLenA) + - vecNumB * (vecLenB) + - vecNumC * (vecLenC) + 10; - - bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); - - subgPerStep = bestLDS/(itemsPerSubgroup * - vecNumC * - vecLenC * 16 );//2x8-byte double - break; - - } - - if( 0==subgPerStep ){ - subgPerStep = 1; - } - - // do not exceed physical number of subgroups in workgroup - if( subgPerStep > subgNum ){ - subgPerStep = subgNum; - } - - return subgPerStep; -} - -//----------------------------------------------------------------------------- -// Add LDS array declaration(based on C matrix parameters) to the context -// each row of C Matrix block may be splitted into separate vectors - -static int declareSubgrLDS( - struct KgenContext* pCtx, - const BlasGenSettings* pGSet, - DataType dtype) -{ - int vecLenC = 0; - int vecNumC = 0; - const char* typeName; - const KernelVarNames *vnames = NULL; - char tmp[512]; - int itemsPerSubgroup = 0; - int subgrPerStep = 0; - - if( NULL == pCtx || NULL == pGSet ){ - return -EINVAL; - } - - itemsPerSubgroup = pGSet->subdims[0].bwidth / pGSet->subdims[1].bwidth; - subgrPerStep = calcMergeStepSubgrN(pGSet, dtype); - - vecLenC = pGSet->tileCY.vecLen; - vecNumC = tileVectorsNum( &pGSet->tileCY ); - typeName = dtypeBuiltinType(dtype); - vnames = &pGSet->varNames; - - switch(dtype){ - - case TYPE_FLOAT: - case TYPE_DOUBLE: - - if( vecLenC > 1){ - sprintf( - tmp, - "__local %s%d a%s[%d*%d*%d];\n" - "__local %s%d *%s = a%s;\n", - typeName, - vecLenC, - vnames->LDS, - itemsPerSubgroup, - subgrPerStep, - vecNumC, - typeName, - vecLenC, - vnames->LDS, - vnames->LDS); - } - else{ - sprintf( - tmp, - "__local %s a%s[%d*%d*%d];\n" - "__local %s *%s = a%s;\n", - typeName, - vnames->LDS, - itemsPerSubgroup, - subgrPerStep, - vecNumC, - typeName, - vnames->LDS, - vnames->LDS); - } - - break; - - case TYPE_COMPLEX_FLOAT: - - sprintf( - tmp, - "__local float%d a%s[%d*%d*%d];\n" - "__local float%d *%s = a%s;\n", - vecLenC*2, - vnames->LDS, - itemsPerSubgroup, - subgrPerStep, - vecNumC, - vecLenC*2, - vnames->LDS, - vnames->LDS); - - break; - - case TYPE_COMPLEX_DOUBLE: - - sprintf( - tmp, - "__local double%d a%s[%d*%d*%d];\n" - "__local double%d *%s = a%s;\n", - vecLenC*2, - vnames->LDS, - itemsPerSubgroup, - subgrPerStep, - vecNumC, - vecLenC*2, - vnames->LDS, - vnames->LDS); - - break; - - } - - kgenAddStmt( pCtx, tmp ); - - return 0; -} - -//----------------------------------------------------------------------------- - -int -mergeUpdateResult( struct KgenContext* pCtx, - BlasFunctionID funcID, - struct BlasGenSettings* pGSet, - struct SubgVarNames* pSubgVNames, - UpdateResultFlags upResFlags, - UpresProcPtr upresProcPtr ) -{ - char tmp[2048]; - int subgN = 0; - int subgItems = 0; - int aBlkH = 0; - DataType dtype; - Tile tileC; - Tile tileScratch; - KernelVarNames* pVNames; - unsigned int vecLenC; - unsigned int vecNumC; - - int subgPerStep = 0; - - if( NULL == pCtx || NULL == pGSet ){ - return -EINVAL; - } - - dtype = pGSet->kextra->dtype; - subgN = ( pGSet->subdims[0].x/pGSet->subdims[1].x ) * - ( pGSet->subdims[0].y/pGSet->subdims[1].y ); - - subgItems = pGSet->subdims[0].bwidth/ - pGSet->subdims[1].bwidth; - - aBlkH = pGSet->subdims[1].y; - pVNames = &pGSet->varNames; - - // calculate best number of subgroups to be engaged in each merge step - subgPerStep = calcMergeStepSubgrN( pGSet, dtype ); - - vecLenC = pGSet->tileCY.vecLen; - vecNumC = tileVectorsNum( &pGSet->tileCY ); - - kgenAddStmt(pCtx,"//-----MergeUpdateResult\n"); - kgenAddBlankLine(pCtx); - - // declare local data storage array - kgenAddStmt( pCtx, "// veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC]\n"); - declareSubgrLDS( pCtx, - pGSet, - dtype); - - kgenAddBlankLine( pCtx ); - - kgenAddStmt(pCtx, - "//LDS block has the same vectorization as C matrix block\n"); - kgenAddStmt( - pCtx, - "//VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS" - " +get_local_id(0) );\n"); - - sprintf(tmp, - "scratch += " - "%d*(" - "(%s.y%%%d)*%d +" - "%s.x );\n", - vecNumC, - pSubgVNames->itemId, - subgPerStep, - subgItems, - pSubgVNames->itemId ); - kgenAddStmt(pCtx, tmp); - - - sprintf( - tmp, - "\nfor( uint mstep = 0; mstep < %d; mstep += %d )", - subgN, - subgPerStep); - kgenBeginBranch(pCtx,tmp); - kgenAddBlankLine(pCtx); - - sprintf( - tmp, - "if( (%s.y >= mstep)&&(%s.y < (mstep+%d)) )", - pSubgVNames->itemId, - pSubgVNames->itemId, - subgPerStep); - kgenBeginBranch(pCtx,tmp); - - // the LDS block size is similar to C matrix block size - kgenAddBlankLine(pCtx); - initTile(&tileC, - "c", - (unsigned int)pGSet->subdims[1].y, - (unsigned int)pGSet->subdims[1].x, - vecLenC, - dtype, - pGSet->tileCY.storType, - pGSet->tileCY.trans, - pGSet->tileCY.packed); - - initTile(&tileScratch, - "scratch", - (unsigned int)pGSet->subdims[1].y, - (unsigned int)pGSet->subdims[1].x, - vecLenC, - dtype, - PRIV_STORAGE_ARRAY, - pGSet->tileCY.trans, - pGSet->tileCY.packed); - - genTileCopy(pCtx, - &tileScratch, - &tileC, - TILECOPY_ASSIGN); - - genZeroTile(pCtx, - &tileC); - - // split merge if - kgenEndBranch( pCtx, NULL ); // merge step if - kgenAddBlankLine( pCtx ); - - //splitting if on two, to prevent barrier issue - kgenAddBarrier( pCtx, CLK_LOCAL_MEM_FENCE ); - kgenAddBlankLine( pCtx ); - //---------------------------------------------- - - sprintf( tmp, - "if( (%s.y >= mstep)&&(%s.y < (mstep+%d)) )", - pSubgVNames->itemId, - pSubgVNames->itemId, - subgPerStep); - kgenBeginBranch(pCtx,tmp); - - sprintf( tmp, - "if ( 0 == %s.x )", - pSubgVNames->itemId ); - kgenBeginBranch( pCtx, tmp ); - - kgenAddBlankLine(pCtx); - - // Zero element of each subgroup also performs LDS merge - sprintf( - tmp, - "for(uint k = 0; k < %d * %d; k += %d)", - subgItems, - aBlkH, - aBlkH); - - kgenBeginBranch(pCtx, tmp); - kgenAddBlankLine(pCtx); - - genTileCopy(pCtx, - &tileC, - &tileScratch, - TILECOPY_ADD_ASSIGN ); - kgenAddStmt(pCtx, - "//Adding the LDS block size in vectors\n"); - sprintf(tmp, - "%s += %d;", - pVNames->LDS, - vecNumC); - kgenAddStmt(pCtx, tmp); - kgenAddBlankLine(pCtx); - - kgenEndBranch( pCtx, NULL ); // merge for() - kgenAddBlankLine( pCtx ); - - // Write into global memory ------------------------------- - if ( NULL != upresProcPtr ) { - - (*upresProcPtr)( pCtx, - funcID, - pGSet, - upResFlags /*| UPRES_INDEXING_WITH_CONSTANTS*/, - NULL, - NULL, - NULL ); - } - - kgenAddBlankLine(pCtx); - - kgenEndBranch(pCtx, NULL); // merge and global write if - kgenEndBranch(pCtx, NULL); // LDS write if - - kgenAddBarrier(pCtx, CLK_LOCAL_MEM_FENCE); - //LDS write for - kgenEndBranch(pCtx, NULL); - - - return 0; -} - -//----------------------------------------------------------------------------- - -int -subgGetDefaultDecomp( - PGranularity *pgran, - SubproblemDim *subdims, - void* pArgs ) -{ - int itemsPerSubg = 8; - int subgA = 4; - int subgB = 2; - - int bw1 = 8; - int x1 = 4; - int y1 = 4; - CLBlasKargs *kargs; - - if ( NULL == pArgs ) { - return -EINVAL; - } - - kargs = (CLBlasKargs *)pArgs; - - if( isComplexType(kargs->dtype) ){ - bw1 /= 2; - } - if( isDoubleBasedType(kargs->dtype) ){ - bw1 /= 2; - } - - subdims[1].bwidth = bw1; - subdims[1].x = subdims[1].itemX = x1; - subdims[1].y = subdims[1].itemY = y1; - - subdims[0].bwidth = bw1 * itemsPerSubg; - subdims[0].itemX = x1 * subgB; - subdims[0].x = x1*subgB; - - subdims[0].itemY = y1*subgA; - subdims[0].y = y1*subgA; - - switch ( pgran->wgDim ) { - - case 1: - pgran->wgSize[0] = 64; - pgran->wgSize[1] = 1; - break; - - case 2: - pgran->wgSize[0] = itemsPerSubg; - pgran->wgSize[1] = 64/itemsPerSubg; - break; - - default: - pgran->wgSize[0] = 64; - pgran->wgSize[1] = 1; - break; - } - - return 0; -}
\ No newline at end of file |