summary | refs | log | tree | commit | diff
path: root/external/clBLAS/src/library/blas/gens/ger_lds.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/ger_lds.cpp')
-rw-r--r-- external/clBLAS/src/library/blas/gens/ger_lds.cpp 414
1 files changed, 0 insertions, 414 deletions
diff --git a/external/clBLAS/src/library/blas/gens/ger_lds.cpp b/external/clBLAS/src/library/blas/gens/ger_lds.cpp
deleted file mode 100644
index f72d1975..00000000
--- a/external/clBLAS/src/library/blas/gens/ger_lds.cpp
+++ /dev/null
@@ -1,414 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-/*
- * ger generator
- */
-//#define DEBUG_GER
-
-#include <string.h>
-#include <stdio.h>
-#include <assert.h>
-#include <clblas_stddef.h>
-#include <clBLAS.h>
-#include <blas_mempat.h>
-#include <clkern.h>
-#include <clblas-internal.h>
-#include "blas_kgen.h"
-
-#include <kprintf.hpp>
-#include <ger.clT>
-#include <solution_seq.h>
-
-/* Helper with C linkage; its definition lives outside this file. */
-extern "C" unsigned int dtypeSize(DataType type);
-
-/*
- * Type-prefix characters indexed by DataType value. The entries are written
- * by initGerRegisterPattern() and read by generator().
- */
-static char Prefix[4];
-
-/* Forward declaration; needed because gerOps refers to it before the definition. */
-static int getDefaultDecomposition(
-    PGranularity *pgran,
-    SubproblemDim *subdims,
-    unsigned int subdimsNum,
-    void *pArgs);
-
-/*
- * Solver-flags callback for this pattern.
- *
- * Returns SF_WSPACE_1D unconditionally; with DEBUG_GER defined it also
- * prints a trace line.
- */
-static SolverFlags
-solverFlags(void)
-{
-#ifdef DEBUG_GER
-    printf("solverFlags callen......\n");
-#endif
-    return SF_WSPACE_1D;
-}
-
-/*
- * Forward declarations of the callbacks defined later in this file.
- * They are declared here so that the gerOps table can reference them.
- */
-
-static void calcNrThreads(
-    size_t threads[2],
-    const SubproblemDim *subdims,
-    const PGranularity *pgran,
-    const void *args,
-    const void *extra);
-
-static ssize_t generator(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra);
-
-static void assignKargs(KernelArg *args, const void *params, const void* );
-
-/* Registration entry point; exported with C linkage. */
-extern "C" void initGerRegisterPattern(MemoryPattern *mempat);
-
-static KernelExtraFlags selectVectorization(
-    void *kargs,
-    unsigned int vlen );
-
-static void setBuildOpts(
-    char * buildOptStr,
-    const void *kArgs);
-
-static bool isFitToLDS(
-    SubproblemDim *dim,
-    DataType dtype,
-    cl_ulong ldsSize,
-    const void *kernelArgs);
-
-/*
- * Callback table for this memory pattern; installed into MemoryPattern::sops
- * by initGerRegisterPattern(). Entries are positional, NULL means the
- * pattern does not provide that callback.
- */
-static SolverOps gerOps = {
-    generator,                  // kernel source generator
-    assignKargs,                // kernel argument setup
-    isFitToLDS,                 // LDS size check
-    NULL,                       // Prepare Translate Dims
-    NULL,                       // Inner Decomposition Axis
-    calcNrThreads,              // global work size computation
-    NULL,                       // Related to images
-    solverFlags,                // solver flags query
-    NULL,                       // not provided
-    getDefaultDecomposition,    // default decomposition
-    NULL,                       // not provided
-    setBuildOpts,               // extra build options
-    selectVectorization         // vectorization flag selection
-};
-
-/*
- * Pick the kernel extra flags that depend on the vector length.
- *
- * args - pointer to a CLBlasKargs structure
- * vlen - vector length the leading dimension of A is checked against
- *
- * Returns KEXTRA_NO_FLAGS when lda.matrix is a multiple of vlen and
- * KEXTRA_NO_COPY_VEC_A otherwise.
- */
-static KernelExtraFlags
-selectVectorization(
-    void *args,
-    unsigned int vlen )
-{
-    const CLBlasKargs *blasArgs = (const CLBlasKargs *)args;
-    const bool ldaIsMultiple = ((blasArgs->lda.matrix % vlen) == 0);
-
-    if (ldaIsMultiple) {
-        return KEXTRA_NO_FLAGS;
-    }
-    return KEXTRA_NO_COPY_VEC_A;
-}
-
-/*
- * Append build options required by the data type of the current step.
- *
- * buildOptStr - option string that is extended in place
- * args        - pointer to the SolutionStep whose kernel arguments are inspected
- *
- * Adds "-DDOUBLE_PRECISION" for TYPE_DOUBLE and TYPE_COMPLEX_DOUBLE and
- * leaves the string untouched for every other type.
- */
-static void
-setBuildOpts(
-    char * buildOptStr,
-    const void *args)
-{
-    const SolutionStep *solStep = (const SolutionStep *)args;
-    const CLBlasKargs *blasArgs = (const CLBlasKargs *)(&solStep->args);
-    const bool isDoubleType = (blasArgs->dtype == TYPE_DOUBLE) ||
-                              (blasArgs->dtype == TYPE_COMPLEX_DOUBLE);
-
-    if (!isDoubleType) {
-        return;
-    }
-
-    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
-
-#ifdef DEBUG_GER
-    printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
-#endif
-}
-
-/* Extra pattern description; MemoryPattern::extra is pointed at this object. */
-static CLBLASMpatExtra mpatExtra;
-
-/*
- * Fill in the memory pattern descriptor for this GER implementation.
- *
- * Sets the pattern name, the level layout, the callback table and the
- * extra descriptor, and populates the Prefix[] lookup table with one
- * character per data type.
- */
-extern "C"
-void initGerRegisterPattern(MemoryPattern *mempat)
-{
-    // Type-prefix lookup consumed by generator()
-    Prefix[TYPE_FLOAT]          = 'S';
-    Prefix[TYPE_DOUBLE]         = 'D';
-    Prefix[TYPE_COMPLEX_FLOAT]  = 'C';
-    Prefix[TYPE_COMPLEX_DOUBLE] = 'Z';
-
-    // CHECK THIS (original author's note on the memory-level settings)
-    mpatExtra.aMset = CLMEM_LEVEL_L2;
-    mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; // For "x" vector
-    mpatExtra.mobjA = CLMEM_BUFFER;
-    mpatExtra.mobjB = CLMEM_BUFFER;
-
-    mempat->name     = "Register accumulation based ger";
-    mempat->nrLevels = 2;
-    mempat->cuLevel  = 0;
-    mempat->thLevel  = 1;
-    mempat->sops     = &gerOps;
-    mempat->extra    = &mpatExtra;
-
-#ifdef DEBUG_GER
-    printf("initGerRegPattern called with mempat = 0x%p\n", mempat);
-    fflush(stdout);
-#endif
-}
-
-/*
- * Compute the global work size for the kernel launch.
- *
- * threads - output; threads[0] receives (number of blocks * work-group size),
- *           threads[1] is set to 1
- * subdims - subdims->y / subdims->x are used as block height / block width
- * pgran   - work-group size is taken as wgSize[0] * wgSize[1]
- * args    - pointer to CLBlasKargs (M and N are read)
- * _extra  - pointer to CLBLASKernExtra (flags and vecLenA are read)
- *
- * For column-major order the block height is scaled by the vector length,
- * for row-major order the block width is scaled instead.
- */
-static void
-calcNrThreads(
-    size_t threads[2],
-    const SubproblemDim *subdims,
-    const PGranularity *pgran,
-    const void *args,
-    const void *_extra)
-{
-    const CLBlasKargs *kargs = (const CLBlasKargs *)args;
-    const CLBLASKernExtra *extra = (const CLBLASKernExtra *)_extra;
-    // Widen before multiplying so the product is computed in size_t
-    const size_t BLOCKSIZE = (size_t)pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block
-    const unsigned int VEC_LEN = extra->vecLenA;
-    const size_t BH = subdims->y;
-    const size_t BW = subdims->x;
-
-    const clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
-
-    size_t nBlocksY; // number of blocks in Y dir ( Although we say 1D block to opencl )
-    size_t nBlocksX; // number of blocks in X dir
-
-    // Round up so that partially filled blocks at the edges are covered
-    if ( order == clblasColumnMajor )
-    {
-        nBlocksY = ( kargs->M + BH*VEC_LEN - 1 ) / (BH*VEC_LEN);
-        nBlocksX = ( kargs->N + BW - 1) / BW;
-    }
-    else
-    {
-        nBlocksY = ( kargs->M + BH - 1) / BH;
-        nBlocksX = ( kargs->N + BW*VEC_LEN - 1) / (BW*VEC_LEN);
-    }
-
-    const size_t blocks = nBlocksX * nBlocksY;
-    threads[0] = blocks * BLOCKSIZE;
-    threads[1] = 1;
-
-#ifdef DEBUG_GER
-    // size_t values are printed with SPREFIX and explicit casts so that the
-    // conversion specifiers always match the argument types
-    printf("calcNrThreads called from GER_Reg.cpp.. wgSize[0]: %u\twgSize[1]: %u\n", pgran->wgSize[0], pgran->wgSize[1]);
-    printf("subdim->y :%" SPREFIX "u\t subdim->x : %" SPREFIX "u\n", (size_t)subdims->y, (size_t)subdims->x);
-    printf("kargs-> M : %" SPREFIX "u, kargs-> N: %" SPREFIX "u, BH: %" SPREFIX "u, BW: %" SPREFIX "u\n",
-           (size_t)kargs->M, (size_t)kargs->N, BH, BW);
-    printf("blocks : %" SPREFIX "u\tglobalthreads[0] : %" SPREFIX "u\t VecLen :%u\n", blocks, threads[0], VEC_LEN);
-#endif
-}
-
-//
-// FIXME: Report correct return value - Needs change in KPRINTF
-//
-/*
- * Generate the kernel source for GER into buf.
- *
- * buf     - destination buffer; when NULL only the required size is returned
- * buflen  - size of buf (not consulted when writing; see FIXME above)
- * subdims - subdims->y / subdims->x are substituted for %BH_DEF / %BW_DEF
- * pgran   - unused
- * extra   - pointer to CLBLASKernExtra (flags, dtype and vecLenA are read)
- *
- * Returns 64 KiB as the buffer size in both the query and the generate
- * path, or -1 if the kernel template does not fit the local scratch buffer.
- * NOTE(review): -1 assumes callers treat negative values as failure - confirm.
- */
-static ssize_t
-generator(
-    char *buf,
-    size_t buflen,
-    const struct SubproblemDim *subdims,
-    const struct PGranularity *pgran,
-    void *extra)
-{
-    const size_t kernelBufSize = 64 * 1024 * sizeof(char);
-    char tempTemplate[32*1024];
-    // 32 bytes hold any 64-bit size_t in decimal plus the terminator;
-    // the previous 10-byte buffers could be overrun by sprintf
-    char bhStr[32], bwStr[32];
-
-    (void)pgran; // unused parameter
-
-    if ( buf == NULL ) // return buffer size
-    {
-        buflen = kernelBufSize;
-        return (ssize_t)buflen;
-    }
-
-    CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra;
-    const clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor;
-    const unsigned int VEC_LEN = extraFlags->vecLenA;
-
-#ifdef DEBUG_GER
-    printf("GER GENERATOR called.... with %s order, DataType %c & Vector-Length: %u\n",
-        ((order == clblasColumnMajor)? "ColumnMajor": "RowMajor"), Prefix[extraFlags->dtype], VEC_LEN );
-#endif
-
-    // Select the template for the requested order and make a private copy of it
-    const char *kernelTemplate = ( order == clblasColumnMajor )
-        ? (const char *)ger_C_kernel
-        : (const char *)ger_R_kernel;
-
-    // Refuse to overrun the scratch buffer instead of smashing the stack
-    if ( strlen(kernelTemplate) >= sizeof(tempTemplate) )
-    {
-        return -1;
-    }
-    strcpy( tempTemplate, kernelTemplate );
-
-    // FIXME: VECTORSIZE HARD CODED
-    // FIXME: SetKernelArgs.. sends offa, offx, and lda should be received as uint
-
-    const bool doVLOAD = ( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) != 0;
-#ifdef DEBUG_GER
-    if ( doVLOAD )
-    {
-        printf("DOing VLOAD as Aligned Data Pointer not Availabe\n");
-    }
-    else
-    {
-        printf("Using Aligned Data Pointer .........................\n");
-    }
-#endif
-
-    kprintf kobj( Prefix[extraFlags->dtype], VEC_LEN, doVLOAD, doVLOAD);
-
-    const size_t BH = subdims->y;
-    const size_t BW = subdims->x;
-    sprintf( bhStr, "%" SPREFIX "u", BH );
-    sprintf( bwStr, "%" SPREFIX "u", BW );
-
-#ifdef DEBUG_GER
-    printf("BH = %s\n", bhStr);
-    printf("BW = %s\n", bwStr);
-#endif
-
-    kobj.put("%BH_DEF", (const char *)bhStr);
-    kobj.put("%BW_DEF", (const char *)bwStr);
-    kobj.spit((char*)buf, tempTemplate);
-
-    return (ssize_t)kernelBufSize;
-    // return 0;//(ret < 0) ? -EOVERFLOW : ret;
-}
-
-/*
- ( __global const %TYPE* X, __global const %TYPE* Y, __global %TYPE* A,
- uint M, uint N, uint offx, int incx, uint offy, int incy, uint offa, uint lda,
- %TYPE alpha, int doConj )
-*/
-
-/*
- * Fill the kernel argument array from the BLAS arguments.
- *
- * args   - output array; entries 0..12 are written
- * params - pointer to CLBlasKargs
- *
- * Argument layout: 0 = B (X vector), 1 = C (Y vector), 2 = A, 3 = M, 4 = N,
- * 5 = offBX, 6 = incx, 7 = offCY, 8 = incy, 9 = offa, 10 = lda,
- * 11 = alpha, 12 = doConj (taken from the K field).
- */
-static void
-assignKargs(KernelArg *args, const void *params, const void*)
-{
-    CLBlasKargs *kargs = (CLBlasKargs*)params;
-
-    // Increments and the conjugate switch are passed to the kernel as cl_int
-    cl_int xInc = kargs->ldb.vector;
-    cl_int yInc = kargs->ldc.vector;
-    cl_int conjFlag = (cl_int)(kargs->K); // K was used as doConj
-
-    // Buffers
-    INIT_KARG(&args[0], kargs->B); // B - our X vector
-    INIT_KARG(&args[1], kargs->C); // C - our Y vector
-    INIT_KARG(&args[2], kargs->A); // A - matrix A
-
-    // Problem size
-    initSizeKarg(&args[3], kargs->M);
-    initSizeKarg(&args[4], kargs->N);
-
-    // Offsets and increments of the two vectors
-    initSizeKarg(&args[5], kargs->offBX);
-    INIT_KARG(&args[6], xInc);
-    initSizeKarg(&args[7], kargs->offCY);
-    INIT_KARG(&args[8], yInc);
-
-    // Offset and leading dimension of the matrix
-    initSizeKarg(&args[9], kargs->offa);
-    initSizeKarg(&args[10], kargs->lda.matrix);
-
-    // Scalars
-    assignScalarKarg(&args[11], &(kargs->alpha), kargs->dtype);
-    INIT_KARG(&args[12], conjFlag);
-
-#ifdef DEBUG_GER
-    printf("doConj = %d\n", conjFlag );
-#endif
-}
-
-/*
- * Check whether the per-block data fits into the given LDS size.
- *
- * dim        - dim[0].x and dim[0].y are the block dimensions
- * dtype      - element data type
- * ldsSize    - LDS size to check against
- * kernelArgs - pointer to the CLBlasKargs embedded in a SolutionStep
- *
- * The element count is x + y * vecLen for column-major order and
- * x * vecLen + y otherwise. It is multiplied by dtypeSize(dtype); the
- * previous sizeof(dtype) measured the enum variable itself rather than one
- * element of that type, so it did not vary with the data type.
- *
- * Returns true when the required size does not exceed ldsSize.
- */
-static bool
-isFitToLDS(
-    SubproblemDim *dim,
-    DataType dtype,
-    cl_ulong ldsSize,
-    const void *kernelArgs)
-{
-    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
-    // The kernel arguments are a member of SolutionStep; recover the owner
-    SolutionStep *step = container_of(kargs, args, SolutionStep);
-    const CLBLASKernExtra *kextra =
-        (const CLBLASKernExtra*)(step->kernels[CLBLAS_COMPUTING_KERNEL]->extra);
-    const unsigned int vecLen = kextra->vecLenA;
-
-    cl_ulong nElems;
-    if( kargs->order == clblasColumnMajor ) {
-        nElems = dim[0].x + (dim[0].y * vecLen);
-    } else {
-        nElems = (dim[0].x * vecLen) + dim[0].y;
-    }
-
-    const cl_ulong maxSize = nElems * dtypeSize(dtype);
-    return ( maxSize <= ldsSize );
-}
-
-/*
- * Fill in the default work-group granularity and subproblem dimensions.
- *
- * pgran      - output; must be the pgran member of a SolutionStep, which is
- *              recovered from it to reach the device id and the matrix order
- * subdims    - output array of subproblem dimensions
- * subdimsNum - number of entries in subdims; only that many are written
- * pArgs      - unused
- *
- * One block side is fixed at 16, the other is the device's maximum
- * work-group size divided by 16, capped at 16. Which side is fixed depends
- * on the matrix order. The work-group is reported as one-dimensional.
- *
- * Returns 0 on success and -1 when the device query fails.
- * NOTE(review): -1 assumes callers treat any non-zero value as failure - confirm.
- */
-static int
-getDefaultDecomposition(
-    PGranularity *pgran,
-    SubproblemDim *subdims,
-    unsigned int subdimsNum,
-    void *pArgs)
-{
-    SolutionStep *step = container_of( pgran , pgran, SolutionStep);
-    size_t maxWorkGroupSize = 0;
-    size_t wgX, wgY;
-
-    (void)pArgs; // unused parameter
-
-    // Do not continue with an indeterminate size if the query fails
-    if ( clGetDeviceInfo(step->device.id, CL_DEVICE_MAX_WORK_GROUP_SIZE,
-                         sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL) != CL_SUCCESS )
-    {
-        return -1;
-    }
-
-    if( step->args.order == clblasColumnMajor )
-    {
-        wgY = 16;                       // BH preferably 16(quarter wave-front)
-        wgX = maxWorkGroupSize / wgY;   // BW is left upto maxWorkGroupSize of the device
-        wgX = szmin( wgX, 16 );
-    }
-    else
-    {
-        wgX = 16;
-        wgY = maxWorkGroupSize / wgX;
-        wgY = szmin( wgY, 16 );
-    }
-
-    pgran->wgDim = 1; //1D blocking
-    pgran->wgSize[0] = (unsigned int)(wgX * wgY);
-    pgran->wgSize[1] = 1;
-
-    // Touch subdims[] only for the entries the caller actually provided
-    if(subdimsNum > 0)
-    {
-        subdims[0].x = wgX;
-        subdims[0].y = wgY;
-        subdims[0].itemX = subdims[0].x;
-        subdims[0].itemY = subdims[0].y;
-        subdims[0].bwidth = 1;
-    }
-    if(subdimsNum > 1)
-    {
-        subdims[1].itemY = 1;
-        subdims[1].itemX = 1;
-        subdims[1].y = subdims[1].itemY;
-        subdims[1].x = subdims[1].itemX;
-        subdims[1].bwidth = 1;
-    }
-
-    return 0;
-}