diff options
Diffstat (limited to 'external/clBLAS/src/library/tools/tune/tune.c')
-rw-r--r-- | external/clBLAS/src/library/tools/tune/tune.c | 2712 |
1 files changed, 0 insertions, 2712 deletions
diff --git a/external/clBLAS/src/library/tools/tune/tune.c b/external/clBLAS/src/library/tools/tune/tune.c deleted file mode 100644 index b6174c4e..00000000 --- a/external/clBLAS/src/library/tools/tune/tune.c +++ /dev/null @@ -1,2712 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - - -#include <math.h> -#include <string.h> -#include <stdlib.h> -#include <stdio.h> -#include <assert.h> -#ifdef __APPLE__ -#include <OpenCL/cl.h> -#else -#include <CL/cl.h> -#endif - -// #include "fileio.h" -#include "toolslib.h" -#include "tune.h" -#include "devinfo.h" -#include "assert.h" -#include "solution_seq.h" -#include "matrix_dims.h" - -#include "subdim.h" - -#if defined(_MSC_VER) -#include "Windows.h" -#elif defined(__APPLE__) -#include <stdint.h> -#include <mach/mach.h> -#include <mach/mach_time.h> -#else -#include "time.h" -#endif - -#define EXIT_COD_OK 0x0000 -#define EXIT_COD_CL_ERROR 0x0100 -#define EXIT_COD_UNKNOWN_DATATYPE 0x0101 -#define EXIT_COD_NO_DATA 0x0102 -#define EXIT_COD_NO_ENVIRONMENT_VARIABLE 0x0103 -#define EXIT_COD_BAD_ENVIRONMENT_VARIABLE 0x0104 - -#define TYPE_NUMBER 4 -#define MAX_RUN_KERNEL 3 - -typedef int KMASK; - -////////////////////////////////////////////////////////////////// -#if defined(_MSC_VER) - -typedef unsigned long long nano_time_t; -#define NANOTIME_MAX (~0ULL - 1) - -#define fmin min -#define fmax max - -nano_time_t -conv2nanosec(nano_time_t t) -{ - LARGE_INTEGER count; - - if (QueryPerformanceFrequency(&count) == FALSE) { - return 0; - } - t = (t * 1000000)/count.QuadPart; - - return (nano_time_t)(t * 1000); -} - -nano_time_t -getCurrentTime(void) -{ - LARGE_INTEGER count; - - if (QueryPerformanceCounter(&count) == FALSE) { - return 0; - } - return (nano_time_t)count.QuadPart; -} - -#elif defined(__APPLE__) - -typedef uint64_t nano_time_t; -#define NANOTIME_MAX UINT64_MAX - -nano_time_t -conv2nanosec(nano_time_t t) -{ - static mach_timebase_info_data_t timebase_info = {0}; - - if (timebase_info.denom == 0) - { - (void)mach_timebase_info(&timebase_info); - } - - /* Let's hope we don't overflow */ - return (t * timebase_info.denom) / timebase_info.numer; -} - -nano_time_t -getCurrentTime(void) -{ - return mach_absolute_time(); -} - -#else - -typedef unsigned long nano_time_t; -#define NANOTIME_MAX (~0UL - 1) - -nano_time_t -conv2nanosec(nano_time_t t) -{ - /* clock_... functions measure time in nanoseconds */ - return t; -} - -nano_time_t -getCurrentTime(void) -{ - int err; - struct timespec t; - - err = clock_gettime(CLOCK_REALTIME, &t); - if (err == 0) { - return (t.tv_sec * 1000000000UL + t.tv_nsec); - } - return 0; -} - -#endif /* defined(_MCS_VER) */ -////////////////////////////////////////////////////////////////// - -cl_int -waitForSuccessfulFinish( - cl_command_queue commandQueues, - cl_event *event) -{ - cl_int err, status; - - err = clFinish(commandQueues); - if (err != CL_SUCCESS) { - return err; - } - - if (event == NULL || *event == NULL) { - return CL_SUCCESS; - } - - status = CL_COMPLETE; - err = clGetEventInfo(*event, CL_EVENT_COMMAND_EXECUTION_STATUS, - sizeof(status), &status, NULL); - if (err != CL_SUCCESS) { - return err; - } - if (status < 0) { - return -status; - } - return CL_SUCCESS; -} - -cl_int -flushAll(cl_command_queue commandQueue) -{ - cl_int err; - - err = clFlush(commandQueue); - if (err != CL_SUCCESS) { - return err; - } - return CL_SUCCESS; -} - - -enum { - MASK_KERNEL_COMP = 0x01, - MASK_KERNEL_A = 0x02, - MASK_KERNEL_B = 0x04 -}; - -const char *FILE_PATH = NULL; - -FILE* logStream; - -int globalDim = 0; -enum { - DEVNAME_MAXLEN = 64 -}; - -#ifdef TEST_LOG -#include <time.h> -typedef unsigned long nano_time_t; -nano_time_t -getCurrentTime(void) -{ - int err; - struct timespec t; - - - err = clock_gettime(CLOCK_REALTIME, &t); - if (err == 0) { - return (t.tv_sec * 1000000000UL + t.tv_nsec); - } - return 0; -} -double globalTime = 0; -double globalFastTime = 0; - -#endif - -extern int getDataTypeSize(DataType dataType); -extern void writeStorageCache(TargetDevice* devID); -extern BlasFunctionInfo* getBlasFunctionInfo(TargetDevice* devID, int func); -extern void checkFILE(TargetDevice* devID, BlasFunctionInfo* fiArr); -extern char* getDevName(TargetDevice* tdev); - -const unsigned int uiNONE = (unsigned int)-1; - -// float types based unified pointer -typedef union FPtr { - void *v; - cl_float *f; - cl_double *d; - cl_float2 *f2; - cl_double2 *d2; -} FPtr; - - - -typedef struct GParam { - - int count; - char name[65]; - SubproblemDim dims[MAX_SUBDIMS]; - PGranularity pgran; - unsigned int vecLen; - - cl_ulong time; - // For each kernel the binaries are created - Kernel* kernel; - //cl_kernel clkern; - size_t binary_sizes; - char* binaries; - - Kernel *kernelPrepA; - Kernel *kernelPrepB; - - size_t binary_sizesA; - char* binariesA; - - size_t binary_sizesB; - char* binariesB; - -// POSFILE fbin[MAX_CLBLAS_KERNELS_PER_STEP]; -} GParam; - -typedef struct MatrixInfo { - DataType dtype; - unsigned int sizeDType; - - unsigned int M; - unsigned int N; - unsigned int K; - - cl_mem clA; - cl_mem clB; - cl_mem clC; - - FPtr A; - FPtr B; - FPtr C; - - cl_mem clImgA; - cl_mem clImgB; - void *imgA; - void *imgB; -}MatrixInfo; - -enum Command { - C_DEFAULT, - C_REBUILD, - C_GENKERNEL, - C_ADD, -}; - -struct GeneratorInfoRec { - - cl_platform_id platform; // ID of platform - cl_device_type devType; - cl_context ctx; - cl_command_queue queue; - // - unsigned int numDevices; // number of Devices - - TargetDevice targetDevice; // - DeviceInfo deviceInfos; // Todo delete this member. Use TargetDevice. - char *deviceName; // - - bool aFunc[BLAS_FUNCTIONS_NUMBER]; // True/false value if the corresponding function should be tuned - int aPattern; - bool aDType[TYPE_NUMBER]; // True false value if the precision should be tuned; s/d/c/z - int aFlag; - int aCommand; - bool aIsKernel; // True/false value to store binary kernels into the kernel database - int aMaxparam; - bool aExtendedOutput; - bool aAll; - - double next; - double last; - const char* patternName; -} genInfo; - -char * -genParamStr(char* name, int w, size_t data) -{ - char format[5]; - - sprintf(format,"%%%uu ", w); - if (data != (size_t)-1) { - char format[5]; - - sprintf(format,"%%%uu ", w); - sprintf(name, format, (unsigned)data); - } - else { - char format[5]; - - sprintf(format,"%%%us ", w); - sprintf(name, format, "SU"); - } - return name + w + 1; -} - -char * -genParamsStr(SubproblemDim* dim, char* name, int w) -{ - char* n = name; - - n = genParamStr(n, w, dim->x); - n = genParamStr(n, w, dim->y); - n = genParamStr(n, w, dim->bwidth); - sprintf(n,":"); - return n + 1; -} - -void -createGParamName(GParam* param) -{ - char* n = param->name; - - if (param->dims[2].itemX > 0) { - n = genParamsStr(¶m->dims[0], n, 3); - n = genParamsStr(¶m->dims[1], n, 3); - n = genParamsStr(¶m->dims[2], n, 2); - } - else { - n = genParamsStr(¶m->dims[0], n, 3); - n = genParamsStr(¶m->dims[1], n, 2); - } - - sprintf(n,"%3dx%-2d", param->pgran.wgSize[0], - param->pgran.wgSize[1]); -} - -static int -patternUseImages(MemoryPattern *pattern) -{ - const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra; - - if (extra == NULL) { - return 0; - } - if ((extra->mobjA == CLMEM_IMAGE) || - (extra->mobjB == CLMEM_IMAGE) ) { - return 1; - } - return 0; -} - - -void -initGeneratorInfoRec(void) -{ - int i; - - memset(&genInfo, 0, sizeof(struct GeneratorInfoRec)); - genInfo.devType = CL_DEVICE_TYPE_GPU; - - genInfo.aCommand = C_DEFAULT; - for (i=0; i < TYPE_NUMBER; ++i) { - genInfo.aDType[i] = false; - } - genInfo.aFlag = -1; - for (i=0; i < BLAS_FUNCTIONS_NUMBER; ++i) { - genInfo.aFunc[i] = false; - } - genInfo.aPattern = -1; - genInfo.aIsKernel = false; - genInfo.aMaxparam = 5000; - - genInfo.aExtendedOutput = false; -} - -void -destroyKernels(GParam *param) -{ - if (param->kernel != NULL) { - putKernel(NULL, param->kernel); - param->kernel = NULL; - } - if (param->kernelPrepA != NULL) { - putKernel(NULL, param->kernelPrepA); - param->kernelPrepA = NULL; - } - if (param->kernelPrepB != NULL) { - putKernel(NULL, param->kernelPrepB); - param->kernelPrepB = NULL; - } -} - -void -destroyGenInfo(void) -{ - free (genInfo.deviceName); - genInfo.deviceName = NULL; - clReleaseCommandQueue(genInfo.queue); - clReleaseContext(genInfo.ctx); -// destroyData(genInfo.functionInfo); -} - -void -checkErrorFunc(char* funcName, cl_int status) -{ - if (status != CL_SUCCESS) { - char * ret = "UNKNOWN"; - - switch (status) { - case CL_OUT_OF_RESOURCES: - ret = "CL_OUT_OF_RESOURCES"; // -5 - break; - case CL_BUILD_PROGRAM_FAILURE: // -11 - ret = "CL_BUILD_PROGRAM_FAILURE"; - break; - case CL_INVALID_VALUE: // - 30 - ret = "CL_INVALID_VALUE"; - break; - case CL_INVALID_KERNEL_ARGS: // - 52 - ret = "CL_INVALID_KERNEL_ARGS"; - break; - case CL_INVALID_WORK_GROUP_SIZE: // - 54 - ret = "CL_INVALID_WORK_GROUP_SIZE"; - break; - case CL_INVALID_WORK_ITEM_SIZE: // - 55 - ret = "CL_INVALID_WORK_ITEM_SIZE"; - break; - case CL_INVALID_BUFFER_SIZE: // - 61 - ret = "CL_INVALID_BUFFER_SIZE"; - break; - - } - - fprintf(logStream, "%s() failed with %d(%s)\n", funcName, status, ret); - fflush(logStream); - destroyGenInfo(); - exit(EXIT_COD_CL_ERROR); - } -} - -void -initOpenCl(void) -{ - cl_int status = 0; - cl_uint numPlatforms; - status = clGetPlatformIDs(0, NULL, &numPlatforms); - checkErrorFunc("clGetPlatformIDs", status); - - if (numPlatforms > 0) { - unsigned int i; - cl_platform_id* platforms = - (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); - - status = clGetPlatformIDs(numPlatforms, platforms, NULL); - checkErrorFunc("clGetPlatformIDs", status); - - for(i=0; i < numPlatforms; ++i) { - char pbuff[100]; - status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, - sizeof(pbuff), pbuff, NULL); - - checkErrorFunc("clGetPlatformInfo", status); - genInfo.platform = platforms[i]; - if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { - break; - } - } - free(platforms); - } - - // Init Device count - status = clGetDeviceIDs(genInfo.platform, genInfo.devType, 0, 0, - (cl_uint*)&genInfo.numDevices); - checkErrorFunc("clGetDeviceIDs", status); -} - -void -initDevice(int dev) -{ - cl_int status = 0; - cl_uint num_devices; - cl_device_id* deviceIDs = - (cl_device_id *)calloc(genInfo.numDevices, sizeof(cl_device_id)); - - status = clGetDeviceIDs(genInfo.platform, genInfo.devType, - genInfo.numDevices, deviceIDs, &num_devices); - checkErrorFunc("clGetDeviceIDs", status); - - genInfo.targetDevice.id = deviceIDs[dev]; - identifyDevice(&genInfo.targetDevice); - genInfo.deviceName = getDevName(&genInfo.targetDevice); - initCLDeviceInfoRec(&genInfo.targetDevice, &genInfo.deviceInfos); -} - -void -getContext(void) -{ - cl_int status = 0; - cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; - cl_device_id device = genInfo.targetDevice.id; - - props[1] = (cl_context_properties)genInfo.platform; - - genInfo.ctx = clCreateContext(props, 1, &device, - NULL, NULL, &status); - checkErrorFunc("clCreateContext", status); - - genInfo.queue = clCreateCommandQueue(genInfo.ctx, - device, - CL_QUEUE_PROFILING_ENABLE, - &status); - checkErrorFunc("clCreateCommandQueue",status); -} - -int -bitcount (unsigned int n) { - int count = 1 ; - - while (n) { - count *= 2; - n &= (n - 1) ; - } - return count ; -} - -bool -genKernel(GParam *param, CLBLASKernExtra* extra, MemoryPattern *pattern) -{ - cl_int status; - SolverKgen genKernel; - bool ret = false; - cl_device_id device; - char bopts[BUILD_OPTS_MAXLEN]; - - genKernel = pattern->sops->genKernel; - device = genInfo.targetDevice.id; - - setupBuildOpts(bopts, device, pattern); - param->kernel = makeKernel(device, genInfo.ctx, genKernel, - param->dims, ¶m->pgran, extra, bopts, NULL); - if (param->kernel != NULL) { - status = clGetProgramInfo(param->kernel->program, CL_PROGRAM_BINARY_SIZES, - sizeof(size_t), ¶m->binary_sizes, NULL); - - checkErrorFunc("clGetProgramInfo", status); - param->binaries = (char *)malloc(sizeof(char)*param->binary_sizes); - status = clGetProgramInfo(param->kernel->program, - CL_PROGRAM_BINARIES, - sizeof(char *), - ¶m->binaries, - NULL); - checkErrorFunc("clGetProgramInfo", status); - ret = true; - } - - return ret; -} - -void -convKExtraFlagToArg(KernelExtraFlags flags, CLBlasKargs* args) -{ - args->order = (flags & KEXTRA_COLUMN_MAJOR)?clblasColumnMajor: clblasRowMajor; - args->side = (flags & KEXTRA_SIDE_RIGHT)? clblasRight: clblasLeft; - args->uplo = (flags & KEXTRA_UPPER_TRIANG)?clblasUpper: clblasLower; - - args->transA = (flags & KEXTRA_TRANS_A)? clblasTrans: clblasNoTrans; - args->transB = (flags & KEXTRA_TRANS_B)? clblasTrans: clblasNoTrans; - if (isComplexType(args->dtype)) - { - args->transA = (flags & KEXTRA_CONJUGATE_A)?clblasConjTrans: args->transA; - args->transB = (flags & KEXTRA_CONJUGATE_B)?clblasConjTrans: args->transB; - } - args->diag = (flags & KEXTRA_UNIT_DIAGONAL)? clblasUnit: clblasNonUnit; - -} - -void -initCLBlasKArgDim(CLBlasKargs *args, MatrixInfo* mi, KernelExtraFlags extra) -{ - cl_int status; - float beta = ((extra & KEXTRA_BETA_ZERO) != 0)? 0.0f : 1.0f; - - memset( args, 0, sizeof(CLBlasKargs) ); - convKExtraFlagToArg( extra, args ); - args->dtype = mi->dtype; - - switch (mi->dtype) - { - case TYPE_FLOAT: - args->alpha.argFloat = 1.0; - args->beta.argFloat = beta; - break; - case TYPE_DOUBLE: - args->alpha.argDouble = 1.0; - args->beta.argFloat = beta; - break; - case TYPE_COMPLEX_FLOAT: - args->alpha.argFloatComplex.s[0] = 1.0; - args->alpha.argFloatComplex.s[1] = 0.0; - args->beta.argFloatComplex.s[0] = beta; - args->beta.argFloatComplex.s[1] = 0.0; - break; - - case TYPE_COMPLEX_DOUBLE: - args->alpha.argDoubleComplex.s[0] = 1.0; - args->alpha.argDoubleComplex.s[1] = 0.0; - args->beta.argDoubleComplex.s[0] = beta; - args->beta.argDoubleComplex.s[1] = 0.0; - break; - } - args->M = mi->M; - args->N = mi->N; - args->K = mi->K; - - args->A = clCreateBuffer(genInfo.ctx, CL_MEM_READ_ONLY, - args->N * args->M * mi->sizeDType, NULL, &status); - checkErrorFunc("clCreateBuffer",status); - mi->clA = args->A; - - status = clEnqueueWriteBuffer(genInfo.queue, args->A, CL_TRUE, 0, - args->N * args->M * mi->sizeDType, mi->A.v, 0, NULL, NULL); - checkErrorFunc("clEnqueueWriteBuffer",status); - - args->lda.matrix = args->K; - args->ldb.matrix = args->K; - args->ldc.matrix = args->M; - - args->B = clCreateBuffer(genInfo.ctx, CL_MEM_READ_ONLY , - args->K * args->N * mi->sizeDType, NULL, &status); - checkErrorFunc("clCreateBuffer",status); - mi->clB = args->B; - - status = clEnqueueWriteBuffer(genInfo.queue, args->B, CL_TRUE, 0, - args->K * args->N * mi->sizeDType, mi->B.v, 0, NULL, NULL); - checkErrorFunc("clEnqueueWriteBuffer",status); - - - args->C = clCreateBuffer(genInfo.ctx, CL_MEM_WRITE_ONLY , - args->M * args->K * mi->sizeDType, NULL, &status); - checkErrorFunc("clCreateBuffer",status); - - mi->clC = args->C; - args->addrBits = genInfo.deviceInfos.addressBits; - args->offsetM = 0; - args->offsetN = 0; - args->offA = 0; - args->offBX = 0; - args->offCY = 0; - args->scimage[0] = mi->clImgA; - args->scimage[1] = mi->clImgB; - -} - -void -initKernelArg( - MemoryPattern *pattern, - CLBlasKargs args, - cl_kernel kernel, - CLBlasKernelType kernType, - const CLBLASKernExtra *kextra) -{ - unsigned int ind; - unsigned int nrArgs; - cl_int status; - KernelArg karg[MAX_KERNEL_ARGS]; - - memset(karg, 0, sizeof(KernelArg) * MAX_KERNEL_ARGS); - - args.kernType = kernType; - pattern->sops->assignKargs(karg, &args, kextra); - - status = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, - sizeof(nrArgs), &nrArgs, NULL); - - for (ind = 0; ((ind < nrArgs) && (status == CL_SUCCESS)); ind++) { - status = clSetKernelArg(kernel, ind, karg[ind].typeSize, - karg[ind].arg.data); - } -} - -double -runKernel( - cl_kernel kernel, - cl_device_id device, - MemoryPattern *pattern, - const GParam *param, - //unsigned int dim, - CLBlasKargs *args, - const void *extra, - unsigned int funcID) -{ - unsigned int nrComputeUnits; - size_t globalWorkSize[2]; - size_t localWorkSize[3]; - cl_event evt = NULL; - cl_int status; - double ret; - - status = clGetDeviceInfo(device, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(cl_uint), - (void*)&nrComputeUnits, - NULL); - checkErrorFunc("clGetDeviceInfo",status); - - ////////////////////////////////////////////////////////////////////////// - //calcWorkGroups(); - - if (pattern->sops->calcThreads) { - pattern->sops->calcThreads(globalWorkSize, param->dims, - ¶m->pgran, args, extra); - } - else { - ///// - SubproblemDim globDim; - SubproblemDim sd[MAX_SUBDIMS]; - - kargsToProbDims(&globDim, funcID, args, false); - sd[0] = param->dims[0]; - sd[1] = param->dims[1]; - - if ((param->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) { - if (pattern->sops->innerDecompositionAxis(args) == - DECOMP_AXIS_X) { - - /* - * these dimensions will not used more anywhere, so we can - * just swap them - */ - swapDimXY(&(sd[0])); - swapDimXY(&(sd[1])); - swapDimXY(&globDim); - } - } - calcGlobalThreads(globalWorkSize, &(sd[0]), - ¶m->pgran, globDim.y, globDim.x); - } - - localWorkSize[0] = param->pgran.wgSize[0]; - localWorkSize[1] = param->pgran.wgSize[1]; - localWorkSize[2] = 0; - - fflush(stdout); - status = clEnqueueNDRangeKernel(genInfo.queue, kernel, param->pgran.wgDim, - NULL, globalWorkSize, localWorkSize, - 0, NULL, &evt); - clReleaseKernel(kernel); - checkErrorFunc("clEnqueueNDRangeKernel",status); - -#if 0 - { - cl_ulong start, end; - - status = clFinish(genInfo.queue); - - checkErrorFunc("clFinish", status); - status = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, - sizeof(cl_ulong), &start, NULL); - - checkErrorFunc("clGetEventProfilingInfo",status); - status = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, - sizeof(cl_ulong), &end, NULL); - checkErrorFunc("clGetEventProfilingInfo",status); - - end -= start; - end /= 1000; - ret = (double)end/1000; - } -#else - { - nano_time_t time; - - status = flushAll(genInfo.queue); - checkErrorFunc("flushAll", status); - - time = getCurrentTime(); - status = waitForSuccessfulFinish(genInfo.queue, &evt); - checkErrorFunc("waitForSuccessfulFinish", status); - time = getCurrentTime() - time; - - ret = (double)conv2nanosec(time)/1000; - ret /= 1000; - } - -#endif - clReleaseEvent(evt); - return ret; -} - -void -subInitMatrixInfo( - MatrixInfo *matrixInfo, - DataType dt, - unsigned int sizeType) -{ - matrixInfo->dtype = dt; - matrixInfo->sizeDType = sizeType; - - matrixInfo->A.v = malloc(matrixInfo->N * matrixInfo->M * sizeType); - matrixInfo->B.v = malloc(matrixInfo->N * matrixInfo->K * sizeType); - matrixInfo->C.v = malloc(matrixInfo->M * matrixInfo->K * sizeType); -} - -void -initMatrixFloat(FPtr* m, int maxi) -{ - int i; - for (i = 0; i < maxi; ++i) { - m->f[i] = 1.0; - } -} - - -void -initMatrixInfo( - MatrixInfo *mi, - DataType dt, - DeviceInfo* di, - BlasExtraInfo* bExtra - ) -{ - unsigned int nDim; - BlasFunctionInfo* bFunc = bExtra->parent->parent; - - for (nDim = 0; nDim < bExtra->numParam; ++nDim, mi++) { - unsigned int i; - unsigned int dimension = getDimension(nDim, dt, di, bFunc->funcNo); - - if (bFunc != NULL && bFunc->initKNM != NULL) { - bFunc->initKNM(mi, dimension); - } - else { - mi->K = dimension; - mi->N = dimension; - mi->M = dimension; - } - - - switch (dt) - { - case TYPE_FLOAT: - subInitMatrixInfo(mi, dt, sizeof(cl_float)); - initMatrixFloat(&mi->A, mi->K * mi->M); - initMatrixFloat(&mi->B, mi->N * mi->K); - break; - case TYPE_DOUBLE: - subInitMatrixInfo(mi, dt, sizeof(cl_double)); - for (i = 0; i < mi->K * mi->M; ++i) { - mi->A.d[i] = 1.0; - } - for (i = 0; i < mi->N * mi->K; ++i) { - mi->B.d[i] = 1.0; - } - break; - case TYPE_COMPLEX_FLOAT: - subInitMatrixInfo(mi, dt, sizeof(cl_float2)); - for (i = 0; i < mi->K * mi->M; ++i) { - mi->A.f2[i].s[0] = 1.0; - mi->A.f2[i].s[1] = 0.0; - } - for (i = 0; i < mi->N * mi->K; ++i) { - mi->B.f2[i].s[0] = 1.0; - mi->B.f2[i].s[1] = 0.0; - } - break; - case TYPE_COMPLEX_DOUBLE: - subInitMatrixInfo(mi, dt, sizeof(cl_double2)); - for (i = 0; i < mi->K * mi->M; ++i) { - mi->A.d2[i].s[0] = 1.0; - mi->A.d2[i].s[1] = 0.0; - } - for (i = 0; i < mi->N * mi->K; ++i) { - mi->B.d2[i].s[0] = 1.0; - mi->B.d2[i].s[1] = 0.0; - } - break; - default: - exit (EXIT_COD_UNKNOWN_DATATYPE); - } - - mi->clA = NULL; - mi->clB = NULL; - mi->clC = NULL; - - mi->clImgA = NULL; - mi->clImgB = NULL; - mi->imgA = NULL; - mi->imgB = NULL; - - } -} - -void -releaseMemObjOne(MatrixInfo * mi) -{ - clReleaseMemObject(mi->clA); - clReleaseMemObject(mi->clB); - clReleaseMemObject(mi->clC); - - mi->clA = NULL; - mi->clB = NULL; - mi->clC = NULL; - mi->clImgA = NULL; - mi->imgA = NULL; - mi->clImgB = NULL; - mi->imgB = NULL; -} - -void -releaseMemObjAll(MatrixInfo * mi, BlasExtraInfo* bExtra) -{ - unsigned int nDim; - - for (nDim = 0; nDim < bExtra->numParam; ++nDim, mi++) { - releaseMemObjOne(mi); - } -} - -void -destroyMatrixInfo(MatrixInfo* mi, BlasExtraInfo* bExtra) -{ - unsigned int nDim; - - for (nDim = 0; nDim < bExtra->numParam; ++nDim, mi++) { - free(mi->A.v); - free(mi->B.v); - free(mi->C.v); - } -} - -void -logBest( - unsigned int * bestParam, - unsigned int nDim, - GParam * gp, - double * bestTime) -{ - fprintf(logStream, " %d %s = %f\n",bestParam[nDim], - gp->name, bestTime[nDim]); - fflush(logStream); -} -void -logCheckError(int dim) -{ - fprintf(logStream, " [%5d]: NOT FOUND\n", dim); -} -void -logCheck( - int dim, - SubproblemDim* sdim, - PGranularity* pgran, - double t, - double oldt, - bool kern) -{ - GParam gp; - - gp.dims[0] = sdim[0]; - gp.dims[1] = sdim[1]; - gp.dims[2] = sdim[2]; - - gp.pgran = *pgran; - createGParamName(&gp); - if (genInfo.aExtendedOutput) { - if (oldt == 0) { - fprintf(logStream, " [%5d]: %s - %7g ",dim, gp.name, t); - oldt = t; - } - if (fabs(t - oldt) < 0.0001) { - fprintf(logStream, (kern) ? "* " : "+ "); - } - else { - fprintf(logStream, "- "); - } - } - fflush(logStream); -} - - -void -logParamName(GParam * params, int cur, int max) -{ - if (genInfo.aExtendedOutput) { - - fprintf(logStream, "%3i/%-3i, %s :", cur, max, params->name); - fflush(logStream); - - - /* For Debug GEMM, Memmory pattern #4 - - fprintf(logStream, - "%3i/%-3i; wg: %dx%d; iB: %lux%lu; gB: %lux%lu; bw: %lu", - cur, - max, - params->pgran.wgSize[1], - params->pgran.wgSize[0], - params->dims[1].x, - params->dims[1].y, - params->dims[0].x, - params->dims[0].y, - params->dims[0].bwidth); - */ - fflush(logStream); - - } - else { - if (cur > 0) { - fprintf(logStream, "\b\b\b\b\b\b\b"); - } - fprintf(logStream, "%5.2f%% ", genInfo.last - + (genInfo.next - genInfo.last)*cur/max); - fflush(logStream); - } - -} - -void -logTime(double time) -{ - if (genInfo.aExtendedOutput) { - fprintf(logStream, " %7.2f", time); - fflush(logStream); - } -} - -void -logKernalGen(void) -{ - if (genInfo.aExtendedOutput) { - fprintf(logStream, " *"); - fflush(logStream); - } -} - -void -logPattern(const char * patternName) -{ - if ( genInfo.aExtendedOutput || genInfo.patternName != patternName ) { - fprintf(logStream, "%s is being tuned, progress: ", patternName); - if (genInfo.aExtendedOutput) { - fprintf(logStream, "\n"); - }else { - fprintf(logStream, " "); - } - fflush(logStream); - genInfo.patternName = patternName; - } - -} -void -logEndString(void) -{ - if (genInfo.aExtendedOutput) { - fprintf(logStream, "\n"); - fflush(logStream); - } -} - -void -logExtraFlag( - KernelExtraFlags flags, - KernelExtraFlags flag, - const char * trueName, - const char * falseName - ) -{ - if ((flags & flag) > 0) { - fprintf(logStream, "%s", trueName); - } - else { - fprintf(logStream, "%s", falseName); - } - -} -void -logEndPattern(unsigned int func, unsigned int patt) -{ - //bool isFunc = (genInfo.aFunc == -1 || genInfo.aFunc == (int)func); - bool isFunc = genInfo.aFunc[func]; - bool isPattern = (genInfo.aPattern == -1 || genInfo.aPattern == (int)patt); - - if (!(isFunc && isPattern)) { - return; - } - - if (!genInfo.aExtendedOutput) { - fprintf(logStream, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"); - fprintf(logStream, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"); - fprintf(logStream, " tuning is complete. \n"); - } - fprintf(logStream, "\n"); - fflush(logStream); -} - -void -logExtra(BlasExtraInfo* bExtra) -{ - - const char* strType = ""; - const char* strTrans = ""; - - KernelExtraFlags flags = bExtra->flags; - - if (!genInfo.aExtendedOutput) { - - fprintf(logStream, "\b\b\b\b\b\b\b\b %5.2f%% ", genInfo.last); - } - else { - fprintf(logStream, " Flag (%d):(clblas*)", flags); - - strTrans = (flags & KEXTRA_TRANS_A)? " Trans": " NoTrans"; - logExtraFlag(flags, KEXTRA_CONJUGATE_A, " ConjTrans", strTrans); - fprintf(logStream, "(A)"); - strTrans = (flags & KEXTRA_TRANS_B)? " Trans": " NoTrans"; - logExtraFlag(flags, KEXTRA_CONJUGATE_B, " ConjTrans", strTrans); - fprintf(logStream, "(B)"); - logExtraFlag(flags, KEXTRA_COLUMN_MAJOR, " ColumnMajor", " RowMajor"); - logExtraFlag(flags, KEXTRA_UPPER_TRIANG, " Upper", " Lower"); - logExtraFlag(flags, KEXTRA_SIDE_RIGHT, " Right", " Left"); - - fprintf(logStream, " \n"); - - switch (bExtra->dtype) - { - case TYPE_FLOAT: strType = "FLOAT"; break; - case TYPE_DOUBLE: strType = "DOUBLE"; break; - case TYPE_COMPLEX_FLOAT: strType = "COMPLEX_FLOAT"; break; - case TYPE_COMPLEX_DOUBLE: strType = "COMPLEX_DOUBLE"; break; - } - fprintf(logStream, " TYPE = %s:", strType); - } - - - fflush(logStream); - logEndString(); -} - -void -logError(void) -{ - fprintf(logStream, " An internal kernel build error occurred!\n"); - fflush(logStream); -} - -static void -releaseSCImage(void** buf, cl_mem* clImg) -{ - if (*clImg != NULL) { - clReleaseMemObject(*clImg); - *clImg = NULL; - free(*buf); - *buf = NULL; - } -} - -static cl_int -createSCImage( - void **buf, - cl_mem *image) -{ - cl_image_format format = { CL_RGBA, CL_FLOAT }; - size_t width, height, maxWidth, maxHeight; - cl_int status; - cl_ulong memSize; - cl_device_id device; - cl_int err; - - err = clGetContextInfo(genInfo.ctx, CL_CONTEXT_DEVICES, - sizeof(device), &device, NULL); - - if (err != CL_SUCCESS) { - return err; - } - err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, - sizeof(memSize), &memSize, NULL); - - if (err != CL_SUCCESS) { - return err; - } - err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, - sizeof(maxWidth), &maxWidth, NULL); - - if (err != CL_SUCCESS) { - return err; - } - err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, - sizeof(maxHeight), &maxHeight, NULL); - - if (err != CL_SUCCESS) { - return err; - } - // some functions need 2 scratch images - memSize /= 2; - - height = (size_t)sqrt((double)memSize / sizeof(cl_float)); - width = height / 4; - if (height > maxHeight) { - height = maxHeight; - } - - if (width > maxWidth) { - width = maxWidth; - } - - *buf = calloc(width * height, 4 * sizeof(cl_float)); - if (buf == NULL) { - return CL_OUT_OF_HOST_MEMORY; - } - *image = clCreateImage2D(genInfo.ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, - &format, width, height, 0, *buf, &status); - if (*image == NULL) { - free(*buf); - *buf = NULL; - return status; - } - return CL_SUCCESS; -} - -static void -generatePrepKernel( - cl_device_id device, - MemoryPattern *pattern, - GParam * param, - CLBlasKargs *args, - CLBLASKernExtra *extra, - CLBlasKernelType kernType) -{ - PGranularity pgran; - Kernel *k = NULL; - size_t bSize; - char* bin; - cl_int status; - cl_ulong ldsSize; - CLBlasKernelType kernTypeOld = extra->kernType; - - DUMMY_ARG_USAGE(args); - - extra->kernType = kernType; - clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, - sizeof(cl_ulong), &ldsSize, NULL); - - pgran = param->pgran; - - k = makeKernel( - device, - genInfo.ctx, - pattern->sops->genKernel, - param->dims, - &pgran, - extra, - NULL, - NULL); - - status = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES, - sizeof(size_t), &bSize, NULL); - checkErrorFunc("clGetProgramInfo", status); - - bin = (char *)malloc(sizeof(char)*bSize); - status = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES, sizeof(char*), - &bin, NULL); - checkErrorFunc("clGetProgramInfo", status); - - if (kernType == CLBLAS_PREP_A_KERNEL) { - param->binariesA = bin; - param->binary_sizesA = bSize; - param->kernelPrepA = k; - } - - if (kernType == CLBLAS_PREP_B_KERNEL) { - param->binariesB = bin; - param->binary_sizesB = bSize; - param->kernelPrepB = k; - } - - extra->kernType = kernTypeOld; -} - -void -delGParam(GParam * gp) -{ - if (gp != NULL) { - gp->count --; - if (gp->count ==0){ - destroyKernels(gp); - free(gp->binaries); - free(gp->binariesA); - free(gp->binariesB); - free(gp); - gp = NULL; - } - } -} - -void -setFlagsDependentOnDevice( - CLBlasKargs* args, - CLBLASKernExtra* extra, - GParam* parCur, - unsigned int func, - unsigned int patt - ) -{ - SolutionStep step; - cl_int status; - - step.args = *args; - step.cmdQueue = genInfo.queue; - step.extraFlags = extra->flags; - step.funcID = func; - step.kernels[0] = NULL; - step.kernels[1] = NULL; - step.kernels[2] = NULL; - //step.node = NULL; - step.numEventsInWaitList = 0; - step.patternID = patt; - step.pgran = parCur->pgran; - step.subdims[0] = parCur->dims[0]; - step.subdims[1] = parCur->dims[1]; - step.subdims[2] = parCur->dims[2]; - - step.device.id = genInfo.targetDevice.id; - status = identifyDevice(&step.device); - checkErrorFunc("identifyDevice", status); - - if (step.device.ident.vendor == VENDOR_AMD) { - extra->flags |= (KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD); - } - selectVectorization(&step, extra); -} - -bool -genAllKernel( - CLBlasKargs* args, - CLBLASKernExtra extra, - GParam* parCur, - MemoryPattern * pattern, - unsigned int func, - unsigned int patt - ) -{ - bool ret; - cl_device_id device = genInfo.targetDevice.id; - - if (func == (unsigned int)CLBLAS_SYRK || - func == (unsigned int)CLBLAS_SYR2K) { - extra.flags |= KEXTRA_SYRK_SEPARATE_DIAGONAL; - } - - setFlagsDependentOnDevice(args, &extra, parCur, func, patt); - - // fixup work group size in respect with desired work dispatch order - if ((parCur->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) { - if (pattern->sops->innerDecompositionAxis(args) == DECOMP_AXIS_X) { - unsigned int u; - - u = parCur->pgran.wgSize[0]; - parCur->pgran.wgSize[0] = parCur->pgran.wgSize[1]; - parCur->pgran.wgSize[1] = u; - } - } - - if (pattern->sops->fixupArgs) { - pattern->sops->fixupArgs(args, parCur->dims, &extra); - } - - ret = genKernel(parCur, &extra, pattern); - - if (patternUseImages(pattern)) { - generatePrepKernel(device, pattern, parCur, args, &extra, - CLBLAS_PREP_A_KERNEL); - - generatePrepKernel(device, pattern, parCur, args, &extra, - CLBLAS_PREP_B_KERNEL); - } - return ret; -} - -double -runAllKernel( - MemoryPattern * pattern, - CLBlasKargs *args, - GParam* parCur, - unsigned int funcId, - double bestTime) -{ - double time; - double minTime = 1e30; - int i; - cl_device_id device = genInfo.targetDevice.id; - int max_run_kernel = MAX_RUN_KERNEL + (funcBlasLevel(funcId) == 2 ? 7 : 0); - - - cl_int status; - cl_kernel kernel; - if (patternUseImages(pattern)) { - /////////////// A ////////////// - cl_kernel kPrepA; - cl_kernel kPrepB; - - status = clCreateKernelsInProgram( - parCur->kernelPrepA->program, 1, &kPrepA, NULL); - checkErrorFunc("clGetProgramInfo", status); - - initKernelArg(pattern, *args, kPrepA, CLBLAS_PREP_A_KERNEL, - parCur->kernelPrepA->extra); - - args->kernType = CLBLAS_PREP_A_KERNEL; - time = runKernel(kPrepA, device, pattern, parCur, - args, parCur->kernelPrepA->extra, funcId); - - /////////////// B ////////////// - status = clCreateKernelsInProgram( - parCur->kernelPrepB->program, 1, &kPrepB, NULL); - checkErrorFunc("clGetProgramInfo", status); - - - initKernelArg(pattern, *args, kPrepB, CLBLAS_PREP_B_KERNEL, - parCur->kernelPrepB->extra); - - args->kernType = CLBLAS_PREP_B_KERNEL; - time = runKernel(kPrepB, device, pattern, parCur, - args, parCur->kernelPrepB->extra, funcId); - args->kernType = CLBLAS_COMPUTING_KERNEL; - } - - for (i = 0; i < max_run_kernel; ++i) { - status = clCreateKernelsInProgram(parCur->kernel->program, 1, &kernel, NULL); - checkErrorFunc("clGetProgramInfo", status); - - initKernelArg(pattern, *args, kernel, CLBLAS_COMPUTING_KERNEL, - parCur->kernel->extra); - - time = runKernel(kernel, device, pattern, parCur, args, - parCur->kernel->extra, funcId); - minTime = fmin(time, minTime); - if (minTime > bestTime*2 && i >= max_run_kernel/2 && minTime > 2) { - break; - } - } - return minTime; -} - -GParam* -createParCur(SubDimInfo *sdi) -{ - GParam* parCur = calloc(1, sizeof(GParam)); - parCur->count ++; - - parCur->dims[0] = sdi->sdim[0]; - parCur->dims[1] = sdi->sdim[1]; - parCur->dims[2] = sdi->sdim[2]; - parCur->pgran = sdi->pgran; - - createGParamName(parCur); - return parCur; -} - -GParam* -createParCur2(GParam* sdi) -{ - GParam* parCur = calloc(1, sizeof(GParam)); - parCur->count = 1; - - parCur->dims[0] = sdi->dims[0]; - parCur->dims[1] = sdi->dims[1]; - parCur->dims[2] = sdi->dims[2]; - parCur->pgran = sdi->pgran; - parCur->vecLen = sdi->vecLen; - parCur->binaries = NULL; - parCur->binariesA = NULL; - parCur->binariesB = NULL; - parCur->binary_sizes = 0; - parCur->binary_sizesA = 0; - parCur->binary_sizesB = 0; - parCur->time = sdi->time; - - createGParamName(parCur); - return parCur; -} - -static void -setParam(BlasParamInfo* bParam, double time, GParam* parCur) -{ - bParam->time = time; - bParam->pGran = parCur->pgran; - bParam->sDim[0] = parCur->dims[0]; - bParam->sDim[1] = parCur->dims[1]; - bParam->sDim[2] = parCur->dims[2]; - // - if (genInfo.aIsKernel) { - bParam->kSize[0] = (unsigned int)parCur->binary_sizes; - bParam->kSize[1] = (unsigned int)parCur->binary_sizesA; - bParam->kSize[2] = (unsigned int)parCur->binary_sizesB; - } - else { - bParam->kSize[0] = 0; - bParam->kSize[1] = 0; - bParam->kSize[2] = 0; - } -} - -int VISIBILITY_HIDDEN -comp(const void *i, const void *j) -{ - return *(double *)i < *(double *)j; -} - -void VISIBILITY_HIDDEN -initCLBLASExtra(CLBLASKernExtra* extra, BlasExtraInfo* bExtra) -{ - memset( extra, 0, sizeof(CLBLASKernExtra) ); -// if (bExtra) { - extra->dtype = bExtra->dtype; - extra->flags = bExtra->flags; -// extra->vecLen = bExtra->vecLen; -// } -} -#ifdef TEST_LOG - -typedef struct LOG_FILE -{ - FILE* f; - bool readElem; - double t1, t2, t3; - double tall; -}LOG_FILE; - -typedef struct LOG_STAT -{ - int count; - double minTime; - double maxTime; - double midleTime; -}LOG_STAT; - -void -openLogFile(LOG_FILE* lf, char* fileName) -{ - if ((lf->f = fopen(fileName, "a+")) != NULL) { - - } - -} -void -closeLogFile(LOG_FILE* lf) -{ - fclose(lf->f); -} -bool -readElemLogFile(LOG_FILE* lf, SubproblemDim* sd, unsigned int vecLen) -{ - unsigned int l0x, l0y, l0w, l1x, l1y, l1w, vl; - double t1, t2, t3, tall; - fscanf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n", - &l0x, &l0y, &l0w, &l1x, &l1y, &l1w, &vl, &t1, &t2, &t3, &tall); - if ( l0x == sd[0].x && l0y == sd[0].y && l0w == sd[0].bwidth - && l1x == sd[1].x && l1y == sd[1].y && l1w == sd[1].bwidth - && vl == vecLen) { - lf->t1 = t1; - lf->t2 = t2; - lf->t3 = t3; - lf->tall = tall; - return true; - } - return false; -} - -double -readLogFile(LOG_FILE* lf, SubproblemDim* sd, unsigned int vecLen) -{ - lf->t3 = 0; - lf->readElem = readElemLogFile(lf, sd, vecLen); - if (!lf->readElem) { - rewind(lf->f); - while (!lf->readElem && !feof(lf->f)) { - lf->readElem = readElemLogFile(lf, sd, vecLen); - } - } - return lf->t3; -} - -double -saveLogFile(LOG_FILE* lf, - SubproblemDim* sd, - unsigned int vecLen, - double* time, - double timeAll) -{ - if (!lf->readElem) { - fprintf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n", - (unsigned int)sd[0].x, (unsigned int)sd[0].y, - (unsigned int)sd[0].bwidth, - (unsigned int)sd[1].x, (unsigned int)sd[1].y, - (unsigned int)sd[1].bwidth, - vecLen, - time[0], time[1], time[2], timeAll); - } - return lf->t3; -} - -void -getBestVariant(LOG_FILE* lf) -{ - rewind(lf->f); - lf->tall = 0; - while (!feof(lf->f)) { - unsigned int l0x, l0y, l0w, l1x, l1y, l1w, vl; - double t1, t2, t3, tall; - fscanf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n", - &l0x, &l0y, &l0w, &l1x, &l1y, &l1w, &vl, &t1, &t2, &t3, &tall); - lf->t1 = fmin(t1, lf->t1); - lf->t2 = fmin(t2, lf->t2); - lf->t3 = fmin(t3, lf->t3); - lf->tall += tall; - } -} - -#endif - -static void -findBestParams( - MemoryPattern *pattern, - unsigned int func, - unsigned int patt, - bool isEnvPattSelected, - BlasExtraInfo* bExtra, - GParam* bestParam[DIMARRAYCOUNT]) -{ - unsigned int nDim; - SolutionStep step; - MatrixInfo mi [DIMARRAYCOUNT]; - //cl_kernel kernel_old[MAX_CLBLAS_KERNELS_PER_STEP]; - double time[DIMARRAYCOUNT]; - CLBLASKernExtra extra; - SubDimInfo sdi; - void* imgA = NULL; - cl_mem clImgA = NULL; - void* imgB = NULL; - cl_mem clImgB = NULL; - int curStep; - unsigned int dimension; - -#ifdef TEST_LOG - LOG_FILE lf; - double all_time = 0; - double step_time; - char str[1000]; -#endif - - memset(time, 0, sizeof(time)); - initCLBLASExtra(&extra, bExtra); - -#ifdef TEST_LOG - sprintf(str, "test_%d_%d_%d_%d.log",func, patt, extra.dtype, extra.flags); - openLogFile(&lf,str); -#endif - - // create images - if (patternUseImages(pattern)) { - cl_int status; - // Init Image - status = createSCImage(&imgA, &clImgA); - checkErrorFunc("createSCImage", status); - status = createSCImage(&imgB, &clImgB); - checkErrorFunc("createSCImage", status); - } - - initSubDimInfo(&sdi, pattern, &genInfo.deviceInfos, func, patt, - extra.dtype, extra.flags); - - initMatrixInfo(mi, extra.dtype, &genInfo.deviceInfos, bExtra); - resetSubdim(&sdi); - - curStep = 0; - while (nextSubdim(&sdi, genInfo.aMaxparam, time[bExtra->numParam - 1])) - { - GParam* parCur; - GParam* lastbest[DIMARRAYCOUNT]; - bool isKernelValid; - if (bExtra) { - parCur = createParCur(&sdi); - } - - globalDim++; - curStep++; - logParamName(parCur, curStep, sdi.varCount); - -#ifdef TEST_LOG - step_time = getCurrentTime(); - time[DIMARRAY_BIG] = readLogFile(&lf, sdi.sdim, sdi.vecLen); - if (!lf.readElem) { -#endif - -#ifdef TEST_LOG - } - else { - time[DIMARRAY_SMALL] = lf.t1; - time[DIMARRAY_MIDDLE] = lf.t2; - time[DIMARRAY_BIG] = lf.t3; - step_time = lf.tall; - } -#endif - - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - if (bExtra){ - lastbest[nDim] = NULL; - } - } - - isKernelValid = 0; - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - - BlasParamInfo* bParam; - // can current combination of flags be handled by selected pattern - bool isProbSupported = false; - - dimension = getDimension(nDim, extra.dtype, - &genInfo.deviceInfos, - bExtra->parent->parent->funcNo); - - // setup kernel arguments - if (patternUseImages(pattern)) { - // Init Image - mi[nDim].imgA = imgA; - mi[nDim].clImgA =clImgA; - mi[nDim].imgB = imgB; - mi[nDim].clImgB =clImgB; - } - - // Incorrect subdimension for a given size of the matrix - if ( dimension < sdi.sdim[0].x || - dimension % sdi.sdim[0].x != 0 || - dimension < sdi.sdim[0].y || - dimension % sdi.sdim[0].y != 0 || - dimension < sdi.sdim[0].bwidth || - dimension % sdi.sdim[0].bwidth != 0 - ) { - - releaseMemObjOne(mi + nDim); - if (genInfo.aExtendedOutput) { - fprintf(logStream, " "); - } - // write dummy data - time[nDim] = -1; - continue; - } - - step.extraFlags = extra.flags; - step.funcID = func; - initCLBlasKArgDim( &step.args, mi + nDim, extra.flags ); - - // assuming that all - // "old-fashioned" patterns, providing no performance estimation - // function can handle any set of arguments/flags - if ( NULL == pattern->sops->getPatternPerf || - pattern->sops->getPatternPerf( step.extraFlags, - (void*)&step.args ) >= 0 ) { - - isProbSupported = true; - } - else { - isProbSupported = false; - } - - // if current flags and dimensions are not optimal for current - // pattern - skip building and running kernel. - // But if the pattern is selected by environment - // and can handle current problem - tune it anyway. - if ( (patt != selectPattern( &step, 0 ) && - (!isEnvPattSelected || !isProbSupported)) ) { - - releaseMemObjOne(mi + nDim); - - // write dummy data - time[nDim] = -1; - bestParam[nDim] = NULL; - continue; - } - - if ( 0 == isKernelValid ) { - isKernelValid = genAllKernel( - &step.args, - extra, - parCur, - pattern, - func, - patt); - - logKernalGen(); - } - - if ( 0 == isKernelValid ) { - - releaseMemObjOne(mi + nDim); - logError(); - break; - } - bParam = &(bExtra->param[nDim]); - -#ifdef TEST_LOG - if (!lf.readElem) { -#endif - time[nDim] = runAllKernel(pattern, &step.args, parCur, - func, bParam->time); - releaseMemObjOne(mi + nDim); -#ifdef TEST_LOG - } -#endif - logTime(time[nDim]); - if (bParam->time > time[nDim]) { - if (bExtra) { - BlasParamInfo* bParamNT = &(bExtra->param[nDim]); - setParam(bParamNT, time[nDim], parCur); - lastbest[nDim] = bestParam[nDim]; - bestParam[nDim] = parCur; - parCur->count++; - } - } - } - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - if (bExtra) { - delGParam(lastbest[nDim]); - lastbest[nDim] = NULL; - } - } - -#ifdef TEST_LOG - step_time = ((double)(getCurrentTime()) - step_time)/1000000; - saveLogFile(&lf, sdi.sdim, sdi.vecLen, time, step_time); - if (lf.readElem) { - step_time = lf.tall; - } - logTime(step_time); - all_time += step_time; -#endif - logEndString(); - releaseMemObjAll(mi, bExtra); - if (bExtra) { - delGParam(parCur); - } - } - -#ifdef TEST_LOG - // Show log - resetSubdim(&sdi); - double t; - double all = 0; - int count = 0; - - time[DIMARRAY_SMALL] = 5000.0; - time[DIMARRAY_MIDDLE] = 5000.0; - time[DIMARRAY_BIG] = 5000.0; - - sdi.returnAll = true; - do - { - t = readLogFile(&lf, sdi.sdim, sdi.vecLen); - if (lf.readElem) { - time[DIMARRAY_SMALL] = fmin(lf.t1, time[DIMARRAY_SMALL]); - time[DIMARRAY_MIDDLE] = fmin(lf.t2, time[DIMARRAY_MIDDLE]); - time[DIMARRAY_BIG] = fmin(lf.t3, time[DIMARRAY_BIG]); - all+= lf.tall; - count++; - } - else { - printf ("^"); - } - - } while (nextSubdim(&sdi, genInfo.aMaxparam, t)); - -#ifdef TEST_LOG - getBestVariant(&lf); -#endif - - lf.t1 = time[DIMARRAY_SMALL]; - lf.t2 = time[DIMARRAY_MIDDLE]; - lf.t3 = time[DIMARRAY_BIG]; - lf.tall = all; - - fprintf(logStream, "---------------------------------------------------\n"); - fprintf(logStream, " steps time1 time2 time3 AllTime \n"); - int tmin = (int)(lf.tall/1000/60); - int tsec = (int)(lf.tall/1000) - tmin*60; - fprintf(logStream, " --> Best %5d %7.2lf %7.2lf %7.2lf %2d:%2d \n", - count, lf.t1, lf.t2, lf.t3, tmin, tsec); - tmin = (int)(all_time/1000/60); - tsec = (int)(all_time/1000) - tmin*60; - fprintf(logStream, " --> Fast %5d %7.2lf %7.2lf %7.2lf %2d:%2d\n", - curStep, - bExtra->param[DIMARRAY_SMALL].time, - bExtra->param[DIMARRAY_MIDDLE].time, - bExtra->param[DIMARRAY_BIG].time, - tmin,tsec); - - globalFastTime += all_time; - globalTime += lf.tall; - - closeLogFile(&lf); -#endif - - - - logEndString(); - // Release image - releaseSCImage(&imgA, &clImgA); - releaseSCImage(&imgB, &clImgB); - - destroyMatrixInfo(mi, bExtra); -} - -double -checkData( - TargetDevice* devID, - const MemoryPattern * pattern, - DataType dtype, - KernelExtraFlags flags, - int dim, - double oldt) -{ - SubproblemDim sdim[MAX_SUBDIMS]; - PGranularity pgran; - double time; - int i; - unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP]; - size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP]; - - int status; - - for (i = 0; i < MAX_CLBLAS_KERNELS_PER_STEP; ++i) { - buffer[i] = NULL; - sizeBuffer[i] = 0; - } - - status = getGranularityInfo(devID, pattern->name, dtype, flags, dim, sdim, - &pgran, &time); - - - if (status == GF_SUCCESS) { - status = getKernelInfo(devID, pattern->name, dtype, flags, dim, buffer, - sizeBuffer); - - logCheck(dim, sdim, &pgran, time, oldt, buffer[0] != NULL); - } - else { - logCheckError(dim); - } - free(buffer[0]); - free(buffer[1]); - free(buffer[2]); - return time; -} - -void -logDimension(BlasFunctionInfo* bFunc) -{ - int func = bFunc->funcNo; - int i; - - if (genInfo.aExtendedOutput) { - printf("FLOAT "); - for (i = 0; i < DIMARRAYCOUNT; ++i) { - printf(" %6u", - getDimension(i, TYPE_FLOAT, &genInfo.deviceInfos, func)); - } - printf("\n"); - printf("DOUBLE "); - for (i = 0; i < DIMARRAYCOUNT; ++i) { - printf(" %6u", - getDimension(i, TYPE_DOUBLE, &genInfo.deviceInfos, func)); - } - printf("\n"); - printf("COMPLEX FLOAT "); - for (i = 0; i < DIMARRAYCOUNT; ++i) { - printf(" %6u", - getDimension(i, TYPE_COMPLEX_FLOAT, &genInfo.deviceInfos, func)); - } - printf("\n"); - printf("COMPLEX DOUBLE "); - for (i = 0; i < DIMARRAYCOUNT; ++i) { - printf(" %6u", - getDimension(i, TYPE_COMPLEX_DOUBLE, &genInfo.deviceInfos, func)); - } - printf("\n"); - - } - -} - -void -calcExtraCount(int index, int indexCount) -{ - - genInfo.last = (double)index/indexCount*100; - genInfo.next = (double)(index + 1)/indexCount*100; -} - -int -isFlag(BlasExtraInfo* info, KernelExtraFlags flag) -{ - return (info->flags & flag) == flag; -} - -int -isNoFlag(BlasExtraInfo* info, KernelExtraFlags flag) -{ - return (info->flags & flag) == 0; -} - -/* Check if current set of flags and datatype should be evaluated - for current function and pattern. - It may be skipped due to compiler/runtime bugs and if it is - considered slow for any of the checked problem sizes - - If skipSlowPatt parameter is set to false - pattern considered slow - is tuned anyway, otherwise, it is skipped */ -bool -skipFlags(BlasExtraInfo* info, - int patt, - int func, - DeviceInfo* pDI, - bool skipSlowPatt) -{ - bool b = false; - int i; - bool shouldTunePatt = false; - SolutionStep step; - - memset( &step, 0, sizeof(SolutionStep) ); - - step.funcID = func; - step.patternID = patt; - step.extraFlags = info->flags; - - (void)func; - (void)patt; - - /* evaluate problem sizes */ - /* skip pattern, if it is not optimal for any of the dimensions - for current flags */ - if ( skipSlowPatt ) { - - for ( i = 0; i < DIMARRAYCOUNT; i++ ) { - - step.args.M = getDimension( i, info->dtype, pDI, func ); - step.args.N = getDimension( i, info->dtype, pDI, func ); - step.args.K = getDimension( i, info->dtype, pDI, func ); - - if ( selectPattern( &step, 0 ) == (unsigned int)patt ){ - shouldTunePatt = true; - } - } - - if( false == shouldTunePatt ){ - return true; - } - } - - b |= (func == CLBLAS_SYMV) && (info->dtype == TYPE_COMPLEX_FLOAT); - b |= (func == CLBLAS_SYMV) && (info->dtype == TYPE_COMPLEX_DOUBLE); - - /* - * WORKAROUND for WINDOWS: Now, for many subproblem dimensions, - * when tuning TRMM, SYRK, SYR2K functions - * for complex-double type, gives BSoD. - */ - -#if defined(_WIN32) - b |= (func == CLBLAS_TRSM) && (info->dtype == TYPE_COMPLEX_DOUBLE); - b |= (func == CLBLAS_SYRK) && (info->dtype == TYPE_COMPLEX_DOUBLE); - b |= (func == CLBLAS_SYR2K) && (info->dtype == TYPE_COMPLEX_DOUBLE); -#endif - b |= !info->isUseForTunning; - return b; -} - -bool -isFilter(BlasExtraInfo* info, int patt, int func) -{ - int dType = (int)info->dtype; - int flag = (int)info->flags; - - bool isFunc = genInfo.aFunc[func]; - bool isPattern = (genInfo.aPattern == -1 || genInfo.aPattern == patt); - bool isDataType = genInfo.aDType[dType]; - bool isFlag = (genInfo.aFlag == -1 || genInfo.aFlag == flag); - - return (!(isFunc && isPattern && isDataType && isFlag)); -} - -void -initParamsTime(BlasExtraInfo* bExtra) -{ - unsigned int nDim; - - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - if (bExtra){ - bExtra->param[nDim].time += 1e50; - } - } -} - -void -saveBestParams( - BlasExtraInfo* bExtra, - GParam* bestParam[DIMARRAYCOUNT]) -{ - unsigned int nDim; - - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - if (bExtra){ - BlasParamInfo* bParam = &bExtra->param[nDim]; - - if (bestParam[nDim] != NULL){ - saveBestParam(&genInfo.targetDevice, bParam); - } - } - } -} - -void -deleteGParams (BlasExtraInfo* bExtra, GParam* bestParam[DIMARRAYCOUNT]) -{ - unsigned int nDim; - - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - if (bExtra){ - delGParam(bestParam[nDim]); - } - } -} - -void -checkDatas(BlasExtraInfo* bExtra, const MemoryPattern* pattern) -{ - unsigned int nDim; - double t; - unsigned int dimension; - int func = bExtra->parent->parent->funcNo; - - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - t = 0; - if (bExtra) { - dimension = getDimension(nDim, bExtra->dtype, - &genInfo.deviceInfos, func); - if(nDim == BANK_ALIGNED_CASE_RECORD_IDX) { - dimension = 0; - } - // TODO add implementation checkData - (void) pattern; - t = checkData(&genInfo.targetDevice, pattern, bExtra->dtype, - bExtra->flags, dimension, t); - } - logEndString(); - } -} - -void -generateKernelForOthersFlag( BlasExtraInfo* bExtra, - GParam* bestParam[DIMARRAYCOUNT], - MemoryPattern* pattern) -{ - unsigned int nExtra; - BlasPatternInfo* bPatt = bExtra->parent; - BlasFunctionInfo* bFunc = bPatt->parent; - BlasExtraInfo* bExtraOther; - CLBLASKernExtra extra; - GParam* bestParamOther[DIMARRAYCOUNT]; - unsigned int nDim; - CLBlasKargs args; - - memset( bestParamOther, 0, sizeof(GParam*)*DIMARRAYCOUNT ); - - for (nExtra = 0; nExtra < bPatt->numExtra; ++nExtra) { - - bool isMaskFlag; - bool isEqFlag; - bool isDataType; - unsigned int mask; - - bExtraOther = &(bPatt->extra[nExtra]); - - mask = bExtraOther->flags & bFunc->maskForTuningsKernel; - isMaskFlag = mask == bExtra->flags; - isEqFlag = bExtraOther->flags == bExtra->flags; - isDataType = bExtra->dtype == bExtraOther->dtype; - - if (isDataType && isMaskFlag && !isEqFlag) { - - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - if (bestParam[nDim] == NULL) { - continue; - } - bestParamOther[nDim] = createParCur2(bestParam[nDim]); - } - - for (nDim = 0; nDim < bExtra->numParam; nDim++) { - unsigned int nd; - - if (bestParam[nDim] == NULL) { - continue; - } - for (nd = 0; nd < nDim; ++nd) { - if (bestParam[nDim] == bestParam[nd]) { - bestParamOther[nDim] = bestParamOther[nd]; - bestParamOther[nDim]->count++; - } - } - - // If the user selected that they want to store the kernel binaries to disk, - // and we do not have those binaries, generate them again - if (genInfo.aIsKernel && bestParamOther[nDim]->kernel == NULL) { - MatrixInfo mi [DIMARRAYCOUNT]; - unsigned int func = bFunc->funcNo; - unsigned int patt = bPatt->pattNo; - - // Initialize resources to generate kernels in genAllKernel - initCLBLASExtra(&extra, bExtra); - initMatrixInfo( mi, extra.dtype, &genInfo.deviceInfos, bExtra ); - initCLBlasKArgDim( &args, mi, extra.flags ); - - genAllKernel(&args, extra, bestParamOther[nDim], pattern, func, patt); - - // Free those resources when finished - releaseMemObjAll( mi, bExtra ); - destroyMatrixInfo( mi, bExtra ); - - logKernalGen( ); - } - - // This stores the kernel binaries to disk - saveBestParams(bExtraOther, bestParamOther); - } - deleteGParams(bExtraOther, bestParamOther); - } - } -} - -BlasPatternInfo* -getPattern(BlasFunctionID fid, int pid) -{ - BlasFunctionInfo* pFunc = getBlasFunctionInfo(&genInfo.targetDevice, fid); - return &pFunc->pattInfo[pid]; -} - -void -configurePattern(void) -{ - // Initialization specific to the handler function. - //getPattern(CLBLAS_XXXX, 0)->isPGValid = ; - //getPattern(CLBLAS_XXXX, 0)->initSubdim = ; -} - -bool -isRebuild(BlasExtraInfo* bExtra) -{ - unsigned int nDim; - bool ret = genInfo.aCommand != C_DEFAULT; - - for (nDim = 0; nDim < bExtra->numParam; ++nDim) { - BlasParamInfo* bParam = &bExtra->param[nDim]; - - ret |= bParam->sstatus == SS_NOLOAD; - if (bParam->offset == 0 ) { - printf("*****\n"); - } - } - return ret; -} - - -void -createFile(void) -{ - unsigned int funcId; - unsigned int pattId = 0; - unsigned int envPattId = 0; - bool isEnvPattSelected = false; - unsigned int dev; - - // This intializes global genInfo with either the last detected platform, or the - // first AMD platform it finds. It records the number of devices in that platform. - initOpenCl( ); - - // For each devices - for (dev = 0; dev < genInfo.numDevices; dev++) { - initDevice(dev); - - // The following creates the .kdb file on disk according to the set environment variable - writeStorageCache(&genInfo.targetDevice); - - // The following creates the OpenCL context and commanqueue for the first device in global genInfo struct - getContext( ); - - // Does nothing; nop - configurePattern( ); - - // for each function - for (funcId = 0; funcId < BLAS_FUNCTIONS_NUMBER; funcId++) { - - char *pRest = NULL; - BlasFunctionInfo *funcInfo = getBlasFunctionInfo( - &genInfo.targetDevice, - funcId ); - - if (funcInfo->envImplementation != NULL) { - const char *envImpl; - - envImpl = getenv(funcInfo->envImplementation); - if (envImpl != NULL) { - - envPattId = strtoul( envImpl, &pRest, 10 ); - //wrong value of env. variable AMD_CLBLAS_X_IMPLEMENTATION - if( 0 == strlen( envImpl ) || - pRest != envImpl + strlen(envImpl) ){ - - isEnvPattSelected = false; - } - else{ - - isEnvPattSelected = true; - } - } - else{ - - isEnvPattSelected = false; - } - - } - - // if pattern is selected by environment - tune it - // otherwise - start from the pattern number 0 - if( true == isEnvPattSelected ){ - pattId = envPattId; - } - else{ - pattId = 0; - } - - //logPattern( funcInfo->name ); - do - { - unsigned int nExtra; - unsigned int nTuneExtra = 0; - BlasPatternInfo * bPatt; - MemoryPattern* pattern; - - bPatt = &(funcInfo->pattInfo[pattId]); - pattern = &(funcInfo->pattern[pattId]); - - //if select a new trsm memory pattern (#3), then skip it - if ( funcId == CLBLAS_TRSM && pattId == 3) { - pattId++; - continue; - } - - for (nExtra = 0; nExtra < bPatt->numExtra; ++nExtra) { - bool isRebuildRequired; - - BlasExtraInfo* bExtra; - bExtra = &(bPatt->extra[nExtra]); - genInfo.last = 0; - - // This evaluates whether the current combination of parameters from the given function should be tuned or not - // If skipFlags returns 1, then the this combination is skipped - // This checks for hardcoded combinations which are skipped because of known runtime bugs. - if ( skipFlags(bExtra, - pattId, - funcId, - &genInfo.deviceInfos, - !isEnvPattSelected ) ) { - continue; - } - - // Similar logic to skipFlags, but this mostly filters out cases that were specified on the command line - if (isFilter(bExtra, pattId, funcId)) { - continue; - } - logPattern( funcInfo->name ); - - calcExtraCount(nTuneExtra, bPatt->numTuneExtra); - nTuneExtra++; - - logDimension(funcInfo); - logExtra(bExtra); - - isRebuildRequired = isRebuild(bExtra); - - if (isRebuildRequired) { - size_t bestPatamSize = sizeof(GParam*)*DIMARRAYCOUNT; - - GParam* bestParam[DIMARRAYCOUNT]; - - memset(bestParam, 0, bestPatamSize); - - initParamsTime(bExtra); - - findBestParams( pattern, - funcId, - pattId, - isEnvPattSelected, - bExtra, - bestParam); - - saveBestParams(bExtra, bestParam); - - generateKernelForOthersFlag( bExtra, - bestParam, - pattern); - - deleteGParams(bExtra, bestParam); - } - checkDatas(bExtra, pattern); - } /* extra */ - //logEndPattern(funcId, pattId); - - pattId++; - /* patt */ - }while( false == isEnvPattSelected && - pattId < clblasSolvers[funcId].nrPatterns ); - - } /* func */ - } /* dev */ - destroyGenInfo(); -} - -void -parseArg(int argc, char* argv[]) -{ - - static char* help= "clblasTune - automatically tune the clblas " - "library for specific hardware.\n" - "\n" - "clblas function related parameters:\n" - " --gemm\n" - " Tune kernels for the GEMM function family.\n" - " --trmm\n" - " Tune kernels for the TRMM function family.\n" - " --trsm\n" - " Tune kernels for the TRSM function family.\n" - " --gemv\n" - " Tune kernels for the GEMV function family.\n" - " --symv\n" - " Tune kernels for the SYMV function family.\n" - " --syrk\n" - " Tune kernels for the SYRK function family.\n" - " --syr2k\n" - " Tune kernels for the SYR2K function family.\n" - "\n" - " You can specify the parameters of " - "several alternatives simultaneously.\n" - "\n" - " If any of these parameters is not specified the " - "tool tries kernels for all the functions.\n" - "\n" - " Used data types:\n" - " --float\n" - " Single precision version of functions.\n" - " --double\n" - " Double precision version of functions.\n" - " --complex\n" - " Single complex precision version of functions.\n" - " --double-complex\n" - " Double complex precision version of functions.\n" - "\n" - " You can specify the parameters of " - "several alternatives simultaneously.\n" - "\n" - " If any of these parameters is not specified the " - "tool tries kernels for all the data types.\n" - "\n" - "Management:\n" - " --fast\n" - " Using this option allows you to accelerate " - "tuning in up to 2-3 times. Achieving an optimal result " - "is not guaranteed.\n" - " --rebuild\n" - " Re-tuning the fastest OpenCL kernels. Can be " - "used after the driver update.\n" - " --store-kernels\n" - " Store found best kernels into a database file\n" - " WARNING! The file can be very large.\n" - "\n" - - ; - - - static char* args[] = { "--gemm", // 0 - "--trmm", // 1 - "--trsm", // 2 - "--buffers", // 3 - "--images", // 4 - "--float", // 5 - "--double", // 6 - "--complex", // 7 - "--double-complex", // 8 - "--store-kernels", // 9 - "--rebuild", // 10 -#if defined(_EXTENDED_TUNE_ARG) - "--e", // 11 - "--max", // 12 - "--extended-output", // 13 -#else - "", - "", - "", -#endif - "--gemv", // 14 - "--symv", // 15 - "--syrk", // 20 - "--syr2k", // 17 - "--fast", // 18 - "--caches", // 19 - "--help" // 20 - }; - int i; - unsigned int j; - bool isSetFunction = false; - bool isSetType = false; - - genInfo.aAll = true; - - for (i = 1; i < argc; ++i) { - char * arg = argv[i]; - bool b = true; - for (j = 0; j < sizeof(args)/sizeof(char*); ++ j){ - if (strcmp(arg, args[j]) == 0){ -#if defined(_EXTENDED_TUNE_ARG) - int argi = 0; -#endif - switch (j){ - case 0 : - genInfo.aFunc[CLBLAS_GEMM] = true; - isSetFunction = true; - break; - case 1 : - genInfo.aFunc[CLBLAS_TRMM] = true; - isSetFunction = true; - break; - case 2 : - genInfo.aFunc[CLBLAS_TRSM] = true; - isSetFunction = true; - break; - case 3 : genInfo.aPattern = 0; break; - case 4 : genInfo.aPattern = 1; break; - case 5 : - genInfo.aDType[TYPE_FLOAT] = true; - isSetType = true; - break; - case 6 : - genInfo.aDType[TYPE_DOUBLE] = true; - isSetType = true; - break; - case 7 : - genInfo.aDType[TYPE_COMPLEX_FLOAT] = true; - isSetType = true; - break; - case 8 : - genInfo.aDType[TYPE_COMPLEX_DOUBLE] = true; - isSetType = true; - break; - case 9 : genInfo.aIsKernel = true; break; - case 10: genInfo.aCommand = C_REBUILD; break; -#if defined(_EXTENDED_TUNE_ARG) - case 11: - i++; - argi = atoi(argv[i]); - genInfo.aFlag = argi; - break; - case 12: - i++; - argi = atoi(argv[i]); - genInfo.aMaxparam = argi; - break; - case 13: - genInfo.aExtendedOutput = true; - break; -#endif - case 14: - genInfo.aFunc[CLBLAS_GEMV] = true; - isSetFunction = true; - break; - case 15: - genInfo.aFunc[CLBLAS_SYMV] = true; - isSetFunction = true; - break; - case 16: - genInfo.aFunc[CLBLAS_SYRK] = true; - isSetFunction = true; - break; - case 17: - genInfo.aFunc[CLBLAS_SYR2K] = true; - isSetFunction = true; - break; - case 18: genInfo.aAll = false; break; - case 19: genInfo.aPattern = 2; break; - case 20: - printf ("%s", help); - exit(0); - break; - } - b = false; - } - } - if (b) { - fprintf(stdout, "Unknown argument %s\n", arg); - } - } - if (!isSetFunction) { - for (i=0; i < BLAS_FUNCTIONS_NUMBER; ++i) { - genInfo.aFunc[i] = 1; - } - } - if (!isSetType) { - for (i=0; i < TYPE_NUMBER; ++i) { - genInfo.aDType[i] = 1; - } - } -} - -int -main(int argc, char* argv[]) -{ - FILE_PATH = getenv(ENV_FILE_PATH); - - // This clears and initializes the global GeneratorInfoRec genInfo struct - initGeneratorInfoRec( ); - parseArg(argc, argv); - - // This will - // Set up the global clblasSolvers for all function families supported within blas, including initializing memory patterns - // Identify all recognized devices in the system - clblasSetup(); - - if (!FILE_PATH){ - printf("The environment variable 'CLBLAS_STORAGE_PATH' is not defined\n"); - exit(EXIT_COD_NO_ENVIRONMENT_VARIABLE); - } - - logStream = stdout; - createFile(); - -#ifdef TEST_LOG - - int h = (int)(globalTime/1000/60/60); - int m = (int)(globalTime/1000/60) - h*60; - int c = (int)(globalTime/1000) - m*60 - h*60*60; - fprintf(logStream, " --> All time : %2d:%2d:%2d \n",h, m,c); - - h = (int)(globalFastTime/1000/60/60); - m = (int)(globalFastTime/1000/60) - h*60; - c = (int)(globalFastTime/1000) - m*60 - h*60*60; - fprintf(logStream, " --> Fast time : %2d:%2d:%2d \n",h, m,c); -#endif -} - -char* -getDeviceName(cl_device_id devID, int * status) -{ - char* devName; - size_t size; - *status = clGetDeviceInfo(devID, CL_DEVICE_NAME, 0, NULL, &size); - checkErrorFunc("clGetDeviceInfo", *status); - - devName = malloc(size * sizeof(char)); - - *status = clGetDeviceInfo(devID, CL_DEVICE_NAME, size, devName, NULL); - checkErrorFunc("clGetDeviceInfo", *status); - return devName; -} - |