diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/tests/t_tilemul.c')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/tests/t_tilemul.c | 1102 |
1 files changed, 0 insertions, 1102 deletions
diff --git a/external/clBLAS/src/library/blas/gens/tests/t_tilemul.c b/external/clBLAS/src/library/blas/gens/tests/t_tilemul.c deleted file mode 100644 index 4b4dd803..00000000 --- a/external/clBLAS/src/library/blas/gens/tests/t_tilemul.c +++ /dev/null @@ -1,1102 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - -#ifdef __APPLE__ -#include <OpenCL/cl.h> -#else -#include <CL/cl.h> -#endif -#include <string.h> -#include <stdlib.h> -#include <assert.h> -#include <math.h> -#include <stdio.h> -#include <getopt.h> -#include <kerngen.h> -#include <blas_kgen.h> -#include <clblas_stddef.h> - -#define JUST_MULTIPLICATION 0 - -#if JUST_MULTIPLICATION -enum { - ITEM_WORK_M = 1, - ITEM_WORK_N = 1, - ITEM_BLOCKS_K = 1, -}; -#else -enum { - ITEM_WORK_M = 4, - ITEM_WORK_N = 4, - ITEM_BLOCKS_K = 3, - RAND_BOUND = 10 -}; -#endif - -const char *kernelName = "tilemul_test"; - - -// float types based unified pointer -typedef union FPtr { - void *v; - cl_float *f; - cl_double *d; - cl_float2 *f2; - cl_double2 *d2; -} FPtr; - -// float type based unified data type -typedef union FType { - unsigned char u[sizeof(cl_double)]; - cl_float f; - cl_float2 f2; - cl_double d; - cl_double2 d2; -} FType; - -static void -printUsage(const char *programName, int exitCode) -{ - printf( "USAGE: %s [options] <M N K>\n" - " --help, -h Print this help message.\n" - " --device, -d <device> OpenCL device used. <device> can " - "be \"gpu\" or \"cpu\". Default is \"gpu\".\n" - " --type, -t <type> Type can be s, d, c or z. Default " - "is s.\n" - " --fetch, -f <vector size> Size of used fetch vectors, in used " - "types. Default is 1.\n" - " --local, -l <matrix> If matrix is local or global. Matrix " - "can be A or B. By default, both are global.\n" - " --verbose, -v Turn on verbose mode.\n" - " --a, -a <order>\n" - " --b, -b <order>\n Set order for tiles a and b fetching. " - "Order can be are \"r\" for row major and \"c\" for " - "column major. Default values are \"r\" for A and \"c\" for B.\n" - " --skew, -s <skew_value> Set skews for tiles along M, N, and K " - "directions. skew_value can be \"a\" for tile A skew along M, \"b\"" - " for tile B skew along N and \"k\" for both tiles skew along K. " - "There is no skews by default.\n" - " -g, --globalcycling <global_cycling_value>\n" - " Set global cycling for tiles along M, " - "N and K directions. global_cycling_value can be \"a\" for tile A " - "global cycling along M, \"b\" for tile B global cycling along N " - "and \"k\" for both tiles global cycling along K. There is no " - "global cycling enabled by default.\n" - " --iter, -i <num> Number of iterations.\n" - " --core, -c <mulcore> Multiplier core. <mulcore> can " - "be \"muladd\", \"mad\" or \"dot\". Default is \"mad\".\n" - " --old, -o Use old tilemul generator interface " - "with one generator function call for both fetching and " - "multiplication. Separate generators functions are used by " - "default.\n" - " M N K Size of block.\n", - programName); - exit(exitCode); -} - -void -genFillTileWithNAN(struct KgenContext *ctx, const Tile *tile) -{ - char tmp[1024]; - Kstring elem; - unsigned int incRows, incCols; - unsigned int i, j, v; - - if (!tile->trans) { - incRows = 1; - v = incCols = umin(tile->vecLen, tile->nrCols); - } - else { - v = incRows = umin(tile->vecLen, tile->nrRows); - incCols = 1; - } - - for (i = 0; i < tile->nrRows; i += incRows) { - for (j = 0; j < tile->nrCols; j += incCols) { - sprintfTileElement(&elem, tile, i, j, v); - sprintf(tmp, "%s = NAN;\n", elem.buf); - kgenAddStmt(ctx, tmp); - } - } - - kgenAddBlankLine(ctx); -} - -void -addTestPrefix(struct KgenContext *ctx, bool isDouble) -{ - kgenDeclareUptrs(ctx, isDouble); -} - -static void checkRet(int ret, const char *genName) -{ - if (ret != 0) { - printf("%s generator failed: %s\n", genName, strerror(-ret)); - exit(EXIT_FAILURE); - } -} - -void -genTest( - struct KgenContext *ctx, - BlasGenSettings *gset, - TileMulOpts *mulOpts, - bool separateFetch) -{ - char s[1024]; - Kstring kstr; - char *tName, tVect[64], *ptrName; - KernelVarNames *vnames = &gset->varNames; - DataType dtype = gset->kextra->dtype; - const SubproblemDim *subdims = gset->subdims; - unsigned int vecLen = gset->kextra->vecLen; - size_t m, n, k; - unsigned int i, j; - bool tra, trb, localA, localB, vecCoords; - int ret; - TileMulFlags flags = mulOpts->flags; - FetchOpts fetchOpts; - - m = gset->subdims[1].y; - n = gset->subdims[1].x; - k = gset->subdims[1].bwidth; - - tra = ((flags & TILEMUL_TRA) != 0); - trb = ((flags & TILEMUL_TRB) != 0); - localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY); - localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY); - - vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0); - - tVect[0] = '\0'; - - if (vecCoords && vecLen != 1) { - sprintf(tVect, "%u", vecLen); - } - - switch (dtype) { - case TYPE_FLOAT: - tName = "float"; - ptrName = "f"; - break; - case TYPE_DOUBLE: - tName = "double"; - ptrName = "d"; - break; - case TYPE_COMPLEX_FLOAT: - tName = "float2"; - ptrName = "f2v"; - break; - case TYPE_COMPLEX_DOUBLE: - tName = "double2"; - ptrName = "d2v"; - break; - default: - return; - } - - if (vecCoords) { - //Do not use GPtrs in fetching - vnames->A = "A"; - vnames->B = "B"; - } - else { - vnames->A = localA ? "LAptr" : "((GPtr)A)"; - vnames->B = localB ? "LBptr" : "((GPtr)B)"; - } - if (!localA) { - vnames->lda = "lda"; - - } - if (!localB) { - vnames->ldb = "ldb"; - } - vnames->sizeM = "M"; - vnames->sizeN = "N"; - vnames->sizeK = "K"; - vnames->skewA = "skewA"; - vnames->skewB = "skewB"; - vnames->skewK = "skewK"; - vnames->coordA = "workItemM"; - vnames->coordB = "workItemN"; - vnames->k = "k"; - - kgenAddBlankLine(ctx); - sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n", - ITEM_WORK_M, ITEM_WORK_N); - kgenAddStmt(ctx, s); - kgenAddStmt(ctx, "__kernel void\n"); - sprintf(s, "%s(\n", kernelName); - kgenAddStmt(ctx, s); - sprintf(s," %s alpha,\n", tName); - kgenAddStmt(ctx, s); - sprintf(s," __global %s%s *A,\n", tName, tVect); - kgenAddStmt(ctx, s); - sprintf(s," __global %s%s *B,\n", tName, tVect); - kgenAddStmt(ctx, s); - kgenAddStmt(ctx, " uint M,\n" - " uint N,\n" - " uint K,\n"); - sprintf(s, - " __global %s *C,\n" - " const uint iter)\n", tName); - kgenAddStmt(ctx, s); - kgenBeginFuncBody(ctx); - sprintf(s, "uint workItemM = %lu * get_global_id(0);\n" - "uint workItemN = %lu * get_global_id(1);\n", - m, n); - kgenAddStmt(ctx, s); - if ((flags & TILEMUL_SKEW_A) != 0) { - kgenAddStmt(ctx, "uint skewA = 0u;\n"); - } - if ((flags & TILEMUL_SKEW_B) != 0) { - kgenAddStmt(ctx, "uint skewB = 0u;\n"); - } - if ((flags & TILEMUL_SKEW_K) != 0) { - kgenAddStmt(ctx, "uint skewK = 0u;\n"); - } - - if (localA) { - sprintf(s, "__local %s LA[%lu];\n", - tName, subdims[0].bwidth * subdims[0].y); - kgenAddStmt(ctx, s); - } - else { //global A - sprintf(s, "uint lda = %s;\n", tra ? "M" : "K"); - kgenAddStmt(ctx, s); - } - if (localB) { - sprintf(s, "__local %s LB[%lu];\n", - tName, subdims[0].bwidth * subdims[0].x); - kgenAddStmt(ctx, s); - } - else { //global B - sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N"); - kgenAddStmt(ctx, s); - } - - initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY); - declareTileStorages(ctx, gset); - - if (vecCoords) { - size_t ha, hb; - char *str; - - ha = tra ? k : m; - hb = trb ? n : k; - - if (ha > 1) { - str = s; - str += sprintf(str, "uint%lu ca = {0", ha); - for (i = 1; i < ha; i++) { - str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen); - } - str += sprintf(str, "};\n"); - kgenAddStmt(ctx, s); - } - else { - kgenAddStmt(ctx, "uint ca = 0;\n"); - } - vnames->vectCoordA = "ca"; - - if (hb > 1) { - str = s; - str += sprintf(str, "uint%lu cb = {0", hb); - for (i = 1; i < hb; i++) { - str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen); - } - str += sprintf(str, "};\n"); - kgenAddStmt(ctx, s); - } - else { - kgenAddStmt(ctx, "uint cb = 0;\n"); - } - vnames->vectCoordB = "cb"; - -// uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3}; -// uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3}; - } - - kgenAddBlankLine(ctx); - - sprintf(s, "for (int it = 0; it < iter; it++)"); - kgenBeginBranch(ctx, s); - - if (!(localA && localB)) { - kgenAddStmt(ctx, "uint k = 0;\n"); - } - - genZeroTile(ctx, &gset->tileCY); - - if (vecCoords) { - char *coordsA[2] = {"workItemM", "k"}; - char *coordsB[2] = {"k", "workItemN"}; - sprintf(s, "A += %s * (lda / %u) + %s / %u;\n", - coordsA[tra], vecLen, coordsA[1 - tra], vecLen); - kgenAddStmt(ctx, s); - sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n", - coordsB[trb], vecLen, coordsB[1 - trb], vecLen); - kgenAddStmt(ctx, s); - } - - sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth); - kgenBeginBranch(ctx, s); - - /* Copy data to local memory. We know that the size of matrix is the same - * that the size of one block and use that. - */ - if (localA) { - sprintf(s, - "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n" - "wait_group_events(1, &evA);\n" - "barrier(CLK_LOCAL_MEM_FENCE);\n", - subdims[0].y * subdims[0].bwidth); - kgenAddStmt(ctx, s); - kgenAddStmt(ctx, "LPtr LAptr;\n"); - if (tra) { - sprintf(s, - "LAptr.%s = LA + workItemM;\n", ptrName); - } - else { - sprintf(s, - "LAptr.%s = LA + workItemM * %lu;\n", - ptrName, subdims[0].bwidth); - } - kgenAddStmt(ctx, s); - } - if (localB) { - sprintf(s, - "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n" - "wait_group_events(1, &evB);\n" - "barrier(CLK_LOCAL_MEM_FENCE);\n", - subdims[0].x * subdims[0].bwidth); - kgenAddStmt(ctx, s); - kgenAddStmt(ctx, "LPtr LBptr;\n"); - if (trb) { - sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n", - ptrName, subdims[0].bwidth); - } - else { - sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName); - } - kgenAddStmt(ctx, s); - } - - if (!separateFetch) { - ret = tileMulGen(ctx, gset, mulOpts); - checkRet(ret, "Multiplier"); - } - else { - Tile *tileA = &gset->tileA; - Tile *tileB = &gset->tileBX; - - memset(&fetchOpts, 0, sizeof(fetchOpts)); - if (localA) { - fetchOpts.memA = CLMEM_LOCAL_MEMORY; - } - if (localB) { - fetchOpts.memB = CLMEM_LOCAL_MEMORY; - } - - genFillTileWithNAN(ctx, tileA); - genFillTileWithNAN(ctx, tileB); - - if (subdims[0].bwidth != subdims[1].bwidth) { - sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)", - subdims[0].bwidth, k); - kgenBeginBranch(ctx, s); - } - -#if JUST_MULTIPLICATION - for (i = 0; i < tileA->nrRows; i++) { - for(j = 0; j < tileA->nrCols; j++) { - sprintfTileElement(&kstr, tileA, i, j, 1); - sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j); - kgenAddStmt(ctx, s); - } - } - - for (i = 0; i < tileB->nrRows; i++) { - for(j = 0; j < tileB->nrCols; j++) { - sprintfTileElement(&kstr, tileB, i, j, 1); - sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j); - kgenAddStmt(ctx, s); - } - } -#else - fetchOpts.mrole = MATRIX_B; - fetchOpts.lineOffset = 0; - fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows; - ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts); - checkRet(ret, "Fetching tile b"); - - fetchOpts.mrole = MATRIX_A; - fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows; - kgenAddBlankLine(ctx); - fetchOpts.lineOffset = 0; - ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts); - checkRet(ret, "Fetching tile a"); -#endif - ret = genMulTiles(ctx, gset, mulOpts); - checkRet(ret, "Multiplier"); -#if ! JUST_MULTIPLICATION - sprintf(s, "k += %lu;\n", k); - kgenAddStmt(ctx, s); -#endif - if (subdims[0].bwidth != subdims[1].bwidth) { - kgenEndBranch(ctx, NULL); - } - } - kgenEndBranch(ctx, NULL); // K loop - kgenEndBranch(ctx, NULL); // iterations loop - - kgenAddBlankLine(ctx); - - for (i = 0; i < m; i++) { - for (j = 0; j < n; j++) { - sprintfTileElement(&kstr, &gset->tileCY, i, j, 1); - sprintf(s, - "((GPtr)C).%s" - "[(%d + workItemM) * N + %d + workItemN] = %s;\n", - ptrName, i, j, kstr.buf); - kgenAddStmt(ctx, s); - } - } - - kgenEndFuncBody(ctx); -} - -cl_int -run ( - const char *ker, - cl_uint M, - cl_uint N, - cl_uint K, - FType alpha, - BlasGenSettings *gset, - TileMulFlags flags, - cl_device_type deviceType, - bool verbose, - unsigned int iterNum) -{ - cl_int err; - cl_platform_id platform; - cl_context ctx; - cl_device_id device; - cl_command_queue queue; - cl_event evt; - DataType dtype = gset->kextra->dtype; - - cl_mem bufA, bufB, bufC; - FPtr A, B, C, C_naive; - bool isComplex = isComplexType(dtype); - bool isDouble = isDoubleBasedType(dtype); - cl_uint nwords = (isComplex) ? 2 : 1; - unsigned int tsize = dtypeSize(dtype); - cl_kernel kernel; - size_t i, j, k; - size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N}; - size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N}; - char log[100000]; - size_t logSize; - cl_long sTime, fTime; - cl_program program = NULL; - - clGetPlatformIDs(1, &platform, NULL); - - clGetDeviceIDs(platform, deviceType, 1, &device, NULL); - - ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err); - if (err != CL_SUCCESS) { - return err; - } - - queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err); - if (err != CL_SUCCESS) { - return err; - } - - /* Prepare OpenCL kernel and its arguments */ - - program = clCreateProgramWithSource(ctx, 1, &ker, NULL, NULL); - - err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); - clGetProgramBuildInfo (program, - device, - CL_PROGRAM_BUILD_LOG, - sizeof(log), - log, - &logSize); - printf("%s", log); - if (err != CL_SUCCESS){ - clReleaseProgram(program); - return err; - } - - kernel = clCreateKernel(program, kernelName, &err); - if (err != CL_SUCCESS){ - clReleaseProgram(program); - return err; - } - /* Memory allocation */ - - A.v = malloc(M * K * tsize); - B.v = malloc(K * N * tsize); - C.v = malloc(M * N * tsize); - C_naive.v = malloc(M * N * tsize); - -#if JUST_MULTIPLICATION - srand(0); - if (isDouble) { - for(i = 0; i < M * K * nwords; i++){ - A.d[i] = i; - } - for(i = 0; i < N * K * nwords; i++){ - B.d[i] = i + 7; - } - for(i = 0; i < M * N * nwords; i++){ - C.d[i] = 0.0; - C_naive.d[i] = 0.0; - } - } - else { - for(i = 0; i < M * K * nwords; i++){ - A.f[i] = i; - } - for(i = 0; i < N * K * nwords; i++){ - B.f[i] = i + 7; - } - for(i = 0; i < M * N * nwords; i++){ - C.f[i] = 0.0; - C_naive.f[i] = 0.0; - } - } - -#else - srand(0); - if (isDouble) { - for(i = 0; i < M * K * nwords; i++){ - A.d[i] = (double)(rand() % RAND_BOUND); - } - for(i = 0; i < N * K * nwords; i++){ - B.d[i] = (double)(rand() % RAND_BOUND); - } - for(i = 0; i < M * N * nwords; i++){ - C.d[i] = 0.0; - C_naive.d[i] = 0.0; - } - } - else { - for(i = 0; i < M * K * nwords; i++){ - A.f[i] = (float)(rand() % RAND_BOUND); - } - for(i = 0; i < N * K * nwords; i++){ - B.f[i] = (float)(rand() % RAND_BOUND); - } - for(i = 0; i < M * N * nwords; i++){ - C.f[i] = 0.0; - C_naive.f[i] = 0.0; - } - } -#endif - - bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, - K * M * tsize, A.v, &err); - if (err != CL_SUCCESS) { - clReleaseKernel(kernel); - return err; - } - - bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, - K * N * tsize, B.v, &err); - - if (err != CL_SUCCESS) { - clReleaseMemObject(bufA); - clReleaseKernel(kernel); - return err; - } - - bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, - M * N * tsize, C.v, &err); - - if (err != CL_SUCCESS) { - clReleaseMemObject(bufB); - clReleaseMemObject(bufA); - clReleaseKernel(kernel); - return err; - } - - /* Argument setting and kernel execution */ - err = clSetKernelArg(kernel, 0, tsize, alpha.u); - err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA); - err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB); - err |= clSetKernelArg(kernel, 3, sizeof(M), &M); - err |= clSetKernelArg(kernel, 4, sizeof(N), &N); - err |= clSetKernelArg(kernel, 5, sizeof(K), &K); - err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC); - err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum); - - if (err != CL_SUCCESS) { - clReleaseMemObject(bufC); - clReleaseMemObject(bufB); - clReleaseMemObject(bufA); - clReleaseKernel(kernel); - return err; - } - - err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, - globalWorkSize, localWorkSize, 0, - NULL, &evt); - - if (err != CL_SUCCESS) { - clReleaseMemObject(bufC); - clReleaseMemObject(bufB); - clReleaseMemObject(bufA); - clReleaseKernel(kernel); - return err; - } - - err = clFinish(queue); - err = clEnqueueReadBuffer (queue, - bufC, - CL_TRUE, - 0, - M * N * tsize, - C.v, - 0, - NULL, - NULL); - - /* Naive CPU multiplication */ - if (isDouble) { - for (i = 0; i < M; i++) { - for (j = 0; j < N; j++) { - if (isComplex) { - cl_double2 val; - for (k = 0; k < K; k++) { - cl_double2 bkj = flags & TILEMUL_TRB ? - B.d2[j * K + k] : B.d2[k * N + j]; - cl_double2 aik = flags & TILEMUL_TRA ? - A.d2[k * M + i] : A.d2[i * K + k]; - val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1]; - val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0]; - C_naive.d2[i * N + j].s[0] += val.s[0]; - C_naive.d2[i * N + j].s[1] += val.s[1]; - } - val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] - - C_naive.d2[i * N + j].s[1] * alpha.d2.s[1]; - val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] + - C_naive.d2[i * N + j].s[1] * alpha.d2.s[0]; - C_naive.d2[i * N + j] = val; - } - else { - for (k = 0; k < K; k++) { - double bkj = flags & TILEMUL_TRB ? - B.d[j * K + k] : B.d[k * N + j]; - double aik = flags & TILEMUL_TRA ? - A.d[k * M + i] : A.d[i * K + k]; - C_naive.d[i * N + j] += aik * bkj; - } - C_naive.d[i * N + j] *= alpha.d; - } - } - } - - for (i = 0; i < M * N; i++) { - if (C.d[i] != C_naive.d[i]) { - printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N, - C.d[i], C_naive.d[i]); - break; - } - } - if (i == M * N) { - printf("Match\n"); - } - } - else { - for (i = 0; i < M; i++) { - for (j = 0; j < N; j++) { - if (isComplex) { - cl_float2 val; - for (k = 0; k < K; k++) { - cl_float2 bkj = flags & TILEMUL_TRB ? - B.f2[j * K + k] : B.f2[k * N + j]; - cl_float2 aik = flags & TILEMUL_TRA ? - A.f2[k * M + i] : A.f2[i * K + k]; - val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1]; - val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0]; - C_naive.f2[i * N + j].s[0] += val.s[0]; - C_naive.f2[i * N + j].s[1] += val.s[1]; - } - val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] - - C_naive.f2[i * N + j].s[1] * alpha.f2.s[1]; - val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] + - C_naive.f2[i * N + j].s[1] * alpha.f2.s[0]; - C_naive.f2[i * N + j] = val; - } - else { - for (k = 0; k < K; k++) { - float bkj = flags & TILEMUL_TRB ? - B.f[j * K + k] : B.f[k * N + j]; - float aik = flags & TILEMUL_TRA ? - A.f[k * M + i] : A.f[i * K + k]; - C_naive.f[i * N + j] += aik * bkj; - } - C_naive.f[i * N + j] *= alpha.f; - } - } - } - - for (i = 0; i < M * N; i++) { - if (C.f[i] != C_naive.f[i]) { - printf("Differ at (%lu, %lu): %lf != %lf\n", - i / N, i % N, C.f[i], C_naive.f[i]); - break; - } - } - if (i == M * N) { - printf("Match\n"); - } - } - - /* End of naive CPU multiplication */ - if (verbose) { - if (!isDouble) { - printf("Matrix A:\n"); - for (i = 0; i < M; i++) { - for (k = 0; k < K; k++) { - if (isComplex) { - cl_float2 aik = flags & TILEMUL_TRA ? - A.f2[k * M + i] : A.f2[i * K + k]; - printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]); - } - else { - float aik = flags & TILEMUL_TRA ? - A.f[k * M + i] : A.f[i * K + k]; - printf("%4.1f ", aik); - } - } - printf("\n"); - } - - printf("Matrix B:\n"); - for (k = 0; k < K; k++) { - for (j = 0; j < N; j++) { - if (isComplex) { - cl_float2 bkj = flags & TILEMUL_TRB ? - B.f2[j * K + k] : B.f2[k * N + j]; - printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]); - } - else { - float bkj = flags & TILEMUL_TRB ? - B.f[j * K + k] : B.f[k * N + j]; - printf("%4.1f ", bkj); - } - } - printf("\n"); - } - - printf("CPU calculated matrix:\n"); - for (i = 0; i < M; i++) { - for (j = 0; j < N; j++) { - if (isComplex) { - printf("(%4.1f, %4.1f) ", - C_naive.f2[i * N + j].s[0], - C_naive.f2[i * N + j].s[1]); - } - else { - printf("%4.1f ", C_naive.f[i * N + j]); - } - } - printf("\n"); - } - - printf("GPU calculated matrix:\n"); - for (i = 0; i < M; i++) { - for (j = 0; j < N; j++) { - if (isComplex) { - printf("(%4.1f, %4.1f) ", - C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]); - } - else { - printf("%4.1f ", C.f[i * N + j]); - } - } - printf("\n"); - } - } - } - - clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), - &sTime, NULL); - clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), - &fTime, NULL); - - printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n", - (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iterNum)); - - clReleaseMemObject(bufC); - clReleaseMemObject(bufB); - clReleaseMemObject(bufA); - clReleaseKernel(kernel); - return CL_SUCCESS; -} - -int main(int argc, char *argv[]) -{ - char out[1024*1024]; - CLBLASKernExtra kextra; - BlasGenSettings gset; - TileMulOpts mulOpts; - int i; - cl_uint blockM = 4, blockN = 4, blockK = 8; - struct KgenContext *ctx = createKgenContext(out, sizeof(out), 1); - FType alpha; - cl_int err; - unsigned int iterNum = 1; - const char* const shortOptions = "hd:f:l:t:a:b:s:g:i:c:ov"; - const struct option longOptions[] = { - {"help", no_argument, NULL, 'h'}, - {"device", required_argument, NULL, 'd'}, - {"fetch", required_argument, NULL, 'f'}, - {"local", required_argument, NULL, 'l'}, - {"type", required_argument, NULL, 't'}, - {"a", required_argument, NULL, 'a'}, - {"b", required_argument, NULL, 'b'}, - {"skew", required_argument, NULL, 's'}, - {"globalcycling", required_argument, NULL, 'g'}, - {"iter", required_argument, NULL, 'i'}, - {"core", required_argument, NULL, 'c'}, - {"old", no_argument, NULL, 'o'}, - {"verbose", no_argument, NULL, 'v'}, - {NULL, 0, NULL, 0} - }; - int nextOption; - cl_device_type deviceType = CL_DEVICE_TYPE_GPU; - bool verbose = false; - SubproblemDim *subdims = gset.subdims; - bool separateFetch = false; - - memset(&gset, 0, sizeof(gset)); - memset(&mulOpts, 0, sizeof(mulOpts)); - memset(&kextra, 0, sizeof(kextra)); - gset.kextra = &kextra; - gset.flags |= BGF_WHOLE_A; - mulOpts.core = TILEMUL_MAD; - mulOpts.flags = TILEMUL_FORCE_VECTORIZATION; - kextra.vecLen = 1; - kextra.dtype = TYPE_FLOAT; - - alpha.f = 1; - - // parse command line - do { - nextOption = getopt_long(argc, argv, shortOptions, longOptions, NULL); - switch (nextOption) { - case 'h': - printUsage(argv[0], EXIT_SUCCESS); - break; - case 'd': - if (!strcmp("cpu", optarg)) { - deviceType = CL_DEVICE_TYPE_CPU; - } - else if (!strcmp("gpu", optarg)) { - deviceType = CL_DEVICE_TYPE_GPU; - } - else { - printf("Unknown device type %s. Supported values are \"cpu\" " - "and \"gpu\".\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'f': - kextra.vecLen = atoi(optarg); - break; - case 'l': - if (!strcmp(optarg, "A")) { - mulOpts.memA = CLMEM_LOCAL_MEMORY; - } - else if (!strcmp(optarg, "B")) { - mulOpts.memB = CLMEM_LOCAL_MEMORY; - } - else { - printf("Wrong matrix specified: %s. Supported values are " - "A, B.\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 't': - if (!strcmp(optarg, "s")) { - kextra.dtype = TYPE_FLOAT; - alpha.f = 1; - } - else if (!strcmp(optarg, "d")) { - kextra.dtype = TYPE_DOUBLE; - alpha.d = 1; - } - else if (!strcmp(optarg, "c")) { - kextra.dtype = TYPE_COMPLEX_FLOAT; - alpha.f2.s[0] = 1; - alpha.f2.s[1] = 0; - } - else if (!strcmp(optarg, "z")) { - kextra.dtype = TYPE_COMPLEX_DOUBLE; - alpha.d2.s[0] = 1; - alpha.d2.s[1] = 0; - } - else { - printf("Wrong type specified: %s. Supported values are " - "s, d, c, z.\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'a': - if (!strcmp(optarg, "r")) { - mulOpts.flags &= ~TILEMUL_TRA; - } - else if (!strcmp(optarg, "c")) { - mulOpts.flags |= TILEMUL_TRA; - } - else { - printf("Wrong tile a parameter specified: %s. Supported values " - "are \"r\", \"c\".\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'b': - if (!strcmp(optarg, "r")) { - mulOpts.flags &= ~TILEMUL_TRB; - } - else if (!strcmp(optarg, "c")) { - mulOpts.flags |= TILEMUL_TRB; - } - else { - printf("Wrong tile b order specified: %s. Supported values " - "are \"r\", \"c\".\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 's': - if (!strcmp(optarg, "a")) { - mulOpts.flags |= TILEMUL_SKEW_A; - } - else if (!strcmp(optarg, "b")) { - mulOpts.flags |= TILEMUL_SKEW_B; - } - else if (!strcmp(optarg, "k")) { - mulOpts.flags |= TILEMUL_SKEW_K; - } - else { - printf("Wrong skew parameter specified: %s. Supported values " - "are \"a\", \"b\", \"k\"\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'g': - if (!strcmp(optarg, "a")) { - mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; - } - else if (!strcmp(optarg, "b")) { - mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B; - } - else if (!strcmp(optarg, "k")) { - mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K; - } - else { - printf("Wrong global cycling parameter specified: %s. " - "Supported values are \"a\", \"b\", \"k\"\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'i': - iterNum = atoi(optarg); - break; - case 'c': - if (!strcmp("muladd", optarg)) { - mulOpts.core = TILEMUL_MULADD; - } - else if (!strcmp("mad", optarg)) { - mulOpts.core = TILEMUL_MAD; - } - else if (!strcmp("dot", optarg)) { - mulOpts.core = TILEMUL_DOT; - } - else { - printf("Unknown multiplier core %s. Supported values" - " are \"muladd\", \"mad\" and \"dot\".\n", optarg); - exit(EXIT_FAILURE); - } - break; - case 'o': - separateFetch = false; - break; - case 'v': - verbose = true; - break; - case -1: - break; - default: - printUsage(argv[0], EXIT_FAILURE); - break; - } - } while (nextOption != -1); - - if (optind + 2 >= argc) { - printf("Error: Not all sizes are specified\n"); - printUsage(argv[0], EXIT_FAILURE); - } - blockM = atoi(argv[optind]); - blockN = atoi(argv[optind + 1]); - blockK = atoi(argv[optind + 2]); - - if ((mulOpts.memA == CLMEM_LOCAL_MEMORY || - mulOpts.memB == CLMEM_LOCAL_MEMORY) && - ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) { - printf("One of matrixes is in local memory, " - "disabling global cycling\n"); - mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC; - } - - if (mulOpts.flags & TILEMUL_TRA) { - kextra.flags |= KEXTRA_TRANS_A; - } - if (mulOpts.flags & TILEMUL_TRB) { - kextra.flags |= KEXTRA_TRANS_B; - } - - subdims[0].y = blockM * ITEM_WORK_M; - subdims[0].x = blockN * ITEM_WORK_N; - subdims[0].bwidth = blockK * ITEM_BLOCKS_K; - subdims[1].y = blockM; - subdims[1].x = blockN; - subdims[1].bwidth = blockK; - - memset(out, 0, sizeof(out)); - - i = isDoubleBasedType(kextra.dtype); - kgenDeclareUptrs(ctx, i); - genTest(ctx, &gset, &mulOpts, separateFetch); - destroyKgenContext(ctx); - - printf("Kernel code: \n\"%s\"\n", out); - err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha, - &gset, mulOpts.flags, deviceType, verbose, iterNum); - if (err != CL_SUCCESS) { - printf("Test run failed, error %d\n", err); - return EXIT_FAILURE; - } - return EXIT_SUCCESS; -} |