summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/generic/solution_seq_make.c
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/generic/solution_seq_make.c')
-rw-r--r--external/clBLAS/src/library/blas/generic/solution_seq_make.c2367
1 files changed, 0 insertions, 2367 deletions
diff --git a/external/clBLAS/src/library/blas/generic/solution_seq_make.c b/external/clBLAS/src/library/blas/generic/solution_seq_make.c
deleted file mode 100644
index 8a5e402d..00000000
--- a/external/clBLAS/src/library/blas/generic/solution_seq_make.c
+++ /dev/null
@@ -1,2367 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-#include <stdio.h>
-#include <assert.h>
-#include <sys/types.h>
-#include <math.h>
-#include <stdlib.h>
-
-#include <clblas_stddef.h>
-#include <clblas-internal.h>
-#include <toolslib.h>
-#include <events.h>
-
-#include "matrix_dims.h"
-#include "solution_assert.h"
-#include "solution_seq.h"
-
-#define DECOMPOSITION_THRESHOLD(type) (2560 * sizeof(cl_float) / dtypeSize(type))
-
-/* From solution_seq.c */
-bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole);
-void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step);
-
-#define isMatrixCached(pattern, mrole) \
- checkMatrixMemLevelSet(pattern, mrole, (CLMEM_LEVEL_L2 | CLMEM_LEVEL_L1))
-
-#define isLdsUsed(pattern) \
- (checkMatrixMemLevelSet(pattern, MATRIX_A, CLMEM_LEVEL_LDS) || \
- checkMatrixMemLevelSet(pattern, MATRIX_B, CLMEM_LEVEL_LDS))
-
-enum {
- DEFAULT_BUFS_LSIZE_0 = 8,
- DEFAULT_BUFS_LSIZE_1 = 8,
- DEFAULT_CACHED_BUFS_LSIZE_0 = 8,
- DEFAULT_CACHED_BUFS_LSIZE_1 = 8
-};
-
-static cl_uint getQueueMaxImages(cl_command_queue queue);
-
-static bool checkMatrixMemLevelSet(MemoryPattern *pattern, MatrixRole mrole,
- meml_set_t mask);
-
-static void stripeDivision(BlasFunctionID funcID, const CLBlasKargs *args,
- ListHead *seq, cl_uint totalCUs);
-static void rectDivision(BlasFunctionID funcID, const CLBlasKargs *args,
- ListHead *seq, cl_uint totalCUs);
-static void triMatrixStripeDivision(BlasFunctionID funcID,
- const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs);
-
-static cl_bool findBestPattern(SolutionStep *step);
-
-static void getDefaultStepGranulation(SolutionStep *step);
-static bool avoidLoadFromStorage(SolutionStep *step);
-
-static bool getStepResources(SolutionStep *step);
-static void getSuitableImageSizes(size_t *minWidth, size_t *minHeight,
- size_t *bestHeight, MatrixRole mrole, CLBlasKargs *kargs, unsigned int vecLen,
- SubproblemDim *subdims);
-
-static ListNode* decomposeTRXMStep(SolutionStep *step);
-static ListNode* decomposeSYRKStep(SolutionStep *step);
-static ListNode* decomposeSYR2KStep(SolutionStep *step);
-
-// Find vector length which lda and tile width is divisible on
-unsigned int
-appropriateVecLen(size_t ld, unsigned int tsize, size_t twidth, int funcLevel)
-{
- unsigned int vlen = sizeof(cl_float4) / tsize;
-
- if (funcLevel == 3) {
- vlen *= 2;
- }
- while (vlen > twidth) {
- vlen /= 2;
- }
-
- while ((ld % vlen) || (twidth % vlen)) {
- vlen /= 2;
- }
-
- return vlen;
-}
-
-/*
- * Select an appropriate vectorization to perform computation with.
- * It's done based upon the problem sizes and device type. The device type
- * is taken into account as well since not all devices allow not aligned
- * access to vector data.
- */
-
-cl_int
-selectVectorization(
- const SolutionStep *step,
- CLBLASKernExtra *kextra)
-{
- const TargetDevice *device = &step->device;
- cl_device_type devType;
- cl_int err;
- size_t tw;
- bool tra;
- size_t checkedSizes[3];
- int i, j;
- const CLBlasKargs *kargs = &step->args;
- KernelExtraFlags kflags = kextra->flags;
- KernelExtraFlags vecFlags[3] = { KEXTRA_NO_COPY_VEC_A, KEXTRA_NO_COPY_VEC_B,
- KEXTRA_NO_COPY_VEC_C };
- unsigned int vlen;
- unsigned int tsize;
- MemoryPattern *mempat;
- const SubproblemDim *dim = &step->subdims[1];
- int funcLevel;
-
- mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID];
- err = clGetDeviceInfo(device->id, CL_DEVICE_TYPE, sizeof(devType),
- &devType, NULL);
- if (err != CL_SUCCESS) {
- return err;
- }
-
- if (isLdsUsed(mempat)) {
- kextra->vecLenC = kextra->vecLen = sizeof(cl_float4) /
- dtypeSize(step->args.dtype);
- kextra->vecLenA = kextra->vecLenB = kextra->vecLen;
- }
- else {
- kextra->vecLenA = kextra->vecLenB = 0;
- }
-
- // select vectorization based upon leading dimensions and starting offsets
- for (i = 0; i < 2; i++) {
- if (!i) {
- // check by leading dimensions
- checkedSizes[0] = kargs->lda.matrix;
- if (funcBlasLevel(step->funcID) == 2) {
- checkedSizes[1] = checkedSizes[2] = 0;
- }
- else {
- checkedSizes[1] = kargs->ldb.matrix;
- checkedSizes[2] = kargs->ldc.matrix;
- }
- }
- else {
- // check by offsets
- checkedSizes[0] = kargs->offA;
- checkedSizes[1] = kargs->offBX;
- checkedSizes[2] = kargs->offCY;
- }
-
- if (funcHasTriangMatrix(step->funcID)) {
- checkedSizes[2] = checkedSizes[1];
- }
-
- vlen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
-
- /*
- * Disable vectorization at load from the global memory to LDS
- * if matrix width is not aligned on the boundary of the float4
- */
- for (j = 0; j < 3; j++) {
- if (checkedSizes[j] % vlen) {
- kflags |= vecFlags[j];
- }
- }
-
- if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV))
- {
- if ( ( ((kflags & KEXTRA_UPPER_TRIANG)==0) && (kflags & KEXTRA_COLUMN_MAJOR) ) ||
- ( ((kflags & KEXTRA_UPPER_TRIANG)) && ((kflags & KEXTRA_COLUMN_MAJOR) == 0)) )
-
- {
- if( (kargs->N) % vlen)
- {
- kflags |= KEXTRA_NO_COPY_VEC_A;
- }
- }
- }
-
- if(mempat->sops->selectVectorization != NULL)
- {
- kflags |= mempat->sops->selectVectorization((void *)kargs, vlen);
- }
-
- if ((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV))
- {
- //
- // TRTRI, GEMV Part - Only Scalar loads
- // PENDING:
- // Analyze Case by Case and selectively enable/disable
- //
- kflags |= KEXTRA_NO_COPY_VEC_A;
- kflags |= KEXTRA_NO_COPY_VEC_B;
- }
-
- //
- // Routines that Use LDS should be above this IF statement
- //
- if (isLdsUsed(mempat)) {
- continue;
- }
-
- //
- // Routines that dont use LDS have to be below the isLdsUsed() code
- //
- if (step->funcID == CLBLAS_GEMM2)
- {
- if ((step->subdims[0].y > step->args.M) || (step->subdims[0].x > step->args.N))
- {
- kextra->vecLen = 1;
- } else {
- kextra->vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
- }
- kextra->vecLenA = kextra->vecLen;
- kextra->vecLenB = kextra->vecLen;
- kextra->vecLenC = kextra->vecLen;
- continue;
- }
-
- if (step->funcID == CLBLAS_GEMM_TAIL)
- {
- kextra->vecLen = 1;
- kextra->vecLenA = 1;
- kextra->vecLenB = 1;
- kextra->vecLenC = 1;
- continue;
- }
- funcLevel = funcBlasLevel(step->funcID);
- funcLevel = funcBlasLevel(step->funcID);
-
- /*
- * If the step's pattern uses LDS, it is responsible for alignment.
- * Otherwise it's needed to provide appropriate vector length
- */
- tsize = dtypeSize(step->args.dtype);
- tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_A);
- tw = (tra) ? dim->y : dim->bwidth;
- vlen = appropriateVecLen(checkedSizes[0], tsize, tw, funcLevel);
- kextra->vecLenA = (kextra->vecLenA) ? umin(kextra->vecLenA, vlen) :
- vlen;
-
- tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_B);
- tw = ((funcLevel == 2) || !tra) ? dim->bwidth : dim->x;
- vlen = appropriateVecLen(checkedSizes[1], tsize, tw, funcLevel);
- kextra->vecLenB = (kextra->vecLenB) ? umin(kextra->vecLenB, vlen) :
- vlen;
-
- tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_C );
- tw = ((funcLevel == 2) || tra) ? dim->y : dim->x;
- vlen = appropriateVecLen( checkedSizes[2],
- tsize,
- tw,
- funcLevel );
- kextra->vecLenC = kextra->vecLenC ? umin(vlen,kextra->vecLenC) :
- vlen;
-
- kextra->vecLen = umin(kextra->vecLenA, kextra->vecLenB);
- kextra->vecLen = umin(kextra->vecLenC, kextra->vecLen);
- }
-
- kextra->flags = kflags;
-
- return CL_SUCCESS;
-}
-
-/*
- * Replace 'offsetM' and 'offsetN' field with respective extra offset at
- * 'offA', 'offBX', 'offCY' and taking into accoutn offset along K
- */
-void VISIBILITY_HIDDEN
-fixupGemmOffsets(CLBlasKargs *kargs, KernelExtraFlags kflags, size_t offsetK)
-{
- if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A)) {
- kargs->offA += offsetK * kargs->lda.matrix + kargs->offsetM;
- }
- else {
- kargs->offA += kargs->offsetM * kargs->lda.matrix + offsetK;
- }
- if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) {
- kargs->offBX += offsetK * kargs->ldb.matrix + kargs->offsetN;
- }
- else {
- kargs->offBX += kargs->offsetN * kargs->ldb.matrix + offsetK;
- }
- if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_C)) {
- kargs->offCY += kargs->offsetN * kargs->ldc.matrix + kargs->offsetM;
- }
- else {
- kargs->offCY += kargs->offsetM * kargs->ldc.matrix + kargs->offsetN;
- }
- kargs->offsetM = kargs->offsetN = 0;
-}
-
-ListNode
-*decomposeProblemStep(SolutionStep *step)
-{
- ListNode *node;
-
- switch (step->funcID) {
- case CLBLAS_TRMM:
- case CLBLAS_TRSM:
- node = decomposeTRXMStep(step);
- break;
- case CLBLAS_SYRK:
- node = decomposeSYRKStep(step);
- break;
- case CLBLAS_SYR2K:
- node = decomposeSYR2KStep(step);
- break;
- default:
- node = &step->node;
- break;
- }
-
- return node;
-}
-
-cl_int
-makeSolutionSeq(
- BlasFunctionID funcID,
- const CLBlasKargs *args,
- cl_uint numCommandQueues,
- cl_command_queue *commandQueues,
- cl_uint numEventsInWaitList,
- const cl_event *eventWaitList,
- cl_event *events,
- ListHead *seq)
-{
- cl_int err;
- cl_uint j, totalCUs, numDevicesWithoutDoubles;
- bool hasDouble;
- SolutionStep *step;
- CLBLASKernExtra extra;
- ListNode *i;
- MemoryPattern *pattern;
- solver_id_t sid;
- KernelKey key;
- bool need[MAX_CLBLAS_KERNELS_PER_STEP] = {true};
- CLBlasKernelType ktype;
- Kernel *kernel;
- bool loadData = false;
- unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP];
- size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP];
- char bopts[BUILD_OPTS_MAXLEN]; // Moving bopts up. See the comments before findKernel()
- int ik;
- // first subdimension index in the subproblem dims array
- int firstDimIdx;
-
- if ((numCommandQueues == 0) || (commandQueues == NULL)) {
- return CL_INVALID_VALUE;
- }
-
- memset(buffer, 0, sizeof(buffer));
- listInitHead(seq);
-
- totalCUs = 0;
- numDevicesWithoutDoubles = 0;
- for (j = 0; j < numCommandQueues; j++) {
- cl_device_id devID;
-
- err = getQueueDevice(commandQueues[j], &devID);
- if (err != CL_SUCCESS) {
- continue;
- }
- if (isDoubleBasedType(args->dtype)) {
- hasDouble = deviceHasNativeDouble(devID, &err);
- if (err != CL_SUCCESS) {
- continue;
- }
- if (!hasDouble) {
- numDevicesWithoutDoubles++;
- continue;
- }
- }
-
- step = calloc(1, sizeof(SolutionStep));
- if (step == NULL) {
- freeSolutionSeq(seq);
- return CL_OUT_OF_HOST_MEMORY;
- }
-
- step->funcID = funcID;
- step->args = *args;
- step->args.addrBits = deviceAddressBits(devID, &err);
- step->cmdQueue = commandQueues[j];
- step->numEventsInWaitList = numEventsInWaitList;
- step->eventWaitList = eventWaitList;
- step->event = NULL;
- if (events != NULL) {
- step->event = events + j;
- }
- step->pgran.wfSize = deviceWavefront(devID, &err);
- step->extraFlags = clblasArgsToKextraFlags(args, step->funcID);
- if (step->funcID == CLBLAS_SYR2K) {
- step->extraFlags |= KEXTRA_SYRK_2K_RANK;
- }
-
- step->device.id = devID;
- err = identifyDevice(&step->device);
- if (err != CL_SUCCESS) {
- freeSolutionSeq(seq);
- return err;
- }
-
- totalCUs += deviceComputeUnits(devID, &err);
- listAddToTail(seq, &step->node);
- }
- if (totalCUs == 0) {
- return (numDevicesWithoutDoubles == numCommandQueues) ?
- CL_INVALID_DEVICE : CL_INVALID_COMMAND_QUEUE;
- }
-
- memset(&extra, 0, sizeof(extra));
- memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char));
- extra.dtype = args->dtype;
-
- /* Split task between multiple command queues */
-
- if (funcID == CLBLAS_GEMM) {
- rectDivision(funcID, args, seq, totalCUs);
- }
- else if ((funcID == CLBLAS_SYRK) || (funcID == CLBLAS_SYR2K)) {
- triMatrixStripeDivision(funcID, args, seq, totalCUs);
- }
- else {
- stripeDivision(funcID, args, seq, totalCUs);
- }
-
- /* Some steps can be decomposed into several sequential substeps */
-
- parseEnvImplementation();
-
- // Function level decomposition
- for (i = listNodeFirst(seq); i != seq; i = i->next) {
- step = container_of(i, node, SolutionStep);
- if (step->cmdQueue == NULL) {
- continue;
- }
-
- if (step->funcID == CLBLAS_GEMM) {
- fixupGemmOffsets(&step->args, step->extraFlags, 0);
- continue;
- }
-
- i = decomposeProblemStep(step);
- }
-
- #ifdef DEBUG_2
- printf("Finding a kernel for each step\n");
- #endif
-
- /* Find a kernel for each step */
-
- for (i = listNodeFirst(seq); (i != seq) && (err == CL_SUCCESS);
- i = i->next) {
-
- DeviceIdent *ident;
-
- step = container_of(i, node, SolutionStep);
- if (step->cmdQueue == NULL) {
- continue;
- }
-
- ident = &step->device.ident;
-
- /*
- * Set vendor dependent flags
- *
- * FIXME: thrown this kludge away when generator interface will
- * support passing ident info
- */
- if (ident->vendor == VENDOR_AMD) {
- step->extraFlags |= (KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD);
- }
-
- if (!findBestPattern(step)) {
- err = CL_OUT_OF_RESOURCES;
- break;
- }
-
- #ifdef DEBUG_2
- printf("Find best pattern finished\n");
- #endif
-
-
- pattern = &(clblasSolvers[step->funcID].memPatterns[step->patternID]);
- firstDimIdx = 2 - pattern->nrLevels;
- sid = makeSolverID(step->funcID, step->patternID);
-
- err = getQueueDevice(step->cmdQueue, &key.device);
- err = getQueueContext(step->cmdQueue, &key.context);
-
- detectProblemTails(step);
-
- extra.flags = step->extraFlags;
- if (pattern->sops->fixupArgs) {
- pattern->sops->fixupArgs(&step->args, &step->subdims[firstDimIdx],
- &extra);
- }
- step->extraFlags = extra.flags;
-
- key.nrDims = pattern->nrLevels;
- memset(key.subdims, 0, sizeof(key.subdims));
- memcpy(key.subdims, &step->subdims[firstDimIdx],
- sizeof(SubproblemDim) * key.nrDims);
-
- detectOffsets(step);
-
- extra.flags = step->extraFlags;
-
- need[CLBLAS_PREP_A_KERNEL] = isMatrixInImage(pattern, MATRIX_A);
- need[CLBLAS_PREP_B_KERNEL] = isMatrixInImage(pattern, MATRIX_B);
-
- /*
- * Now, find and enqueue each kernel. Generate and build the kernel
- * on the fly if this kernel is not presented neither in the cache
- * no in the storage
- */
- for (ktype = CLBLAS_COMPUTING_KERNEL;
- ktype < MAX_CLBLAS_KERNELS_PER_STEP; ktype++) {
- SubproblemDim prepDims[2];
-
- if (!need[ktype]) {
- continue;
- }
-
- extra.kernType = ktype;
-
- err = selectVectorization(step, &extra);
- if (err != CL_SUCCESS) {
- break;
- }
-
- kernel = NULL;
-
- //
- // Now that the build options is a part of EXTRA structure,
- // it is also a part of the kernelKey
- // Setting of build options need to be done before
- // findKernel()
- //
- memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char));
- setupBuildOpts(bopts, key.device, pattern);
- if (pattern->sops->setBuildOptions)
- {
- pattern->sops->setBuildOptions(bopts, (void*)(step));
- }
- memcpy(extra.buildOptions, bopts, BUILD_OPTS_MAXLEN);
-
- if (areKernelsCacheable()) {
- kernel = findKernel(clblasKernelCache, sid, &key, &extra);
- }
- if (kernel == NULL) {
- if (!loadData && !avoidLoadFromStorage(step)) {
- size_t MNK = (step->args.M + step->args.N + step->args.K) / 3;
- loadData = !getKernelInfo(&step->device, pattern->name,
- extra.dtype, step->extraFlags, (int)MNK, &buffer[0],
- &sizeBuffer[0]);
- }
- if (buffer[ktype] != NULL){
- kernel = loadKernel((const unsigned char**)&buffer[ktype],
- sizeBuffer[ktype], &key, &extra, &err);
- }
- else {
- SubproblemDim *dims;
-
- dims = (ktype == CLBLAS_COMPUTING_KERNEL) ? step->subdims :
- prepDims;
-
- #ifdef DEBUG_2
- printf("Build options used : %s\n", bopts);
- #endif
-
- kernel = makeKernel(key.device, key.context,
- pattern->sops->genKernel,
- &dims[firstDimIdx], &step->pgran,
- &extra, bopts, &err);
- }
-
- if (kernel == NULL) {
- break;
- }
-
- if (areKernelsCacheable()) {
- getKernel(kernel);
- if (addKernelToCache(clblasKernelCache, sid, kernel, &key,
- clblasKernelExtraCmp)) {
- putKernel(clblasKernelCache, kernel);
- }
- }
- } else {
- #ifdef DEBUG_CONTEXT
- printf("KERNEL FOUND IN CACHE\n");
- #endif
- }
- step->kernels[ktype] = kernel;
- }
- }
-
- if (err != CL_SUCCESS) {
- freeSolutionSeq(seq);
- }
-
- // free binary kernels
- for (ik = 0; ik < MAX_CLBLAS_KERNELS_PER_STEP; ++ik) {
- free(buffer[ik]);
- }
- return err;
-}
-
-static cl_uint
-getQueueMaxImages(cl_command_queue queue)
-{
- cl_int err;
- cl_device_id device;
- cl_command_queue_properties props;
- cl_bool imageSupport;
-
- imageSupport = CL_FALSE;
- err = getQueueDevice(queue, &device);
- if (err != CL_SUCCESS) {
- return 0;
- }
- err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(imageSupport),
- &imageSupport, NULL);
- if (!imageSupport) {
- return 0;
- }
-
- props = 0;
- err = getQueueProperties(queue, &props);
- if (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
- return 0;
- }
-
- return 2;
-}
-
-static bool
-isTransBUsed(BlasFunctionID funcID)
-{
- if ((CLBLAS_GEMM == funcID) || (CLBLAS_GEMM2 == funcID) || (CLBLAS_GEMM_TAIL == funcID)) {
- return true;
- }
- else {
- return false;
- }
-}
-
-KernelExtraFlags
-clblasArgsToKextraFlags(const CLBlasKargs *args, BlasFunctionID funcID)
-{
- KernelExtraFlags flags = KEXTRA_NO_FLAGS;
-
- if (args->transA != clblasNoTrans) {
- flags |= KEXTRA_TRANS_A;
- }
-
- if (isTransBUsed(funcID) && args->transB != clblasNoTrans) {
- flags |= KEXTRA_TRANS_B;
- }
-
- if (isComplexType(args->dtype)) {
- if (args->transA == clblasConjTrans) {
- flags |= KEXTRA_CONJUGATE_A;
- }
- if (isTransBUsed(funcID) && args->transB == clblasConjTrans) {
- flags |= KEXTRA_CONJUGATE_B;
- }
- }
-
- if (args->order == clblasColumnMajor) {
- flags |= KEXTRA_COLUMN_MAJOR;
- }
- if ((funcID != CLBLAS_TRMM) && (funcID != CLBLAS_TRSM)) {
- // check if beta is zero
- ArgMultiplier z;
-
- memset(&z, 0, sizeof(z));
- if (!memcmp(&args->beta, &z, sizeof(z))) {
- flags |= KEXTRA_BETA_ZERO;
- }
- }
-
- if (funcID != CLBLAS_GEMM) {
- if (args->uplo == clblasUpper) {
- flags |= KEXTRA_UPPER_TRIANG;
- }
- if (args->side == clblasRight) {
- flags |= KEXTRA_SIDE_RIGHT;
- }
- if (args->diag == clblasUnit) {
- flags |= KEXTRA_UNIT_DIAGONAL;
- }
- }
- if (funcID == CLBLAS_GEMV || funcID == CLBLAS_SYMV) {
- if (args->ldb.vector == 1) {
- flags |= KEXTRA_INCX_ONE;
- }
- if (args->ldc.vector == 1) {
- flags |= KEXTRA_INCY_ONE;
- }
- }
-
- return flags;
-}
-
-static bool
-checkMatrixMemLevelSet(
- MemoryPattern *pattern,
- MatrixRole mrole,
- meml_set_t mask)
-{
- const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra;
- meml_set_t mset;
-
- if (mrole == MATRIX_C || extra == NULL) {
- return false;
- }
-
- switch (mrole) {
- case MATRIX_A:
- mset = extra->aMset;
- break;
- case MATRIX_B:
- mset = extra->bMset;
- break;
- default:
- break;
- }
-
- return ((mset & mask) != 0);
-}
-
-/* Next three functions: stripeDivision(), rectDivision() and
- * triMatrixStripeDivision(), split output matrix into set of non-intersected
- * rectangles. Area of each rectangle depends on the number of Compute Units,
- * available on a device of the given queue.
- * Division is also aligned on the DIVISION_ALIGNMENT boundary. It is measured
- * in number of elements.
- */
-
-/* This constant is used in:
- * - stripeDivision()
- * - rectDivision()
- * - triMatrixStripeDivision()
- * - decomposeTRXMStep()
- */
-static const size_t DIVISION_ALIGNMENT = 128;
-
-static size_t
-align(
- size_t value,
- size_t alignment)
-{
- /* This implementation assumes that alignment is the power of 2. */
- return (value + (alignment >> 1)) & (~(alignment - 1));
-}
-
-/* Stripe division is done according to the picture:
- *
- * +------+--+----+--+
- * | | | | |
- * | | | | |
- * | 1 | 2| 3 | 4|
- * | | | | |
- * | | | | |
- * +------+--+----+--+
- */
-static void
-stripeDivision(
- BlasFunctionID funcID,
- const CLBlasKargs *args,
- ListHead *seq,
- cl_uint totalCUs)
-{
- SolutionStep *step;
- ListNode *i;
- cl_int err;
- cl_device_id device;
- cl_uint nrCU;
- SubproblemDim size, offset, stepSize;
- bool first = true;
-
- kargsToProbDims(&offset, funcID, args, true);
- kargsToProbDims(&size, funcID, args, false);
-
- for (i = listNodeFirst(seq); i != seq; i = i->next) {
- step = container_of(i, node, SolutionStep);
- err = getQueueDevice(step->cmdQueue, &device);
- nrCU = deviceComputeUnits(device, &err);
-
- if (totalCUs == 0) {
- step->cmdQueue = NULL;
- continue;
- }
-
- stepSize = size;
- if (!first) {
- probDimsToKargs(&(step->args), funcID, &offset, true);
- }
-
- if (funcID == CLBLAS_GEMV) {
- if (totalCUs != nrCU) {
- stepSize.y = (size_t)(size.y * (double)nrCU / totalCUs + 0.5);
- stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
- if (stepSize.y == 0) {
- step->cmdQueue = NULL;
- }
- else if (stepSize.y > size.y) {
- stepSize.y = size.y;
- totalCUs = nrCU;
- }
- }
-
- offset.y += stepSize.y;
- size.y -= stepSize.y;
- }
- else {
- if (totalCUs != nrCU) {
- stepSize.x = (size_t)(size.x * (double)nrCU / totalCUs + 0.5);
- stepSize.x = align(stepSize.x, DIVISION_ALIGNMENT);
- if (stepSize.x == 0) {
- step->cmdQueue = NULL;
- }
- else if (stepSize.x > size.x) {
- stepSize.x = size.x;
- totalCUs = nrCU;
- }
- }
- offset.x += stepSize.x;
- size.x -= stepSize.x;
- }
-
- totalCUs -= nrCU;
- probDimsToKargs(&(step->args), funcID, &stepSize, false);
- first = false;
- }
-}
-
-/* Rectangular division is done according to the picture:
- *
- * +------+-----+
- * | | 2 |
- * | | |
- * | 1 +--+--+
- * | |3 | 4|
- * | | | |
- * +------+--+--+
- *
- * The longest side is divided first.
- */
-static void
-rectDivision(
- BlasFunctionID funcID,
- const CLBlasKargs *args,
- ListHead *seq,
- cl_uint totalCUs)
- {
- SolutionStep *step, **sortedSteps;
- ListNode *i, *j;
- cl_int err;
- cl_device_id device;
- cl_uint nrCU, k, l;
- SubproblemDim size, offset, stepSize;
- unsigned int nrSteps = 0;
-
- /* 1. Sort steps according to the number of CU they have */
- /* NOTE: We expect small number of steps, so simple insertion sort
- * would be enough.
- */
-
- sortedSteps = calloc(listLength(seq), sizeof(*sortedSteps));
- // assert(sortedSteps != NULL);
-
- k = 0;
- for (i = listNodeFirst(seq); i != seq; i = i->next, nrSteps++) {
- step = container_of(i, node, SolutionStep);
- err = getQueueDevice(step->cmdQueue, &device);
-
- sortedSteps[k] = step;
- nrCU = deviceComputeUnits(device, &err);
-
- for (j = i->next; j != seq; j = j->next) {
- step = container_of(i, node, SolutionStep);
- err = getQueueDevice(step->cmdQueue, &device);
-
- if (nrCU < deviceComputeUnits(device, &err)) {
- sortedSteps[k] = step;
- nrCU = deviceComputeUnits(device, &err);
- }
- }
-
- k++;
- }
-
- /* 2. Calculate rectangle sizes */
-
- kargsToProbDims(&offset, funcID, args, true);
- kargsToProbDims(&size, funcID, args, false);
- stepSize = size;
-
- for (l = 0; l < k; l++) {
- step = sortedSteps[l];
- err = getQueueDevice(step->cmdQueue, &device);
- nrCU = deviceComputeUnits(device, &err);
-
- if (totalCUs == 0) {
- step->cmdQueue = NULL;
- continue;
- }
-
- stepSize = size;
- if (l) {
- probDimsToKargs(&(step->args), funcID, &offset, true);
- }
-
- if (size.y > size.x) {
- if (totalCUs != nrCU) {
- stepSize.y = (size_t)(size.y * (double)nrCU / totalCUs + 0.5);
- stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
- if (stepSize.y > size.y) {
- stepSize.y = size.y;
- totalCUs = nrCU;
- }
- else if (stepSize.y == 0) {
- step->cmdQueue = NULL;
- }
- }
- size.y -= stepSize.y;
- offset.y += stepSize.y;
- }
- else {
- if (totalCUs != nrCU) {
- stepSize.x = (size_t)(size.x * (double)nrCU / totalCUs + 0.5);
- stepSize.x = align(stepSize.x, DIVISION_ALIGNMENT);
- if (stepSize.x > size.x) {
- stepSize.x = size.x;
- totalCUs = nrCU;
- }
- else if (stepSize.x == 0) {
- step->cmdQueue = NULL;
- }
- }
- size.x -= stepSize.x;
- offset.x += stepSize.x;
- }
-
- probDimsToKargs(&(step->args), funcID, &stepSize, false);
-
- #ifdef DEBUG_2
- printf("RectDivision:\n");
- printf("\t offM=%d, offN=%d, M=%d, N=%d\n", step->args.offsetM, step->args.offsetN, step->args.M, step->args.N);
- #endif
- totalCUs -= nrCU;
- }
-
- free(sortedSteps);
-}
-
-/* Dividing triangular matrix (N x N) horizontally:
- *
- * +----+
- * |\ |
- * +-\--+
- * | \ |
- * | \|
- * +----+
- *
- * Take into consideration the areas of triangles/trapezoids rather than
- * areas of stripes.
- */
-static void
-triMatrixStripeDivision(
- BlasFunctionID funcID,
- const CLBlasKargs *args,
- ListHead *seq,
- cl_uint totalCUs)
-{
- SolutionStep *step;
- ListNode *i;
- cl_int err;
- cl_device_id device;
- cl_uint nrCU;
- SubproblemDim size, offset, stepSize, stepOffset;
- size_t top;
-
- kargsToProbDims(&offset, funcID, args, true);
- kargsToProbDims(&size, funcID, args, false);
- top = 0;
-
- if (args->uplo == clblasUpper) {
- offset.y += size.y;
- }
- stepSize = size;
-
- for (i = listNodeFirst(seq); i != seq; i = i->next) {
- step = container_of(i, node, SolutionStep);
- err = getQueueDevice(step->cmdQueue, &device);
- nrCU = deviceComputeUnits(device, &err);
-
- if (totalCUs == 0) {
- step->cmdQueue = NULL;
- continue;
- }
-
- if (args->uplo == clblasLower) {
- stepOffset = offset;
- }
-
- if (totalCUs != nrCU) {
- stepSize.y = (size_t)(
- sqrt(top * top + (double)nrCU / totalCUs * size.y * (top + size.x)) - top);
- stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
- if ((stepSize.y == 0) || (stepSize.y > size.y)) {
- stepSize.y = size.y;
- totalCUs = nrCU;
- }
- else if (stepSize.y == 0) {
- step->cmdQueue = NULL;
- }
- /* We have to add special check because the direction of
- * splitting is 'bottom -> top' for UPLO = clblasUpper.
- */
- else if (offset.y != align(offset.y, DIVISION_ALIGNMENT)) {
- size_t o = align(offset.y - stepSize.y, DIVISION_ALIGNMENT);
- if (o > offset.y) {
- o -= 2 * DIVISION_ALIGNMENT;
- }
- stepSize.y = offset.y - o;
- }
- }
- else {
- stepSize.y = size.y;
- }
-
- size.y -= stepSize.y;
- top += stepSize.y;
- if (args->uplo == clblasLower) {
- offset.y += stepSize.y;
- }
- else {
- offset.y -= stepSize.y;
- stepOffset = offset;
- }
-
- probDimsToKargs(&(step->args), funcID, &stepOffset, true);
- probDimsToKargs(&(step->args), funcID, &stepSize, false);
-
- totalCUs -= nrCU;
- }
-}
-
-static cl_bool
-findBestPattern(SolutionStep *step)
-{
- cl_uint maxImages;
-
- maxImages = getQueueMaxImages(step->cmdQueue);
-
- do {
- /* It may be non first attempt. Ensure that there are not
- * hold images for this step
- */
- releaseStepImgs(step);
-
- step->patternID = selectPattern( step, maxImages );
-
- assert(step->patternID != (unsigned int)-1);
-
- #ifdef DEBUG_2
- printf("select Pattern Done\n");
- #endif
-
- getStepGranulation(step);
- #ifdef DEBUG_2
- printf("getStepGranulation done \n");
- #endif
-
- assertGranulation(step->subdims, mempat->nrLevels,
- &step->pgran, mempat->thLevel);
- if (getStepResources(step))
- break;
- } while (maxImages-- != 0);
-
- return (maxImages != (cl_uint)-1) ? CL_TRUE : CL_FALSE;
-}
-
-void
-detectProblemTails(SolutionStep *step)
-{
- SubproblemDim globDim, offDim;
- SubproblemDim *subdim;
- KernelExtraFlags kflags = KEXTRA_NO_FLAGS;
-
- subdim = step->subdims;
-
- kargsToProbDims(&globDim, step->funcID, &step->args, false);
- kargsToProbDims(&offDim, step->funcID, &step->args, true);
-
- #ifdef DEBUG_2
- printf("detectProblemTails: subdimy=%d, subdimx=%d, subdimBwidth=%d\n", subdim->y, subdim->x, subdim->bwidth);
- #endif
- if (globDim.y % subdim->y) {
- kflags |= KEXTRA_TAILS_M;
- }
- if (globDim.x % subdim->x) {
- kflags |= KEXTRA_TAILS_N;
- }
- if (globDim.bwidth % subdim->bwidth) {
- kflags |= KEXTRA_TAILS_K;
- }
- if (clblasSolvers[step->funcID].memPatterns[step->patternID].nrLevels > 1) {
- if (globDim.y % subdim[1].y) {
- kflags |= KEXTRA_TAILS_M_LOWER;
- }
- if (globDim.x % subdim[1].x) {
- kflags |= KEXTRA_TAILS_N_LOWER;
- }
- if (globDim.bwidth % subdim[1].bwidth) {
- kflags |= KEXTRA_TAILS_K_LOWER;
- }
- }
- else {
- kflags |= (kflags & KEXTRA_TAILS_M) != 0 ? KEXTRA_TAILS_M_LOWER : 0;
- kflags |= (kflags & KEXTRA_TAILS_N) != 0 ? KEXTRA_TAILS_N_LOWER : 0;
- kflags |= (kflags & KEXTRA_TAILS_K) != 0 ? KEXTRA_TAILS_K_LOWER : 0;
- }
-
- // clean tails flags
- step->extraFlags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K
- | KEXTRA_TAILS_M_LOWER
- | KEXTRA_TAILS_N_LOWER
- | KEXTRA_TAILS_K_LOWER);
- // set tails flags
- step->extraFlags |= kflags;
-}
-
-void
-detectOffsets(SolutionStep *step)
-{
- const CLBlasKargs *args = &(step->args);
- KernelExtraFlags kflags = step->extraFlags;
-
- if (args->offsetM) {
- kflags |= KEXTRA_STARTM_NOT_ZERO;
- }
- if (args->offsetN) {
- kflags |= KEXTRA_STARTN_NOT_ZERO;
- }
- if (args->offA) {
- kflags |= KEXTRA_A_OFF_NOT_ZERO;
- }
- if (args->offBX) {
- kflags |= KEXTRA_BX_OFF_NOT_ZERO;
- }
- if (args->offCY) {
- kflags |= KEXTRA_CY_OFF_NOT_ZERO;
- }
-
- step->extraFlags = kflags;
-}
-
-//-----------------------------------------------------------------------------
-
-static unsigned int
-legacySelectPattern(
- BlasFunctionID funcID,
- unsigned int maxImages)
-{
- unsigned int id, i, n;
- MatrixRole mrole;
- MemoryPattern *pat;
- int score, maxScore = -1;
-
- id = -1;
- /*
- * Lookup all patterns, and assign a score per each matrix for
- * each pattern:
- * 0 - matrix is not cached
- * 2 - matrix is cached and stored in an image
- * 3 - matrix is cached and not stored in an image
- *
- * Find the pattern with the best score
- */
- pat = clblasSolvers[funcID].memPatterns;
-
- for (i = 0; i < clblasSolvers[funcID].nrPatterns; i++, pat++) {
- score = 0;
- n = 0;
-
- for (mrole = MATRIX_A; mrole <= MATRIX_B; mrole++) {
- if (isMatrixCached(pat, mrole)) {
- if (isMatrixInImage(pat, mrole)) {
- n++;
- score += 2;
- }
- else {
- score += 3;
- }
- }
- }
-
- if (n > maxImages) {
- continue;
- }
-
- if (score > maxScore) {
- maxScore = score;
- id = i;
- }
- }
-
- return id;
-}
-//-----------------------------------------------------------------------------
-
-unsigned int
-selectPattern( SolutionStep* pStep,
- unsigned int maxImages )
-{
- unsigned int i = 0;
- int selPatt = -1;
- int perf = -1;
- int maxPerf = -1;
- int funcID = pStep->funcID;
- unsigned int kflags = pStep->extraFlags;
-
- if (clblasSolvers[funcID].defaultPattern != -1) {
-// assert(clblasSolvers[funcID].defaultPattern < clblasSolvers[funcID].nrPatterns);
- return clblasSolvers[funcID].defaultPattern;
- }
-
- // select best-performing pattern for current case
- for( i = 0; i < clblasSolvers[funcID].nrPatterns; i++ ){
-
- if( NULL != clblasSolvers[funcID].memPatterns[i].sops->getPatternPerf ){
-
- perf = clblasSolvers[funcID].memPatterns[i].sops->getPatternPerf(
- kflags,
- (void*)&pStep->args);
-
- if( perf > maxPerf ){
- selPatt = i;
- maxPerf = perf;
- }
- }
- // if not all patterns provide performace estimation functions
- // use legacy pattern selection
- else{
- return legacySelectPattern( funcID, maxImages );
- }
- }
-
- return selPatt;
-}
-
-//-----------------------------------------------------------------------------
-
-/*
- * Check if tile sizes exceed the entire problem and adjust them
- * accordingly if yes
- */
-bool
-dimensionsExceedProblemSize(SolutionStep *step) {
- SubproblemDim probDim;
- SubproblemDim *dims = step->subdims;
- BlasFunctionID funcID = step->funcID;
- MemoryPattern *mempat =
- &clblasSolvers[funcID].memPatterns[step->patternID];
-
- /*
- * Looks like kernels of other functions handle the case themselves
- * and don't expect that everyone can adjust chosen decomposition
- */
- if (!( (funcID == CLBLAS_GEMV) ||
- (funcID == CLBLAS_SYMV) ||
- (funcID == CLBLAS_GEMM) ||
- (funcID == CLBLAS_TRMM) ||
- (funcID == CLBLAS_TRSM) ||
- (funcID == CLBLAS_SYRK) ||
- (funcID == CLBLAS_SYR2K)) ) {
-
- return false;
- }
-
-
- kargsToProbDims(&probDim, step->funcID, &step->args, false);
-
- if (mempat->nrLevels != 2) {
- return false;
- }
- dims = &dims[1];
-
- if (dims->x > probDim.x ||
- dims->y > probDim.y ||
- dims->bwidth > probDim.bwidth) {
- return true;
- }
-
- return false;
-}
-
-void
-getMinimalStepGranulation(SolutionStep *step)
-{
- SubproblemDim *decompDims = NULL;
- SubproblemDim probDims[2];
- size_t factor = 0;
-
- // EINVAL
- if( NULL == step ){
- return;
- }
-
- if (step->funcID == CLBLAS_GEMM2)
- {
- return;
- }
-
- kargsToProbDims( probDims, step->funcID, &step->args, false);
- decompDims = step->subdims;
-
- // All exceeding dimensions are set to 1
-
- if ( decompDims[1].itemX > probDims->x ) {
-
- factor = decompDims[1].itemX;
- decompDims[1].itemX = 1;
- decompDims[1].x /= factor;
- decompDims[0].itemX /= factor;
- decompDims[0].x /= factor;
- }
-
- if ( decompDims[1].itemY > probDims->y ) {
-
- factor = decompDims[1].itemY;
- decompDims[1].itemY = 1;
- decompDims[1].y /= factor;
- decompDims[0].itemY /= factor;
- decompDims[0].y /= factor;
- }
-
- if( decompDims[1].bwidth > probDims->bwidth ){
- decompDims[0].bwidth /= decompDims[1].bwidth;
- decompDims[1].bwidth = 1;
- }
-}
-
-void
-getStepGranulation(SolutionStep *step)
-{
- SubproblemDim *dims = step->subdims;
- cl_device_id devID;
- double time;
- int status = GF_ERROR;
- size_t MNK;
-
- #ifdef DEBUG_2
- printf("getStepGranulation called........\n");
- #endif
-
- MemoryPattern *mempat =
- &clblasSolvers[step->funcID].memPatterns[step->patternID];
-
- #ifdef DEBUG_2
- printf("Got mempat structure.........0x%p\n", mempat);
- #endif
-
-
- #ifdef DEBUG_2
- if ( mempat == NULL)
- {
- printf("mempat pointer is NULL...\n");
- } else {
- printf("mempat pointer is non-null..\n");
- if (mempat->sops == NULL)
- printf("sops is NULL\n");
- else
- if (mempat->sops->getFlags == NULL)
- printf("getFlags() is NULL\n");
- fflush(stdout);
- }
- #endif
-
- getQueueDevice(step->cmdQueue, &devID);
-
- #ifdef DEBUG_2
- printf("QueueDevice done...\n");
- #endif
-
-
- // try to load decomposition info from the storage
-
- /*
- * FIXME: It's a workaround so that to avoid getting some decomposition
- * sizes leading to strange hang ups
- */
- if (!avoidLoadFromStorage(step)) {
- #ifdef DEBUG_2
- printf("!avoidLoadFromStorage...Inside if\n");
- #endif
-
- MNK = (step->args.M + step->args.N + step->args.K)/3;
- if (mempat->sops->innerDecompositionAxis) {
- size_t ld;
- // bas - banks aligned size, in bytes, should be
- // number of channels * bytes per channel
- // here it is set to 8*256 = 2048 = 512 floats
- size_t bas = 8*256;
- if (mempat->sops->innerDecompositionAxis(&step->args) ==
- DECOMP_AXIS_X) {
- ld = step->args.ldb.matrix;
- }
- else {
- ld = step->args.lda.matrix;
- }
-
- if ((ld * dtypeSize(step->args.dtype)) % bas == 0) {
- //special bad case
- MNK = 0;
- }
- }
-
- if( step->funcID != CLBLAS_GEMM2 )
- {
- status = getGranularityInfo(&step->device, mempat->name,
- step->args.dtype, step->extraFlags,
- (int)MNK, dims, &step->pgran, &time);
- }
- /*
- * Disable blocking for implementations dealing with cache reads
- * from the global memory
- */
- //if (!(isLdsUsed(mempat) || (square && mempat->nrLevels == 2))) {
- // dims[0].bwidth = dims[1].bwidth;
- //}
- }
- #ifdef DEBUG_2
- printf("isLoadFromStorage done..\n");
- #endif
-
- //Query solver for default granulation
- if (status == GF_ERROR) {
- // temporary mock, untill all solvers will return required default problem granulation
- // TODO: deprecate the getDefaultStepGranulation(step) function
- if(NULL==mempat->sops->getDefaultDecomp)
- {
- getDefaultStepGranulation(step);
- }
- else
- {
- mempat->sops->getDefaultDecomp( &step->pgran,
- step->subdims,
- MAX_SUBDIMS,
- (void*)&step->args);
- }
- }
- if (dimensionsExceedProblemSize(step)) {
- getMinimalStepGranulation(step);
- }
-}
-
-void
-getDefaultStepGranulation(SolutionStep *step)
-{
- unsigned int nrFloats;
- MemoryPattern *mempat =
- &clblasSolvers[step->funcID].memPatterns[step->patternID];
- SubproblemDim *dims = step->subdims;
- cl_ulong ldsSize;
- size_t wgX, wgY;
- bool square;
- SDimComponent component = SDIM_BWIDTH;
- DataType dtype = step->args.dtype;
- size_t tsize = dtypeSize(dtype);
- unsigned int i;
- SolverFlags sflags;
- unsigned int bcoeff;
- bool bothCached, fixedBw = false;
- cl_device_id devID;
- PGranularity *pgran = &step->pgran;
- size_t maxWorkGroupSize;
- int vecLen;
- size_t subdimyFactor = 1;
- size_t subdimxFactor = 1;
-
- #ifdef DEBUG_2
- printf("getDefaultStepGranualtion called...\n");
- #endif
- nrFloats = (unsigned int)(dtypeSize(dtype) / sizeof(cl_float));
- square = ((mempat->sops->getFlags() & SF_TOP_INPUT_SQUARE_BLOCKS) != 0);
- bothCached = isMatrixCached(mempat, MATRIX_A) &&
- isMatrixCached(mempat, MATRIX_B);
- if (step->cmdQueue != NULL) {
- getQueueDevice(step->cmdQueue, &devID);
- }
- else {
- devID = step->device.id;
- }
- clGetDeviceInfo(devID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(ldsSize),
- &ldsSize, NULL);
- clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
- sizeof(size_t), &maxWorkGroupSize, NULL);
-
- /*
- * Setup dimensions allowing to use more or less effectively the local
- * memory or cache;
- */
-
- if (square) {
- dims[0].x = (dtype == TYPE_COMPLEX_DOUBLE) ? 16 : 32;
- /*
- * FIXME: for now, we restrict ourselves with square blocks due
- * to compilation issues
- */
- dims[0].y = dims[0].x; //(dtype == TYPE_FLOAT) ? 32 : 16
- dims[0].bwidth = dims[0].y;
- bcoeff = nrFloats;
- wgY = DEFAULT_BUFS_LSIZE_0;
- wgX = DEFAULT_BUFS_LSIZE_1;
- } else {
- bcoeff = (dtype == TYPE_COMPLEX_DOUBLE) ? 2 : 1;
-
- if (bothCached) {
- wgY = DEFAULT_CACHED_BUFS_LSIZE_0;
- wgX = DEFAULT_CACHED_BUFS_LSIZE_1;
- }
- else {
- wgY = DEFAULT_BUFS_LSIZE_0;
- wgX = DEFAULT_BUFS_LSIZE_1;
- }
-
- if (step->funcID == CLBLAS_GEMM2)
- {
- subdimyFactor = 2;
- subdimxFactor = 1;
- bcoeff = 4; // 16/bcoeff = 4 - Thats the panel width we want
- }
-
- if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV)) {
- if (maxWorkGroupSize >= 256)
- {
- wgX = 16;
- wgY = 16;
- } else if (maxWorkGroupSize >= 128)
- {
- wgX = 8;
- wgY = 16;
- } else {
- //
- // PENDING: What if maxWorkGroupSize < 64 ????
- //
- wgX = 8;
- wgY = 8;
- }
- }
-
-
- /*
- * Set block sizes such so the work group would access the whole
- * memory channel or not exceed cache associativity for the modern
- * AMD GPU families.
- *
- * FIXME: throw the hardcoded constants away
- */
- if (isMatrixInImage(mempat, MATRIX_A) ||
- isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_A)) {
-
- dims[0].y = (64 * subdimyFactor) / nrFloats;
- fixedBw = true;
- }
- else {
- dims[0].y = (32 * subdimyFactor);
- }
-
- if (isMatrixInImage(mempat, MATRIX_B) ||
- isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) {
-
- dims[0].x = (64 * subdimxFactor) / nrFloats;
- fixedBw = true;
- }
- else {
- dims[0].x = (32 * subdimxFactor);
- }
-
- if (step->funcID == CLBLAS_GEMM2) {
- int count=0;
-
- //
- // NOTE:
- // wgX and wgY setting for this function must be the same as
- // CLBLAS_GEMM_TAIL below.
- //
- //vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
- //
- // PENDING: 16x16 works best on CYPRESS and 16x8 for Cayman
- //
- wgY = 8*subdimyFactor;
- wgX = 8*subdimxFactor;
- while((wgY * wgX) > maxWorkGroupSize)
- {
- if (count & 1)
- {
- wgY /= 2;
- dims[0].y /= 2;
- } else {
- wgX /= 2;
- dims[0].x /= 2;
- }
- count++;
- }
- }
-
- if (step->funcID == CLBLAS_GEMM_TAIL) {
- //
- // NOTE: wgY and wgX must be same as what is set for CLBLAS_GEMM2 above
- //
- vecLen = 1;
-
- //
- // PENDING: What if maxWorkGroupSize < 64 ????
- //
- wgY = 8;
- wgX = 8;
- dims[0].y = wgY ;
- dims[0].x = wgX ;
- }
-
- if((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV))
- {
- wgY = 8;
- wgX = 8;
- dims[0].y = 64;
- dims[0].x = 64;
- }
-
- dims[0].bwidth = 16 / bcoeff;
- }
-
- /*
- * Prevent using more than 1/2 of LDS so as to have at least 2 work groups
- * per compute unit
- */
- if (ldsSize && mempat->sops->isFitToLDS) {
- ldsSize /= 2;
-
- while (!mempat->sops->isFitToLDS(dims, dtype, ldsSize, &step->args)) {
- /*
- * decrease current component and setup this one to decrease
- * on the next step; do not grow down block width below the
- * value with which the block line takes size of a float4 vector
- */
- if (square) {
- dims[0].x /= 2;
- dims[0].y /= 2;
- dims[0].bwidth /= 2;
- }
- else {
- switch (component) {
- case SDIM_X:
- dims[0].x /= 2;
- if (dims[0].bwidth * tsize == sizeof(cl_float4)) {
- component = SDIM_Y;
- }
- else {
- component = SDIM_BWIDTH;
- }
- break;
- case SDIM_Y:
- dims[0].y /= 2;
- component = SDIM_X;
- break;
- case SDIM_BWIDTH:
- dims[0].bwidth /= 2;
- component = SDIM_Y;
- break;
- }
- }
- }
-
- assert(dims[0].x > 0 && dims[0].y > 0 &&
- dims[0].bwidth * tsize >= sizeof(cl_float4));
- }
-
- /*
- * adjust local size if a subproblem is not divisible
- * between all local threads
- */
- for (; (wgY > 1) && (dims[0].y < wgY); wgY /= 2) { }
- for (; (wgX > 1) && (dims[0].x < wgX); wgX /= 2) { }
-
- sflags = mempat->sops->getFlags();
- if (sflags & SF_WSPACE_2D) {
- pgran->wgDim = 2;
- dims[0].itemY = dims[0].y;
- pgran->wgSize[0] = (unsigned int)wgY;
- pgran->wgSize[1] = (unsigned int)wgX;
- }
- else {
- pgran->wgDim = 1;
- pgran->wgSize[0] = (unsigned int)(wgX * wgY);
- pgran->wgSize[1] = 1;
- }
-
- /*
- * Divide the work between threads
- */
- dims[1].itemX = dims[0].x / wgX;
- dims[1].itemY = dims[0].y / wgY;
- dims[1].x = dims[1].itemX;
- dims[1].y = dims[1].itemY;
-
- if ((mempat->nrLevels == 1) && square) {
- dims[1].bwidth = dims[1].y;
- }
- else {
- i = fixedBw ? 4 : (8 / nrFloats);
- dims[1].bwidth = szmin(i, dims[0].bwidth);
- }
-
- dims[0].itemX = dims[0].x;
- dims[0].itemY = dims[0].y;
-
- /*
- * FIXME: Now, there are issues with generating kernels with non square
- * tiles in LDS less TRSM due to some fundamental restriction
- * of the core generator logic. Deprecate this kludge when
- * they will be eliminated
- */
-#if 1
- if ((step->funcID == CLBLAS_TRSM) && (step->patternID == 2)) {
- dims[1].bwidth = dims[1].y;
- }
-#endif
- if (funcHasTriangMatrix(step->funcID) && (pgran->wgDim == 1)) {
- dims[0].itemY = SUBDIM_UNUSED;
- if (mempat->nrLevels == 1) {
- dims[1].itemY = SUBDIM_UNUSED;
- }
- }
-
- if (!(isLdsUsed(mempat) || (square && mempat->nrLevels == 2))) {
- dims[0].bwidth = dims[1].bwidth;
- }
- /*
- * Ensure decomposition size for vectors in case
- * of level 2 routines equal to 1.
- */
- if (funcBlasLevel(step->funcID) == 2) {
- size_t xBlocks;
-
- xBlocks = dims[0].x / dims[1].x;
- dims[0].x = 1;
- dims[1].itemX = 1;
- dims[1].x = 1;
- dims[0].bwidth = dims[1].bwidth * xBlocks;
- }
-
- // fixup work group size in respect with desired work dispatch order
- if ((pgran->wgDim == 2) && mempat->sops->innerDecompositionAxis) {
- if (mempat->sops->innerDecompositionAxis(&step->args) ==
- DECOMP_AXIS_X) {
-
- unsigned int u;
-
- u = pgran->wgSize[0];
- pgran->wgSize[0] = pgran->wgSize[1];
- pgran->wgSize[1] = u;
- }
- }
- //printf("GDSG: suby = %lu, subx = %lu, bwidth0=%lu, bwidth1=%lu\n", dims[0].y, dims[0].x, dims[0].bwidth, dims[1].bwidth);
-}
-
-static bool
-avoidLoadFromStorage(SolutionStep *step)
-{
- bool notDiv;
- MemoryPattern *mempat =
- &clblasSolvers[step->funcID].memPatterns[step->patternID];
- bool bothCached = isMatrixCached(mempat, MATRIX_A) &&
- isMatrixCached(mempat, MATRIX_B);
-
- if (bothCached) {
- return false;
- }
-
- if ((step->funcID == CLBLAS_GEMM2) && ((step->args.pigFuncID == CLBLAS_SYMM) || (step->args.pigFuncID == CLBLAS_HEMM)) )
- {
- // FIXME: Assuming that returning "true" will load defaultDecomposition sizes
- // But the statement below on TRSM is a bit confusing.
- // Returning FALSE here will load from storage in getStepGranulation()
- return true;
- }
-
- /*
- * don't load from storage data for LDS gemm,
- * not integrally divisible
- */
- notDiv = (step->args.M % 64) || (step->args.N % 64) || (step->args.K % 64);
-
- return ((step->funcID == CLBLAS_GEMM) && notDiv);
-}
-
-static bool
-getStepResources(SolutionStep *step)
-{
- int i = 0;
- size_t tsize;
- unsigned int vecLen;
- size_t minWidth, minHeight, bestHeight, minSize, bestSize;
- MatrixRole mrole;
- cl_device_id devID;
- cl_context ctx;
- MemoryPattern *mempat;
- SubproblemDim probDim;
- CLBlasKargs *kargs = &step->args;
- bool ret = true;
-
- tsize = dtypeSize(kargs->dtype);
- vecLen = (unsigned int)(sizeof(cl_float4) / tsize);
- kargsToProbDims(&probDim, step->funcID, &step->args, false);
- getQueueContext(step->cmdQueue, &ctx);
- getQueueDevice(step->cmdQueue, &devID);
-
- mempat = &(clblasSolvers[step->funcID].memPatterns[step->patternID]);
-
- for (mrole = MATRIX_A, i = 0; mrole < MATRIX_C; mrole++) {
- if (isMatrixInImage(mempat, mrole)) {
- if (step->funcID == CLBLAS_TRSM) {
- //blocks
- unsigned int packRate;
- clblasOrder packOrder;
- size_t pitch;
- size_t matrWidth, matrHeight;
- CLBLASKernExtra extra;
-
- memset(&extra, 0, sizeof(extra));
- extra.dtype = kargs->dtype;
- extra.flags = step->extraFlags;
-
- mempat->sops->imgPackMode(&extra,
- step->subdims, mrole,
- &packRate, &packOrder);
-
- // minimal size parameters
- pitch = matrBlockPitch(step->subdims, mrole, kargs->dtype,
- kargs->side);
- matrWidth = matrBlockPitch(&probDim, mrole, kargs->dtype,
- kargs->side);
- matrHeight = matrBlockHeight(&probDim, mrole, kargs->side);
-
- //One panel should fit to image
- if (packOrder == clblasRowMajor) {
- minWidth = divRoundUp(matrWidth, pitch) * pitch / vecLen;
- minHeight = packRate;
-
- minSize = minWidth * minHeight;
- // size of image to store all blocks
- bestSize = minHeight * (minWidth + pitch / vecLen) *
- divRoundUp(matrHeight, packRate) / 2;
- }
- else {
- minWidth = pitch / vecLen;
- minHeight = divRoundUp(matrHeight, packRate) * packRate;
-
- minSize = minWidth * minHeight;
- bestSize = minWidth * (minHeight + packRate) *
- divRoundUp(matrWidth, pitch) / 2;
- }
- minSize = bestSize;
- }
- else {
- //panels
- getSuitableImageSizes(&minWidth, &minHeight, &bestHeight,
- mrole, kargs, vecLen, step->subdims);
- minSize = minWidth * minHeight;
- bestSize = minWidth * bestHeight;
- }
-
- kargs->scimage[i] = getSCImage(ctx, devID, bestSize,
- minSize, minWidth);
- if (kargs->scimage[i] == NULL) {
- ret = false;
- break;
- }
-
- i++;
- }
- }
-
- return ret;
-}
-
-static void
-getSuitableImageSizes(
- size_t *minWidth,
- size_t *minHeight,
- size_t *bestHeight,
- MatrixRole mrole,
- CLBlasKargs *kargs,
- unsigned int vecLen,
- SubproblemDim *subdims)
-{
- size_t alignedM, alignedN, alignedK;
- alignedM = divRoundUp(kargs->M, subdims->y);
- alignedM *= subdims->y;
- alignedN = divRoundUp(kargs->N, subdims->x);
- alignedN *= subdims->x;
- alignedK = divRoundUp(kargs->K, subdims->bwidth);
- alignedK *= subdims->bwidth;
- switch (mrole) {
- case MATRIX_A:
- *minWidth = alignedK / vecLen;
- *bestHeight = alignedM;
- *minHeight = subdims->y;
- break;
- case MATRIX_B:
- *minWidth = alignedK / vecLen;
- *bestHeight = alignedN;
- *minHeight = subdims->x;
- break;
- case MATRIX_C:
- *minWidth = alignedN / vecLen;
- *bestHeight = alignedM;
- *minHeight = subdims->y;
- break;
- default:
- break;
- }
-}
-
-/*
- * TRxM -> TRxM + GEMM + TRxM
- *
- * When talking about matrix A splitting the following numbering is used:
- *
- * +---+---+
- * | 1 | 2 |
- * +---+---+
- * | 3 | 4 |
- * +---+---+
- */
-static ListNode*
-decomposeTRXMStep(SolutionStep *step)
-{
- CLBlasKargs *kargs = &(step->args);
- SolutionStep *trxm1 = NULL, *gemm = NULL, *trxm2 = NULL, *tmp;
- clblasUplo position;
- SubproblemDim size, offset;
- int swap;
- cl_float f;
- cl_double d;
- clblasImplementation impl = clblasDefaultGemm;
- size_t offsetK = 0;
-
- // skip decomposition for a trmm case which works faster without it
- if (step->funcID == CLBLAS_TRMM && !isDoubleBasedType(step->args.dtype) &&
- isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) {
- return &(step->node);
- }
-
- /* Implementation specific checks */
-
- if ((getGemmPreferredPattern() != clblasDefaultGemm) &&
- (getGemmPreferredPattern() != clblasBlockGemmWithCaching)) {
-
- return &(step->node);
- }
- if (step->funcID == CLBLAS_TRMM) {
- impl = getTrmmPreferredPattern();
- if ((impl != clblasDefaultTrmm) &&
- (impl != clblasBlockTrmmWithCaching)) {
-
- return &(step->node);
- }
- }
- else {
- impl = getTrsmPreferredPattern();
- if ((impl != clblasDefaultTrsm) &&
- (impl != clblasBlockTrsmWithCaching) &&
- (impl != clblasBlockTrsmWithoutLds)) {
-
- return &(step->node);
- }
- }
-
- if ((kargs->side == clblasLeft) &&
- (kargs->M < DECOMPOSITION_THRESHOLD(step->args.dtype))) {
- return &(step->node);
- }
- if ((kargs->side == clblasRight) &&
- (kargs->N < DECOMPOSITION_THRESHOLD(step->args.dtype))) {
- return &(step->node);
- }
-
- trxm1 = calloc(1, sizeof(SolutionStep));
- gemm = calloc(1, sizeof(SolutionStep));
- trxm2 = calloc(1, sizeof(SolutionStep));
- if ((trxm1 == NULL) || (gemm == NULL) || (trxm2 == NULL)) {
- if (trxm1 != NULL) {
- free(trxm1);
- }
- if (gemm != NULL) {
- free(gemm);
- }
- if (trxm2 != NULL) {
- free(trxm2);
- }
- return &(step->node);
- }
- memcpy(trxm1, step, sizeof(SolutionStep));
- memcpy(gemm, step, sizeof(SolutionStep));
- memcpy(trxm2, step, sizeof(SolutionStep));
-
- gemm->funcID = CLBLAS_GEMM;
- gemm->args.C = kargs->B;
- gemm->args.ldc.matrix = kargs->ldb.matrix;
- gemm->args.offCY = kargs->offBX;
- switch (kargs->dtype) {
- case TYPE_FLOAT:
- if (step->funcID == CLBLAS_TRSM) {
- if (gemm->args.alpha.argFloat != 0.0f) {
- gemm->args.alpha.argFloat = -1 / gemm->args.alpha.argFloat;
- }
- }
- gemm->args.beta.argFloat = 1.0f;
- break;
- case TYPE_DOUBLE:
- if (step->funcID == CLBLAS_TRSM) {
- if (gemm->args.alpha.argDouble != 0.0f) {
- gemm->args.alpha.argDouble = -1 / gemm->args.alpha.argDouble;
- }
- }
- gemm->args.beta.argDouble = 1.0f;
- break;
- case TYPE_COMPLEX_FLOAT:
- if (step->funcID == CLBLAS_TRSM) {
- f = CREAL(gemm->args.alpha.argFloatComplex) *
- CREAL(gemm->args.alpha.argFloatComplex) +
- CIMAG(gemm->args.alpha.argFloatComplex) *
- CIMAG(gemm->args.alpha.argFloatComplex);
- if (f != 0.0f) {
- gemm->args.alpha.argFloatComplex = floatComplex(
- -CREAL(gemm->args.alpha.argFloatComplex) / f,
- CIMAG(gemm->args.alpha.argFloatComplex) / f);
- }
- }
- gemm->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f);
- break;
- case TYPE_COMPLEX_DOUBLE:
- if (step->funcID == CLBLAS_TRSM) {
- d = CREAL(gemm->args.alpha.argDoubleComplex) *
- CREAL(gemm->args.alpha.argDoubleComplex) +
- CIMAG(gemm->args.alpha.argDoubleComplex) *
- CIMAG(gemm->args.alpha.argDoubleComplex);
- if (d != 0.0f) {
- gemm->args.alpha.argDoubleComplex = doubleComplex(
- -CREAL(gemm->args.alpha.argDoubleComplex) / d,
- CIMAG(gemm->args.alpha.argDoubleComplex) / d);
- }
- }
- gemm->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f);
- break;
- }
-
- /* Actual position of matrix A's data to use */
- if (kargs->transA == clblasNoTrans) {
- position = kargs->uplo;
- }
- else {
- position = (kargs->uplo == clblasUpper) ? clblasLower :
- clblasUpper;
- }
-
- /* Map trxm1 to A1 */
- kargsToProbDims(&size, trxm1->funcID, &(trxm1->args), false);
- size.y = align(size.y / 2, DIVISION_ALIGNMENT);
- probDimsToKargs(&(trxm1->args), trxm1->funcID, &size, false);
-
- /* Map trxm2 to A4 */
- kargsToProbDims(&offset, trxm2->funcID, &(trxm2->args), true);
- kargsToProbDims(&size, trxm2->funcID, &(trxm2->args), false);
- offset.y += align(size.y / 2, DIVISION_ALIGNMENT);
- size.y -= align(size.y / 2, DIVISION_ALIGNMENT);
- probDimsToKargs(&(trxm2->args), trxm2->funcID, &offset, true);
- probDimsToKargs(&(trxm2->args), trxm2->funcID, &size, false);
-
-
- if (kargs->side == clblasLeft) {
- trxm1->args.K = trxm1->args.M;
- trxm2->args.K = trxm2->args.M;
-
- gemm->args.transB = clblasNoTrans;
-
- if (position == clblasUpper) {
- /* Map gemm to A2 */
- kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
- size.y = align(size.y / 2, DIVISION_ALIGNMENT);
- probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
- offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
- gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT);
- }
- else {
- /* Map gemm to A3 */
- kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true);
- kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
- offset.y += align(size.y / 2, DIVISION_ALIGNMENT);
- size.y -= align(size.y / 2, DIVISION_ALIGNMENT);
- probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true);
- probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
- gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
- }
- }
- else {
- trxm1->args.K = trxm1->args.N;
- trxm2->args.K = trxm2->args.N;
-
- gemm->args.transA = clblasNoTrans;
- gemm->args.A = kargs->B;
- gemm->args.lda.matrix = kargs->ldb.matrix;
- gemm->args.offA = kargs->offBX;
- gemm->args.transB = kargs->transA;
- gemm->args.B = kargs->A;
- gemm->args.ldb.matrix = kargs->lda.matrix;
- gemm->args.offBX = kargs->offA;
-
- if (position == clblasUpper) {
- /* Map gemm to A2 */
- kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true);
- kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
- offset.x += align(size.x / 2, DIVISION_ALIGNMENT);
- size.x -= align(size.x / 2, DIVISION_ALIGNMENT);
- probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true);
- probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
- gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
- }
- else {
- /* Map gemm to A3 */
- kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
- size.x = align(size.x / 2, DIVISION_ALIGNMENT);
- probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
- offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
- gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT);
- }
- }
-
- trxm1->extraFlags = clblasArgsToKextraFlags(&(trxm1->args), trxm1->funcID);
- gemm->extraFlags = clblasArgsToKextraFlags(&(gemm->args), gemm->funcID);
- trxm2->extraFlags = clblasArgsToKextraFlags(&(trxm2->args), trxm2->funcID);
-
- fixupGemmOffsets(&gemm->args, gemm->extraFlags, offsetK);
-
- /* Swap trxm1 and trxm2 if needed. */
-
- swap = 0;
- if (kargs->side == clblasLeft) {
- if ((step->funcID == CLBLAS_TRMM) && (position == clblasLower)) {
- swap = 1;
- }
- if ((step->funcID == CLBLAS_TRSM) && (position == clblasUpper)) {
- swap = 1;
- }
- }
- else {
- if ((step->funcID == CLBLAS_TRMM) && (position == clblasUpper)) {
- swap = 1;
- }
- if ((step->funcID == CLBLAS_TRSM) && (position == clblasLower)) {
- swap = 1;
- }
- }
- if (swap) {
- tmp = trxm1;
- trxm1 = trxm2;
- trxm2 = tmp;
- }
- /* Tie the sequence trmm1 - gemm - trmm2 together. */
-
- trxm1->event = decomposeEventsAlloc();
- trxm1->node.next = &(gemm->node);
-
- gemm->numEventsInWaitList = 1;
- gemm->eventWaitList = trxm1->event;
- gemm->event = decomposeEventsAlloc();
- gemm->node.prev = &(trxm1->node);
- gemm->node.next = &(trxm2->node);
-
- trxm2->numEventsInWaitList = 1;
- trxm2->eventWaitList = gemm->event;
- trxm2->node.prev = &(gemm->node);
-
- /* Insert new sequence instead of current step */
-
- trxm1->node.prev = step->node.prev;
- (trxm1->node.prev)->next = &(trxm1->node);
- step->node.prev = NULL;
-
- trxm2->node.next = step->node.next;
- (trxm2->node.next)->prev = &(trxm2->node);
- step->node.next = NULL;
-
- freeSolutionStep(&(step->node));
-
- return &(trxm2->node);
-}
-
-/*
- * Decompose a SYRK problem in order to evaluate the diagonal part
- * separately. It's useful since the compiler allocates huge number
- * of registers for a code processing the diagonal.
- */
-static ListNode*
-decomposeSYRKStep(SolutionStep *step)
-{
- CLBlasKargs *kargs = &step->args;
- SolutionStep *syrk2 = NULL;
- size_t thresh;
- ListNode *next;
-
- /*
- * Tail prediction. Believe that tile sizes will not exceed 8.
- * Disable decomposition if there are not subproblem tails at
- * the tile level because it can likely slowdown since diagonal
- * update is optimized. Actual tail detection is done after
- * the math decomposition. So the kludge is forced.
- */
- if ((kargs->M % 8 == 0) && (kargs->N % 8 == 0)) {
- return &(step->node);
- }
-
- thresh = DECOMPOSITION_THRESHOLD(step->args.dtype);
- if (kargs->M < thresh / 2) {
- return &(step->node);
- }
-
- syrk2 = malloc(sizeof(SolutionStep));
- if (syrk2 == NULL) {
- return &(step->node);
- }
-
- step->extraFlags |= KEXTRA_SYRK_SEPARATE_DIAGONAL;
- memcpy(syrk2, step, sizeof(SolutionStep));
- syrk2->extraFlags |= KEXTRA_SYRK_EVALUATE_DIAGONAL;
-
- next = step->node.next;
-
- /* Synchronize the steps */
-
- /*
- * This is to not disturb synchronization between the current and the next
- * step or to put the output user event to the tail of the chain if syrk2
- * is the last step
- */
- syrk2->event = step->event;
- step->event = decomposeEventsAlloc();
- syrk2->numEventsInWaitList = 1;
- syrk2->eventWaitList = step->event;
-
- /* Insert the additional step to the list */
- step->node.next = &syrk2->node;
- syrk2->node.prev = &step->node;
- syrk2->node.next = next;
- next->prev = &syrk2->node;
-
- return &(syrk2->node);
-}
-
-static ListNode*
-decomposeSYR2KStep(SolutionStep *step)
-{
- CLBlasKargs *kargs = &(step->args);
- SolutionStep *syrk1 = NULL, *syrk2 = NULL;
- size_t thresh;
- ListNode *node;
-
- /* SYR2K implementation is done as blocked with cache-usage optimization
- * only. Therefore, no implementation specific checks.
- */
-
- thresh = DECOMPOSITION_THRESHOLD(step->args.dtype);
- if (kargs->M < thresh / 2) {
- return &(step->node);
- }
-
- syrk1 = calloc(1, sizeof(SolutionStep));
- syrk2 = calloc(1, sizeof(SolutionStep));
- if ((syrk1 == NULL) || (syrk2 == NULL)) {
- if (syrk1 != NULL) {
- free(syrk1);
- }
- if (syrk2 != NULL) {
- free(syrk2);
- }
- return &(step->node);
- }
- memcpy(syrk1, step, sizeof(SolutionStep));
- memcpy(syrk2, step, sizeof(SolutionStep));
-
- syrk2->args.A = kargs->B;
- syrk2->args.lda.matrix = kargs->ldb.matrix;
- syrk2->args.offA = kargs->offBX;
- syrk2->args.B = kargs->A;
- syrk2->args.ldb.matrix = kargs->lda.matrix;
- syrk2->args.offBX = kargs->offA;
- switch (kargs->dtype) {
- case TYPE_FLOAT:
- syrk2->args.beta.argFloat = 1.0f;
- break;
- case TYPE_DOUBLE:
- syrk2->args.beta.argDouble = 1.0f;
- break;
- case TYPE_COMPLEX_FLOAT:
- syrk2->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f);
- break;
- case TYPE_COMPLEX_DOUBLE:
- syrk2->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f);
- break;
- }
-
- syrk1->extraFlags = clblasArgsToKextraFlags(&(syrk1->args), syrk1->funcID);
- syrk1->extraFlags &= ~KEXTRA_SYRK_2K_RANK;
- syrk2->extraFlags = clblasArgsToKextraFlags(&(syrk2->args), syrk2->funcID);
- syrk2->extraFlags &= ~KEXTRA_SYRK_2K_RANK;
-
- /* Tie the sequence syrk1 - syrk2 together. */
-
- syrk1->event = decomposeEventsAlloc();
- syrk1->node.next = &(syrk2->node);
-
- syrk2->numEventsInWaitList = 1;
- syrk2->eventWaitList = syrk1->event;
- syrk2->node.prev = &(syrk1->node);
-
- /* Insert new sequence instead of current step */
-
- syrk1->node.prev = step->node.prev;
- (syrk1->node.prev)->next = &(syrk1->node);
- step->node.prev = NULL;
-
- syrk2->node.next = step->node.next;
- (syrk2->node.next)->prev = &(syrk2->node);
- step->node.next = NULL;
-
- freeSolutionStep(&(step->node));
-
- /*
- * Now, decompose each of these steps to evaluate the diagonal
- * part in a dedicated kernel
- */
- decomposeSYRKStep(syrk1);
- node = decomposeSYRKStep(syrk2);
- return node;
-}