1 files changed, 0 insertions, 2367 deletions
diff --git a/external/clBLAS/src/library/blas/generic/solution_seq_make.c b/external/clBLAS/src/library/blas/generic/solution_seq_make.c
deleted file mode 100644
index 8a5e402d..00000000
--- a/external/clBLAS/src/library/blas/generic/solution_seq_make.c
+++ /dev/null
@@ -1,2367 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-#include <stdio.h>
-#include <assert.h>
-#include <sys/types.h>
-#include <math.h>
-#include <stdlib.h>
-
-#include <clblas_stddef.h>
-#include <clblas-internal.h>
-#include <toolslib.h>
-#include <events.h>
-
-#include "matrix_dims.h"
-#include "solution_assert.h"
-#include "solution_seq.h"
-
-#define DECOMPOSITION_THRESHOLD(type) (2560 * sizeof(cl_float) / dtypeSize(type))
-
-/* From solution_seq.c */
-bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole);
-void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step);
-
-#define isMatrixCached(pattern, mrole)                              \
-    checkMatrixMemLevelSet(pattern, mrole, (CLMEM_LEVEL_L2 | CLMEM_LEVEL_L1))
-
-#define isLdsUsed(pattern)                                          \
-    (checkMatrixMemLevelSet(pattern, MATRIX_A, CLMEM_LEVEL_LDS) ||  \
-     checkMatrixMemLevelSet(pattern, MATRIX_B, CLMEM_LEVEL_LDS))
-
-enum {
-    DEFAULT_BUFS_LSIZE_0 = 8,
-    DEFAULT_BUFS_LSIZE_1 = 8,
-    DEFAULT_CACHED_BUFS_LSIZE_0 = 8,
-    DEFAULT_CACHED_BUFS_LSIZE_1 = 8
-};
-
-static cl_uint getQueueMaxImages(cl_command_queue queue);
-
-static bool checkMatrixMemLevelSet(MemoryPattern *pattern, MatrixRole mrole,
-    meml_set_t mask);
-
-static void stripeDivision(BlasFunctionID funcID, const CLBlasKargs *args,
-    ListHead *seq, cl_uint totalCUs);
-static void rectDivision(BlasFunctionID funcID, const CLBlasKargs *args,
-    ListHead *seq, cl_uint totalCUs);
-static void triMatrixStripeDivision(BlasFunctionID funcID,
-    const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs);
-
-static cl_bool findBestPattern(SolutionStep *step);
-
-static void getDefaultStepGranulation(SolutionStep *step);
-static bool avoidLoadFromStorage(SolutionStep *step);
-
-static bool getStepResources(SolutionStep *step);
-static void getSuitableImageSizes(size_t *minWidth, size_t *minHeight,
-    size_t *bestHeight, MatrixRole mrole, CLBlasKargs *kargs, unsigned int vecLen,
-    SubproblemDim *subdims);
-
-static ListNode* decomposeTRXMStep(SolutionStep *step);
-static ListNode* decomposeSYRKStep(SolutionStep *step);
-static ListNode* decomposeSYR2KStep(SolutionStep *step);
-
-// Find vector length which lda and tile width is divisible on
-unsigned int
-appropriateVecLen(size_t ld, unsigned int tsize, size_t twidth, int funcLevel)
-{
-    unsigned int vlen = sizeof(cl_float4) / tsize;
-
-    if (funcLevel == 3) {
-        vlen *= 2;
-    }
-    while (vlen > twidth) {
-        vlen /= 2;
-    }
-
-    while ((ld % vlen) || (twidth % vlen)) {
-        vlen /= 2;
-    }
-
-    return vlen;
-}
-
-/*
- * Select an appropriate vectorization to perform computation with.
- * It's done based upon the problem sizes and device type. The device type
- * is taken into account as well since not all devices allow not aligned
- * access to vector data.
- */
-
-cl_int
-selectVectorization(
-    const SolutionStep *step,
-    CLBLASKernExtra *kextra)
-{
-    const TargetDevice *device = &step->device;
-    cl_device_type devType;
-    cl_int err;
-    size_t tw;
-    bool tra;
-    size_t checkedSizes[3];
-    int i, j;
-    const CLBlasKargs *kargs = &step->args;
-    KernelExtraFlags kflags = kextra->flags;
-    KernelExtraFlags vecFlags[3] = { KEXTRA_NO_COPY_VEC_A, KEXTRA_NO_COPY_VEC_B,
-                                     KEXTRA_NO_COPY_VEC_C };
-    unsigned int vlen;
-    unsigned int tsize;
-    MemoryPattern *mempat;
-    const SubproblemDim *dim = &step->subdims[1];
-    int funcLevel;
-
-    mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID];
-    err = clGetDeviceInfo(device->id, CL_DEVICE_TYPE, sizeof(devType),
-                          &devType, NULL);
-    if (err != CL_SUCCESS) {
-        return err;
-    }
-
-    if (isLdsUsed(mempat)) {
-        kextra->vecLenC = kextra->vecLen = sizeof(cl_float4) /
-                                                  dtypeSize(step->args.dtype);
-        kextra->vecLenA = kextra->vecLenB = kextra->vecLen;
-    }
-    else {
-        kextra->vecLenA = kextra->vecLenB = 0;
-    }
-
-    // select vectorization based upon leading dimensions and starting offsets
-    for (i = 0; i < 2; i++) {
-        if (!i) {
-           // check by leading dimensions
-           checkedSizes[0] = kargs->lda.matrix;
-           if (funcBlasLevel(step->funcID) == 2) {
-               checkedSizes[1] = checkedSizes[2] = 0;
-           }
-           else {
-               checkedSizes[1] = kargs->ldb.matrix;
-               checkedSizes[2] = kargs->ldc.matrix;
-           }
-        }
-        else {
-            // check by offsets
-            checkedSizes[0] = kargs->offA;
-            checkedSizes[1] = kargs->offBX;
-            checkedSizes[2] = kargs->offCY;
-        }
-
-        if (funcHasTriangMatrix(step->funcID)) {
-            checkedSizes[2] = checkedSizes[1];
-        }
-
-        vlen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
-
-        /*
-         * Disable vectorization at load from the global memory to LDS
-         * if matrix width is not aligned on the boundary of the float4
-         */
-        for (j = 0; j < 3; j++) {
-            if (checkedSizes[j] % vlen) {
-                kflags |= vecFlags[j];
-            }
-        }
-
-		if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV))
-		{
-		   if ( ( ((kflags & KEXTRA_UPPER_TRIANG)==0) && (kflags & KEXTRA_COLUMN_MAJOR) ) ||
-		        ( ((kflags & KEXTRA_UPPER_TRIANG)) && ((kflags & KEXTRA_COLUMN_MAJOR) == 0)) )
-
-			{
-				if( (kargs->N) % vlen)
-				{
-					kflags |= KEXTRA_NO_COPY_VEC_A;
-				}
-			}
-		}
-
-		if(mempat->sops->selectVectorization != NULL)
-		{
-			kflags |= mempat->sops->selectVectorization((void *)kargs, vlen);
-		}
-
-		if ((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV))
-		{
-			//
-			// TRTRI, GEMV Part - Only Scalar loads
-			// PENDING:
-			// Analyze Case by Case and selectively enable/disable
-			//
-			kflags |= KEXTRA_NO_COPY_VEC_A;
-			kflags |= KEXTRA_NO_COPY_VEC_B;
-		}
-
-		//
-		// Routines that Use LDS should be above this IF statement
-		//
-		if (isLdsUsed(mempat)) {
-            continue;
-        }
-
-		//
-		// Routines that dont use LDS have to be below the isLdsUsed() code
-		//
-		if (step->funcID == CLBLAS_GEMM2)
-		{
-			if ((step->subdims[0].y > step->args.M) || (step->subdims[0].x > step->args.N))
-			{
-				kextra->vecLen = 1;
-			} else {
-        	    kextra->vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
-			}
-        	kextra->vecLenA = kextra->vecLen;
-        	kextra->vecLenB = kextra->vecLen;
-        	kextra->vecLenC = kextra->vecLen;
-			continue;
-		}
-
-		if (step->funcID == CLBLAS_GEMM_TAIL)
-		{
-        	kextra->vecLen =  1;
-        	kextra->vecLenA = 1;
-        	kextra->vecLenB = 1;
-        	kextra->vecLenC = 1;
-			continue;
-		}
-    	funcLevel = funcBlasLevel(step->funcID);
-        funcLevel = funcBlasLevel(step->funcID);
-
-        /*
-         * If the step's pattern uses LDS, it is responsible for alignment.
-         * Otherwise it's needed to provide appropriate vector length
-         */
-        tsize = dtypeSize(step->args.dtype);
-        tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_A);
-        tw = (tra) ? dim->y : dim->bwidth;
-        vlen = appropriateVecLen(checkedSizes[0], tsize, tw, funcLevel);
-        kextra->vecLenA = (kextra->vecLenA) ? umin(kextra->vecLenA, vlen) :
-                                              vlen;
-
-        tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_B);
-        tw = ((funcLevel == 2) || !tra) ? dim->bwidth : dim->x;
-        vlen = appropriateVecLen(checkedSizes[1], tsize, tw, funcLevel);
-        kextra->vecLenB = (kextra->vecLenB) ? umin(kextra->vecLenB, vlen) :
-                                              vlen;
-
-        tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_C );
-        tw = ((funcLevel == 2) || tra) ? dim->y : dim->x;
-        vlen = appropriateVecLen( checkedSizes[2],
-            tsize,
-            tw,
-            funcLevel );
-        kextra->vecLenC = kextra->vecLenC ? umin(vlen,kextra->vecLenC) :
-                                            vlen;
-
-        kextra->vecLen = umin(kextra->vecLenA, kextra->vecLenB);
-        kextra->vecLen = umin(kextra->vecLenC, kextra->vecLen);
-    }
-
-    kextra->flags = kflags;
-
-    return CL_SUCCESS;
-}
-
-/*
- * Replace 'offsetM' and 'offsetN' field with respective extra offset at
- * 'offA', 'offBX', 'offCY' and taking into accoutn offset along K
- */
-void VISIBILITY_HIDDEN
-fixupGemmOffsets(CLBlasKargs *kargs, KernelExtraFlags kflags, size_t offsetK)
-{
-    if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A)) {
-        kargs->offA += offsetK * kargs->lda.matrix + kargs->offsetM;
-    }
-    else {
-        kargs->offA += kargs->offsetM * kargs->lda.matrix + offsetK;
-    }
-    if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) {
-        kargs->offBX += offsetK * kargs->ldb.matrix + kargs->offsetN;
-    }
-    else {
-        kargs->offBX += kargs->offsetN * kargs->ldb.matrix + offsetK;
-    }
-    if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_C)) {
-        kargs->offCY += kargs->offsetN * kargs->ldc.matrix + kargs->offsetM;
-    }
-    else {
-        kargs->offCY += kargs->offsetM * kargs->ldc.matrix + kargs->offsetN;
-    }
-    kargs->offsetM = kargs->offsetN = 0;
-}
-
-ListNode
-*decomposeProblemStep(SolutionStep *step)
-{
-    ListNode *node;
-
-    switch (step->funcID) {
-    case CLBLAS_TRMM:
-    case CLBLAS_TRSM:
-        node = decomposeTRXMStep(step);
-        break;
-    case CLBLAS_SYRK:
-        node = decomposeSYRKStep(step);
-        break;
-    case CLBLAS_SYR2K:
-        node = decomposeSYR2KStep(step);
-        break;
-    default:
-        node = &step->node;
-        break;
-    }
-
-    return node;
-}
-
-cl_int
-makeSolutionSeq(
-    BlasFunctionID funcID,
-    const CLBlasKargs *args,
-    cl_uint numCommandQueues,
-    cl_command_queue *commandQueues,
-    cl_uint numEventsInWaitList,
-    const cl_event *eventWaitList,
-    cl_event *events,
-    ListHead *seq)
-{
-    cl_int err;
-    cl_uint j, totalCUs, numDevicesWithoutDoubles;
-    bool hasDouble;
-    SolutionStep *step;
-    CLBLASKernExtra extra;
-    ListNode *i;
-    MemoryPattern *pattern;
-    solver_id_t sid;
-    KernelKey key;
-    bool need[MAX_CLBLAS_KERNELS_PER_STEP] = {true};
-    CLBlasKernelType ktype;
-    Kernel *kernel;
-    bool loadData = false;
-    unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP];
-    size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP];
-    char bopts[BUILD_OPTS_MAXLEN]; // Moving bopts up. See the comments before findKernel()
-    int ik;
-    // first subdimension index in the subproblem dims array
-    int firstDimIdx;
-
-    if ((numCommandQueues == 0) || (commandQueues == NULL)) {
-        return CL_INVALID_VALUE;
-    }
-
-    memset(buffer, 0, sizeof(buffer));
-    listInitHead(seq);
-
-    totalCUs = 0;
-    numDevicesWithoutDoubles = 0;
-    for (j = 0; j < numCommandQueues; j++) {
-        cl_device_id devID;
-
-        err = getQueueDevice(commandQueues[j], &devID);
-        if (err != CL_SUCCESS) {
-            continue;
-        }
-        if (isDoubleBasedType(args->dtype)) {
-            hasDouble = deviceHasNativeDouble(devID, &err);
-            if (err != CL_SUCCESS) {
-                continue;
-            }
-            if (!hasDouble) {
-                numDevicesWithoutDoubles++;
-                continue;
-            }
-        }
-
-        step = calloc(1, sizeof(SolutionStep));
-        if (step == NULL) {
-            freeSolutionSeq(seq);
-            return CL_OUT_OF_HOST_MEMORY;
-        }
-
-        step->funcID = funcID;
-        step->args = *args;
-        step->args.addrBits = deviceAddressBits(devID, &err);
-        step->cmdQueue = commandQueues[j];
-        step->numEventsInWaitList = numEventsInWaitList;
-        step->eventWaitList = eventWaitList;
-        step->event = NULL;
-        if (events != NULL) {
-            step->event = events + j;
-        }
-        step->pgran.wfSize = deviceWavefront(devID, &err);
-        step->extraFlags = clblasArgsToKextraFlags(args, step->funcID);
-        if (step->funcID == CLBLAS_SYR2K) {
-            step->extraFlags |= KEXTRA_SYRK_2K_RANK;
-        }
-
-        step->device.id = devID;
-        err = identifyDevice(&step->device);
-        if (err != CL_SUCCESS) {
-            freeSolutionSeq(seq);
-            return err;
-        }
-
-        totalCUs += deviceComputeUnits(devID, &err);
-        listAddToTail(seq, &step->node);
-    }
-    if (totalCUs == 0) {
-        return (numDevicesWithoutDoubles == numCommandQueues) ?
-                    CL_INVALID_DEVICE : CL_INVALID_COMMAND_QUEUE;
-    }
-
-    memset(&extra, 0, sizeof(extra));
-    memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char));
-    extra.dtype = args->dtype;
-
-    /* Split task between multiple command queues */
-
-    if (funcID == CLBLAS_GEMM) {
-        rectDivision(funcID, args, seq, totalCUs);
-    }
-    else if ((funcID == CLBLAS_SYRK) || (funcID == CLBLAS_SYR2K)) {
-        triMatrixStripeDivision(funcID, args, seq, totalCUs);
-    }
-    else {
-        stripeDivision(funcID, args, seq, totalCUs);
-    }
-
-    /* Some steps can be decomposed into several sequential substeps */
-
-    parseEnvImplementation();
-
-    // Function level decomposition
-    for (i = listNodeFirst(seq); i != seq; i = i->next) {
-        step = container_of(i, node, SolutionStep);
-        if (step->cmdQueue == NULL) {
-            continue;
-        }
-
-        if (step->funcID == CLBLAS_GEMM) {
-            fixupGemmOffsets(&step->args, step->extraFlags, 0);
-            continue;
-        }
-
-        i = decomposeProblemStep(step);
-    }
-
-	#ifdef DEBUG_2
-	printf("Finding a kernel for each step\n");
-	#endif
-
-    /* Find a kernel for each step */
-
-    for (i = listNodeFirst(seq); (i != seq) && (err == CL_SUCCESS);
-         i = i->next) {
-
-        DeviceIdent *ident;
-
-        step = container_of(i, node, SolutionStep);
-        if (step->cmdQueue == NULL) {
-            continue;
-        }
-
-        ident = &step->device.ident;
-
-        /*
-         * Set vendor dependent flags
-         *
-         * FIXME: thrown this kludge away when generator interface will
-         *        support passing ident info
-         */
-        if (ident->vendor == VENDOR_AMD) {
-            step->extraFlags |= (KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD);
-        }
-
-        if (!findBestPattern(step)) {
-            err = CL_OUT_OF_RESOURCES;
-            break;
-        }
-
-		#ifdef DEBUG_2
-		printf("Find best pattern finished\n");
-		#endif
-
-
-        pattern = &(clblasSolvers[step->funcID].memPatterns[step->patternID]);
-        firstDimIdx = 2 - pattern->nrLevels;
-        sid = makeSolverID(step->funcID, step->patternID);
-
-        err = getQueueDevice(step->cmdQueue, &key.device);
-        err = getQueueContext(step->cmdQueue, &key.context);
-
-        detectProblemTails(step);
-
-        extra.flags = step->extraFlags;
-        if (pattern->sops->fixupArgs) {
-            pattern->sops->fixupArgs(&step->args, &step->subdims[firstDimIdx],
-                                     &extra);
-        }
-        step->extraFlags = extra.flags;
-
-        key.nrDims = pattern->nrLevels;
-        memset(key.subdims, 0, sizeof(key.subdims));
-        memcpy(key.subdims, &step->subdims[firstDimIdx],
-               sizeof(SubproblemDim) * key.nrDims);
-
-        detectOffsets(step);
-
-        extra.flags = step->extraFlags;
-
-        need[CLBLAS_PREP_A_KERNEL] = isMatrixInImage(pattern, MATRIX_A);
-        need[CLBLAS_PREP_B_KERNEL] = isMatrixInImage(pattern, MATRIX_B);
-
-        /*
-         * Now, find and enqueue each kernel. Generate and build the kernel
-         * on the fly if this kernel is not presented neither in the cache
-         * no in the storage
-         */
-        for (ktype = CLBLAS_COMPUTING_KERNEL;
-             ktype < MAX_CLBLAS_KERNELS_PER_STEP; ktype++) {
-			 SubproblemDim prepDims[2];
-
-            if (!need[ktype]) {
-                continue;
-            }
-
-            extra.kernType = ktype;
-
-            err = selectVectorization(step, &extra);
-            if (err != CL_SUCCESS) {
-                break;
-            }
-
-            kernel = NULL;
-
-            //
-            // Now that the build options is a part of EXTRA structure,
-            // it is also a part of the kernelKey
-            // Setting of build options need to be done before
-            // findKernel()
-            //
-            memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char));
-            setupBuildOpts(bopts, key.device, pattern);
-            if (pattern->sops->setBuildOptions)
-            {
-                pattern->sops->setBuildOptions(bopts, (void*)(step));
-            }
-            memcpy(extra.buildOptions, bopts, BUILD_OPTS_MAXLEN);
-
-            if (areKernelsCacheable()) {
-                kernel = findKernel(clblasKernelCache, sid, &key, &extra);
-            }
-            if (kernel == NULL) {
-                if (!loadData && !avoidLoadFromStorage(step)) {
-                    size_t MNK = (step->args.M + step->args.N + step->args.K) / 3;
-                    loadData = !getKernelInfo(&step->device, pattern->name,
-                        extra.dtype, step->extraFlags, (int)MNK, &buffer[0],
-                        &sizeBuffer[0]);
-                }
-                if (buffer[ktype] != NULL){
-                    kernel = loadKernel((const unsigned char**)&buffer[ktype],
-                                        sizeBuffer[ktype], &key, &extra, &err);
-                }
-                else {
-                    SubproblemDim *dims;
-
-                    dims = (ktype == CLBLAS_COMPUTING_KERNEL) ? step->subdims :
-                                                                prepDims;
-
-					#ifdef DEBUG_2
-					printf("Build options used : %s\n", bopts);
-					#endif
-
-                    kernel = makeKernel(key.device, key.context,
-                                        pattern->sops->genKernel,
-                                        &dims[firstDimIdx], &step->pgran,
-                                        &extra, bopts, &err);
-                }
-
-                if (kernel == NULL) {
-                    break;
-                }
-
-                if (areKernelsCacheable()) {
-                    getKernel(kernel);
-                    if (addKernelToCache(clblasKernelCache, sid, kernel, &key,
-                                         clblasKernelExtraCmp)) {
-                        putKernel(clblasKernelCache, kernel);
-                    }
-                }
-            } else {
-				#ifdef DEBUG_CONTEXT
-				printf("KERNEL FOUND IN CACHE\n");
-				#endif
-			}
-            step->kernels[ktype] = kernel;
-        }
-    }
-
-    if (err != CL_SUCCESS) {
-        freeSolutionSeq(seq);
-    }
-
-    // free binary kernels
-    for (ik = 0; ik < MAX_CLBLAS_KERNELS_PER_STEP; ++ik) {
-        free(buffer[ik]);
-    }
-    return err;
-}
-
-static cl_uint
-getQueueMaxImages(cl_command_queue queue)
-{
-    cl_int err;
-    cl_device_id device;
-    cl_command_queue_properties props;
-    cl_bool imageSupport;
-
-    imageSupport = CL_FALSE;
-    err = getQueueDevice(queue, &device);
-    if (err != CL_SUCCESS) {
-        return 0;
-    }
-    err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(imageSupport),
-        &imageSupport, NULL);
-    if (!imageSupport) {
-        return 0;
-    }
-
-    props = 0;
-    err = getQueueProperties(queue, &props);
-    if (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
-        return 0;
-    }
-
-    return 2;
-}
-
-static bool
-isTransBUsed(BlasFunctionID funcID)
-{
-    if ((CLBLAS_GEMM == funcID) || (CLBLAS_GEMM2 == funcID) || (CLBLAS_GEMM_TAIL == funcID)) {
-        return true;
-    }
-    else {
-        return false;
-    }
-}
-
-KernelExtraFlags
-clblasArgsToKextraFlags(const CLBlasKargs *args, BlasFunctionID funcID)
-{
-    KernelExtraFlags flags = KEXTRA_NO_FLAGS;
-
-    if (args->transA != clblasNoTrans) {
-        flags |= KEXTRA_TRANS_A;
-    }
-
-    if (isTransBUsed(funcID) && args->transB != clblasNoTrans) {
-        flags |= KEXTRA_TRANS_B;
-    }
-
-    if (isComplexType(args->dtype)) {
-        if (args->transA == clblasConjTrans) {
-            flags |= KEXTRA_CONJUGATE_A;
-        }
-        if (isTransBUsed(funcID) && args->transB == clblasConjTrans) {
-            flags |= KEXTRA_CONJUGATE_B;
-        }
-    }
-
-    if (args->order == clblasColumnMajor) {
-        flags |= KEXTRA_COLUMN_MAJOR;
-    }
-    if ((funcID != CLBLAS_TRMM) && (funcID != CLBLAS_TRSM)) {
-        // check if beta is zero
-        ArgMultiplier z;
-
-        memset(&z, 0, sizeof(z));
-        if (!memcmp(&args->beta, &z, sizeof(z))) {
-            flags |= KEXTRA_BETA_ZERO;
-        }
-    }
-
-    if (funcID != CLBLAS_GEMM) {
-        if (args->uplo == clblasUpper) {
-            flags |= KEXTRA_UPPER_TRIANG;
-        }
-        if (args->side == clblasRight) {
-            flags |= KEXTRA_SIDE_RIGHT;
-        }
-        if (args->diag == clblasUnit) {
-            flags |= KEXTRA_UNIT_DIAGONAL;
-        }
-    }
-    if (funcID == CLBLAS_GEMV || funcID == CLBLAS_SYMV) {
-        if (args->ldb.vector == 1) {
-            flags |= KEXTRA_INCX_ONE;
-        }
-        if (args->ldc.vector == 1) {
-            flags |= KEXTRA_INCY_ONE;
-        }
-    }
-
-    return flags;
-}
-
-static bool
-checkMatrixMemLevelSet(
-    MemoryPattern *pattern,
-    MatrixRole mrole,
-    meml_set_t mask)
-{
-    const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra;
-    meml_set_t mset;
-
-    if (mrole == MATRIX_C || extra == NULL) {
-        return false;
-    }
-
-    switch (mrole) {
-    case MATRIX_A:
-        mset = extra->aMset;
-        break;
-    case MATRIX_B:
-        mset = extra->bMset;
-        break;
-    default:
-        break;
-    }
-
-    return ((mset & mask) != 0);
-}
-
-/* Next three functions: stripeDivision(), rectDivision() and
- * triMatrixStripeDivision(), split output matrix into set of non-intersected
- * rectangles. Area of each rectangle depends on the number of Compute Units,
- * available on a device of the given queue.
- * Division is also aligned on the DIVISION_ALIGNMENT boundary. It is measured
- * in number of elements.
- */
-
-/* This constant is used in:
- *     - stripeDivision()
- *     - rectDivision()
- *     - triMatrixStripeDivision()
- *     - decomposeTRXMStep()
- */
-static const size_t DIVISION_ALIGNMENT = 128;
-
-static size_t
-align(
-    size_t value,
-    size_t alignment)
-{
-    /* This implementation assumes that alignment is the power of 2. */
-    return (value + (alignment >> 1)) & (~(alignment - 1));
-}
-
-/* Stripe division is done according to the picture:
- *
- *      +------+--+----+--+
- *      |      |  |    |  |
- *      |      |  |    |  |
- *      |  1   | 2|  3 | 4|
- *      |      |  |    |  |
- *      |      |  |    |  |
- *      +------+--+----+--+
- */
-static void
-stripeDivision(
-    BlasFunctionID funcID,
-    const CLBlasKargs *args,
-    ListHead *seq,
-    cl_uint totalCUs)
-{
-    SolutionStep *step;
-    ListNode *i;
-    cl_int err;
-    cl_device_id device;
-    cl_uint nrCU;
-    SubproblemDim size, offset, stepSize;
-    bool first = true;
-
-    kargsToProbDims(&offset, funcID, args, true);
-    kargsToProbDims(&size, funcID, args, false);
-
-    for (i = listNodeFirst(seq); i != seq; i = i->next) {
-        step = container_of(i, node, SolutionStep);
-        err = getQueueDevice(step->cmdQueue, &device);
-        nrCU = deviceComputeUnits(device, &err);
-
-        if (totalCUs == 0) {
-            step->cmdQueue = NULL;
-            continue;
-        }
-
-        stepSize = size;
-        if (!first) {
-            probDimsToKargs(&(step->args), funcID, &offset, true);
-        }
-
-        if (funcID == CLBLAS_GEMV) {
-            if (totalCUs != nrCU) {
-                stepSize.y = (size_t)(size.y * (double)nrCU / totalCUs + 0.5);
-                stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
-                if (stepSize.y == 0) {
-                    step->cmdQueue = NULL;
-                }
-                else if (stepSize.y > size.y) {
-                    stepSize.y = size.y;
-                    totalCUs = nrCU;
-                }
-            }
-
-            offset.y += stepSize.y;
-            size.y -= stepSize.y;
-        }
-        else {
-            if (totalCUs != nrCU) {
-                stepSize.x = (size_t)(size.x * (double)nrCU / totalCUs + 0.5);
-                stepSize.x = align(stepSize.x, DIVISION_ALIGNMENT);
-                if (stepSize.x == 0) {
-                    step->cmdQueue = NULL;
-                }
-                else if (stepSize.x > size.x) {
-                    stepSize.x = size.x;
-                    totalCUs = nrCU;
-                }
-            }
-            offset.x += stepSize.x;
-            size.x -= stepSize.x;
-        }
-
-        totalCUs -= nrCU;
-        probDimsToKargs(&(step->args), funcID, &stepSize, false);
-        first = false;
-    }
-}
-
-/* Rectangular division is done according to the picture:
- *
- *      +------+-----+
- *      |      |  2  |
- *      |      |     |
- *      |  1   +--+--+
- *      |      |3 | 4|
- *      |      |  |  |
- *      +------+--+--+
- *
- * The longest side is divided first.
- */
-static void
-rectDivision(
-     BlasFunctionID funcID,
-     const CLBlasKargs *args,
-     ListHead *seq,
-     cl_uint totalCUs)
- {
-     SolutionStep *step, **sortedSteps;
-     ListNode *i, *j;
-     cl_int err;
-     cl_device_id device;
-     cl_uint nrCU, k, l;
-     SubproblemDim size, offset, stepSize;
-     unsigned int nrSteps = 0;
-
-     /* 1. Sort steps according to the number of CU they have */
-     /* NOTE: We expect small number of steps, so simple insertion sort
-      *       would be enough.
-      */
-
-     sortedSteps = calloc(listLength(seq), sizeof(*sortedSteps));
-     // assert(sortedSteps != NULL);
-
-     k = 0;
-     for (i = listNodeFirst(seq); i != seq; i = i->next, nrSteps++) {
-         step = container_of(i, node, SolutionStep);
-         err = getQueueDevice(step->cmdQueue, &device);
-
-         sortedSteps[k] = step;
-         nrCU = deviceComputeUnits(device, &err);
-
-         for (j = i->next; j != seq; j = j->next) {
-             step = container_of(i, node, SolutionStep);
-             err = getQueueDevice(step->cmdQueue, &device);
-
-             if (nrCU < deviceComputeUnits(device, &err)) {
-                 sortedSteps[k] = step;
-                 nrCU = deviceComputeUnits(device, &err);
-             }
-         }
-
-         k++;
-     }
-
-     /* 2. Calculate rectangle sizes */
-
-     kargsToProbDims(&offset, funcID, args, true);
-     kargsToProbDims(&size, funcID, args, false);
-     stepSize = size;
-
-     for (l = 0; l < k; l++) {
-         step = sortedSteps[l];
-         err = getQueueDevice(step->cmdQueue, &device);
-         nrCU = deviceComputeUnits(device, &err);
-
-         if (totalCUs == 0) {
-             step->cmdQueue = NULL;
-             continue;
-         }
-
-         stepSize = size;
-         if (l) {
-             probDimsToKargs(&(step->args), funcID, &offset, true);
-         }
-
-         if (size.y > size.x) {
-             if (totalCUs != nrCU) {
-                 stepSize.y = (size_t)(size.y * (double)nrCU / totalCUs + 0.5);
-                 stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
-                 if (stepSize.y > size.y) {
-                     stepSize.y = size.y;
-                     totalCUs = nrCU;
-                 }
-                 else if (stepSize.y == 0) {
-                     step->cmdQueue = NULL;
-                 }
-             }
-             size.y -= stepSize.y;
-             offset.y += stepSize.y;
-         }
-         else {
-             if (totalCUs != nrCU) {
-                 stepSize.x = (size_t)(size.x * (double)nrCU / totalCUs + 0.5);
-                 stepSize.x = align(stepSize.x, DIVISION_ALIGNMENT);
-                 if (stepSize.x > size.x) {
-                     stepSize.x = size.x;
-                     totalCUs = nrCU;
-                 }
-                 else if (stepSize.x == 0) {
-                     step->cmdQueue = NULL;
-                 }
-             }
-             size.x -= stepSize.x;
-             offset.x += stepSize.x;
-         }
-
-         probDimsToKargs(&(step->args), funcID, &stepSize, false);
-
-         #ifdef DEBUG_2
-         printf("RectDivision:\n");
-         printf("\t offM=%d, offN=%d, M=%d, N=%d\n", step->args.offsetM, step->args.offsetN, step->args.M, step->args.N);
-         #endif
-         totalCUs -= nrCU;
-     }
-
-     free(sortedSteps);
-}
-
-/* Dividing triangular matrix (N x N) horizontally:
- *
- *      +----+
- *      |\   |
- *      +-\--+
- *      |  \ |
- *      |   \|
- *      +----+
- *
- * Take into consideration the areas of triangles/trapezoids rather than
- * areas of stripes.
- */
-static void
-triMatrixStripeDivision(
-    BlasFunctionID funcID,
-    const CLBlasKargs *args,
-    ListHead *seq,
-    cl_uint totalCUs)
-{
-    SolutionStep *step;
-    ListNode *i;
-    cl_int err;
-    cl_device_id device;
-    cl_uint nrCU;
-    SubproblemDim size, offset, stepSize, stepOffset;
-    size_t top;
-
-    kargsToProbDims(&offset, funcID, args, true);
-    kargsToProbDims(&size, funcID, args, false);
-    top = 0;
-
-    if (args->uplo == clblasUpper) {
-        offset.y += size.y;
-    }
-    stepSize = size;
-
-    for (i = listNodeFirst(seq); i != seq; i = i->next) {
-        step = container_of(i, node, SolutionStep);
-        err = getQueueDevice(step->cmdQueue, &device);
-        nrCU = deviceComputeUnits(device, &err);
-
-        if (totalCUs == 0) {
-            step->cmdQueue = NULL;
-            continue;
-        }
-
-        if (args->uplo == clblasLower) {
-            stepOffset = offset;
-        }
-
-        if (totalCUs != nrCU) {
-            stepSize.y = (size_t)(
-                sqrt(top * top + (double)nrCU / totalCUs * size.y * (top + size.x)) - top);
-            stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT);
-            if ((stepSize.y == 0) || (stepSize.y > size.y)) {
-                stepSize.y = size.y;
-                totalCUs = nrCU;
-            }
-            else if (stepSize.y == 0) {
-                step->cmdQueue = NULL;
-            }
-            /* We have to add special check because the direction of
-             * splitting is 'bottom -> top' for UPLO = clblasUpper.
-             */
-            else if (offset.y != align(offset.y, DIVISION_ALIGNMENT)) {
-                size_t o = align(offset.y - stepSize.y, DIVISION_ALIGNMENT);
-                if (o > offset.y) {
-                    o -= 2 * DIVISION_ALIGNMENT;
-                }
-                stepSize.y = offset.y - o;
-            }
-        }
-        else {
-            stepSize.y = size.y;
-        }
-
-        size.y -= stepSize.y;
-        top += stepSize.y;
-        if (args->uplo == clblasLower) {
-            offset.y += stepSize.y;
-        }
-        else {
-            offset.y -= stepSize.y;
-            stepOffset = offset;
-        }
-
-        probDimsToKargs(&(step->args), funcID, &stepOffset, true);
-        probDimsToKargs(&(step->args), funcID, &stepSize, false);
-
-        totalCUs -= nrCU;
-    }
-}
-
-static cl_bool
-findBestPattern(SolutionStep *step)
-{
-    cl_uint maxImages;
-
-    maxImages = getQueueMaxImages(step->cmdQueue);
-
-    do {
-        /* It may be non first attempt. Ensure that there are not
-         * hold images for this step
-         */
-        releaseStepImgs(step);
-
-        step->patternID = selectPattern( step, maxImages );
-
-        assert(step->patternID != (unsigned int)-1);
-
-		#ifdef DEBUG_2
-		printf("select Pattern Done\n");
-		#endif
-
-        getStepGranulation(step);
-		#ifdef DEBUG_2
-		printf("getStepGranulation done \n");
-		#endif
-
-        assertGranulation(step->subdims, mempat->nrLevels,
-                          &step->pgran, mempat->thLevel);
-        if (getStepResources(step))
-            break;
-    } while (maxImages-- != 0);
-
-    return (maxImages != (cl_uint)-1) ? CL_TRUE : CL_FALSE;
-}
-
-void
-detectProblemTails(SolutionStep *step)
-{
-    SubproblemDim globDim, offDim;
-    SubproblemDim *subdim;
-    KernelExtraFlags kflags = KEXTRA_NO_FLAGS;
-
-    subdim = step->subdims;
-
-    kargsToProbDims(&globDim, step->funcID, &step->args, false);
-    kargsToProbDims(&offDim, step->funcID, &step->args, true);
-
-	#ifdef DEBUG_2
-	printf("detectProblemTails: subdimy=%d, subdimx=%d, subdimBwidth=%d\n", subdim->y, subdim->x, subdim->bwidth);
-	#endif
-    if (globDim.y % subdim->y) {
-        kflags |= KEXTRA_TAILS_M;
-    }
-    if (globDim.x % subdim->x) {
-        kflags |= KEXTRA_TAILS_N;
-    }
-    if (globDim.bwidth % subdim->bwidth) {
-        kflags |= KEXTRA_TAILS_K;
-    }
-    if (clblasSolvers[step->funcID].memPatterns[step->patternID].nrLevels > 1) {
-        if (globDim.y % subdim[1].y) {
-            kflags |= KEXTRA_TAILS_M_LOWER;
-        }
-        if (globDim.x % subdim[1].x) {
-            kflags |= KEXTRA_TAILS_N_LOWER;
-        }
-        if (globDim.bwidth % subdim[1].bwidth) {
-            kflags |= KEXTRA_TAILS_K_LOWER;
-        }
-    }
-    else {
-        kflags |= (kflags & KEXTRA_TAILS_M) != 0 ? KEXTRA_TAILS_M_LOWER : 0;
-        kflags |= (kflags & KEXTRA_TAILS_N) != 0 ? KEXTRA_TAILS_N_LOWER : 0;
-        kflags |= (kflags & KEXTRA_TAILS_K) != 0 ? KEXTRA_TAILS_K_LOWER : 0;
-    }
-
-    // clean tails flags
-    step->extraFlags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K
-                          | KEXTRA_TAILS_M_LOWER
-                          | KEXTRA_TAILS_N_LOWER
-                          | KEXTRA_TAILS_K_LOWER);
-    // set tails flags
-    step->extraFlags |= kflags;
-}
-
-void
-detectOffsets(SolutionStep *step)
-{
-    const CLBlasKargs *args = &(step->args);
-    KernelExtraFlags kflags = step->extraFlags;
-
-    if (args->offsetM) {
-        kflags |= KEXTRA_STARTM_NOT_ZERO;
-    }
-    if (args->offsetN) {
-        kflags |= KEXTRA_STARTN_NOT_ZERO;
-    }
-    if (args->offA) {
-        kflags |= KEXTRA_A_OFF_NOT_ZERO;
-    }
-    if (args->offBX) {
-        kflags |= KEXTRA_BX_OFF_NOT_ZERO;
-    }
-    if (args->offCY) {
-        kflags |= KEXTRA_CY_OFF_NOT_ZERO;
-    }
-
-    step->extraFlags = kflags;
-}
-
-//-----------------------------------------------------------------------------
-
-static unsigned int
-legacySelectPattern(
-    BlasFunctionID funcID,
-    unsigned int maxImages)
-{
-    unsigned int id, i, n;
-    MatrixRole mrole;
-    MemoryPattern *pat;
-    int score, maxScore = -1;
-
-    id = -1;
-    /*
-     * Lookup all patterns, and assign a score per each matrix for
-     * each pattern:
-     * 0 - matrix is not cached
-     * 2 - matrix is cached and stored in an image
-     * 3 - matrix is cached and not stored in an image
-     *
-     * Find the pattern with the best score
-     */
-    pat = clblasSolvers[funcID].memPatterns;
-
-    for (i = 0; i < clblasSolvers[funcID].nrPatterns; i++, pat++) {
-        score = 0;
-        n = 0;
-
-        for (mrole = MATRIX_A; mrole <= MATRIX_B; mrole++) {
-            if (isMatrixCached(pat, mrole)) {
-                if (isMatrixInImage(pat, mrole)) {
-                    n++;
-                    score += 2;
-                }
-                else {
-                    score += 3;
-                }
-            }
-        }
-
-        if (n > maxImages) {
-            continue;
-        }
-
-        if (score > maxScore) {
-            maxScore = score;
-            id = i;
-        }
-    }
-
-    return id;
-}
-//-----------------------------------------------------------------------------
-
-unsigned int
-selectPattern( SolutionStep* pStep,
-    unsigned int maxImages )
-{
-    unsigned int i = 0;
-    int selPatt = -1;
-    int perf = -1;
-    int maxPerf = -1;
-    int funcID = pStep->funcID;
-    unsigned int kflags = pStep->extraFlags;
-
-    if (clblasSolvers[funcID].defaultPattern != -1) {
-// assert(clblasSolvers[funcID].defaultPattern < clblasSolvers[funcID].nrPatterns);
-        return clblasSolvers[funcID].defaultPattern;
-    }
-
-	// select best-performing pattern for current case
-	for( i = 0; i < clblasSolvers[funcID].nrPatterns; i++ ){
-
-		if( NULL != clblasSolvers[funcID].memPatterns[i].sops->getPatternPerf ){
-
-	        perf = clblasSolvers[funcID].memPatterns[i].sops->getPatternPerf(
-                kflags,
-                (void*)&pStep->args);
-
-            if( perf > maxPerf ){
-                selPatt = i;
-                maxPerf = perf;
-            }
-	    }
-	    // if not all patterns provide performace estimation functions
-	    // use legacy pattern selection
-	    else{
-	        return legacySelectPattern( funcID, maxImages );
-	    }
-	}
-
-    return selPatt;
-}
-
-//-----------------------------------------------------------------------------
-
-/*
- * Check if tile sizes exceed the entire problem and adjust them
- * accordingly if yes
- */
-bool
-dimensionsExceedProblemSize(SolutionStep *step) {
-    SubproblemDim probDim;
-    SubproblemDim *dims = step->subdims;
-    BlasFunctionID funcID = step->funcID;
-    MemoryPattern *mempat =
-            &clblasSolvers[funcID].memPatterns[step->patternID];
-
-    /*
-     * Looks like kernels of other functions handle the case themselves
-     * and don't expect that everyone can adjust chosen decomposition
-     */
-    if (!( (funcID == CLBLAS_GEMV) ||
-           (funcID == CLBLAS_SYMV) ||
-           (funcID == CLBLAS_GEMM) ||
-           (funcID == CLBLAS_TRMM) ||
-           (funcID == CLBLAS_TRSM) ||
-           (funcID == CLBLAS_SYRK) ||
-           (funcID == CLBLAS_SYR2K)) ) {
-
-        return false;
-    }
-
-
-    kargsToProbDims(&probDim, step->funcID, &step->args, false);
-
-    if (mempat->nrLevels != 2) {
-        return false;
-    }
-    dims = &dims[1];
-
-    if (dims->x > probDim.x ||
-        dims->y > probDim.y ||
-        dims->bwidth > probDim.bwidth) {
-        return true;
-    }
-
-    return false;
-}
-
-void
-getMinimalStepGranulation(SolutionStep *step)
-{
-    SubproblemDim *decompDims = NULL;
-    SubproblemDim probDims[2];
-    size_t factor = 0;
-
-    // EINVAL
-    if( NULL == step ){
-        return;
-    }
-
-    if (step->funcID == CLBLAS_GEMM2)
-	{
-		return;
-	}
-
-    kargsToProbDims( probDims, step->funcID, &step->args, false);
-    decompDims = step->subdims;
-
-    // All exceeding dimensions are set to 1
-
-     if ( decompDims[1].itemX > probDims->x ) {
-
-         factor = decompDims[1].itemX;
-         decompDims[1].itemX = 1;
-         decompDims[1].x /= factor;
-         decompDims[0].itemX /= factor;
-         decompDims[0].x /= factor;
-     }
-
-     if ( decompDims[1].itemY > probDims->y ) {
-
-         factor = decompDims[1].itemY;
-         decompDims[1].itemY = 1;
-         decompDims[1].y /= factor;
-         decompDims[0].itemY /= factor;
-         decompDims[0].y /= factor;
-     }
-
-     if( decompDims[1].bwidth > probDims->bwidth ){
-         decompDims[0].bwidth /= decompDims[1].bwidth;
-         decompDims[1].bwidth = 1;
-     }
-}
-
-void
-getStepGranulation(SolutionStep *step)
-{
-	SubproblemDim *dims = step->subdims;
-    cl_device_id devID;
-    double time;
-    int status = GF_ERROR;
-    size_t MNK;
-
-	#ifdef DEBUG_2
-	printf("getStepGranulation called........\n");
-	#endif
-
-    MemoryPattern *mempat =
-            &clblasSolvers[step->funcID].memPatterns[step->patternID];
-
-	#ifdef DEBUG_2
-	printf("Got mempat structure.........0x%p\n", mempat);
-	#endif
-
-
-	#ifdef DEBUG_2
-	if ( mempat == NULL)
-	{
-		printf("mempat pointer is NULL...\n");
-	} else {
-		printf("mempat pointer is non-null..\n");
-		if (mempat->sops == NULL)
-			printf("sops is NULL\n");
-		else
-			if (mempat->sops->getFlags == NULL)
-				printf("getFlags() is NULL\n");
-		fflush(stdout);
-	}
-	#endif
-
-	getQueueDevice(step->cmdQueue, &devID);
-
-	#ifdef DEBUG_2
-	printf("QueueDevice done...\n");
-	#endif
-
-
-    // try to load decomposition info from the storage
-
-    /*
-     * FIXME: It's a workaround so that to avoid getting some decomposition
-     *         sizes leading to strange hang ups
-     */
-    if (!avoidLoadFromStorage(step)) {
-		#ifdef DEBUG_2
-		printf("!avoidLoadFromStorage...Inside if\n");
-		#endif
-
-        MNK = (step->args.M + step->args.N + step->args.K)/3;
-        if (mempat->sops->innerDecompositionAxis) {
-            size_t ld;
-            // bas - banks aligned size, in bytes, should be
-            // number of channels * bytes per channel
-            // here it is set to 8*256 = 2048 = 512 floats
-            size_t bas = 8*256;
-            if (mempat->sops->innerDecompositionAxis(&step->args) ==
-                    DECOMP_AXIS_X) {
-                ld = step->args.ldb.matrix;
-            }
-            else {
-                ld = step->args.lda.matrix;
-            }
-
-            if ((ld * dtypeSize(step->args.dtype)) % bas == 0) {
-                //special bad case
-                MNK = 0;
-            }
-        }
-
-		if( step->funcID != CLBLAS_GEMM2 )
-		{
-			status = getGranularityInfo(&step->device, mempat->name,
-										step->args.dtype, step->extraFlags,
-										(int)MNK, dims, &step->pgran, &time);
-		}
-        /*
-         * Disable blocking for implementations dealing with cache reads
-         * from the global memory
-         */
-        //if (!(isLdsUsed(mempat) || (square && mempat->nrLevels == 2))) {
-        //    dims[0].bwidth = dims[1].bwidth;
-        //}
-    }
-	#ifdef DEBUG_2
-	printf("isLoadFromStorage done..\n");
-	#endif
-
-	//Query solver for default granulation
-    if (status == GF_ERROR) {
-		// temporary mock, untill all solvers will return required default problem granulation
-		// TODO: deprecate the getDefaultStepGranulation(step) function
-		if(NULL==mempat->sops->getDefaultDecomp)
-		{
-			getDefaultStepGranulation(step);
-		}
-		else
-		{
-			mempat->sops->getDefaultDecomp( &step->pgran,
-			    step->subdims,
-			    MAX_SUBDIMS,
-			    (void*)&step->args);
-		}
-    }
-    if (dimensionsExceedProblemSize(step)) {
-        getMinimalStepGranulation(step);
-    }
-}
-
-void
-getDefaultStepGranulation(SolutionStep *step)
-{
-    unsigned int nrFloats;
-    MemoryPattern *mempat =
-            &clblasSolvers[step->funcID].memPatterns[step->patternID];
-    SubproblemDim *dims = step->subdims;
-    cl_ulong ldsSize;
-    size_t wgX, wgY;
-    bool square;
-    SDimComponent component = SDIM_BWIDTH;
-    DataType dtype = step->args.dtype;
-    size_t tsize = dtypeSize(dtype);
-    unsigned int i;
-    SolverFlags sflags;
-    unsigned int bcoeff;
-    bool bothCached, fixedBw = false;
-    cl_device_id devID;
-    PGranularity *pgran = &step->pgran;
-	size_t maxWorkGroupSize;
-	int vecLen;
-	size_t subdimyFactor = 1;
-	size_t subdimxFactor = 1;
-
-	#ifdef DEBUG_2
-	printf("getDefaultStepGranualtion called...\n");
-	#endif
-    nrFloats = (unsigned int)(dtypeSize(dtype) / sizeof(cl_float));
-    square = ((mempat->sops->getFlags() & SF_TOP_INPUT_SQUARE_BLOCKS) != 0);
-    bothCached = isMatrixCached(mempat, MATRIX_A) &&
-                 isMatrixCached(mempat, MATRIX_B);
-    if (step->cmdQueue != NULL) {
-        getQueueDevice(step->cmdQueue, &devID);
-    }
-    else {
-        devID = step->device.id;
-    }
-    clGetDeviceInfo(devID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(ldsSize),
-                    &ldsSize, NULL);
-	clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
-					sizeof(size_t), &maxWorkGroupSize, NULL);
-
-    /*
-     * Setup dimensions allowing to use more or less effectively the local
-     * memory or cache;
-     */
-
-    if (square) {
-        dims[0].x = (dtype == TYPE_COMPLEX_DOUBLE) ? 16 : 32;
-        /*
-         * FIXME: for now, we restrict ourselves with square blocks due
-         *        to compilation issues
-         */
-        dims[0].y = dims[0].x; //(dtype == TYPE_FLOAT) ? 32 : 16
-        dims[0].bwidth = dims[0].y;
-        bcoeff = nrFloats;
-        wgY = DEFAULT_BUFS_LSIZE_0;
-        wgX = DEFAULT_BUFS_LSIZE_1;
-	} else {
-        bcoeff = (dtype == TYPE_COMPLEX_DOUBLE) ? 2 : 1;
-
-        if (bothCached) {
-            wgY = DEFAULT_CACHED_BUFS_LSIZE_0;
-            wgX = DEFAULT_CACHED_BUFS_LSIZE_1;
-        }
-        else {
-            wgY = DEFAULT_BUFS_LSIZE_0;
-            wgX = DEFAULT_BUFS_LSIZE_1;
-        }
-
-		if (step->funcID == CLBLAS_GEMM2)
-		{
-			subdimyFactor = 2;
-			subdimxFactor = 1;
-            bcoeff = 4; // 16/bcoeff = 4 - Thats the panel width we want
-		}
-
-   		if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV))  {
-			if (maxWorkGroupSize >= 256)
-			{
-				wgX = 16;
-				wgY = 16;
-			} else if (maxWorkGroupSize >= 128)
-			{
-				wgX = 8;
-				wgY = 16;
-			} else {
-				//
-				// PENDING: What if maxWorkGroupSize < 64 ????
-				//
-				wgX = 8;
-				wgY = 8;
-			}
-		}
-
-
-        /*
-         * Set block sizes such so the work group would access the whole
-         * memory channel or not exceed cache associativity for the modern
-         * AMD GPU families.
-         *
-         * FIXME: throw the hardcoded constants away
-         */
-        if (isMatrixInImage(mempat, MATRIX_A) ||
-            isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_A)) {
-
-            dims[0].y = (64 * subdimyFactor) / nrFloats;
-            fixedBw = true;
-        }
-        else {
-            dims[0].y = (32 * subdimyFactor);
-        }
-
-        if (isMatrixInImage(mempat, MATRIX_B) ||
-            isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) {
-
-            dims[0].x = (64 * subdimxFactor) / nrFloats;
-            fixedBw = true;
-        }
-        else {
-            dims[0].x = (32 * subdimxFactor);
-        }
-
-   		if (step->funcID == CLBLAS_GEMM2)  {
-			int count=0;
-
-            //
-			// NOTE:
-			// wgX and wgY setting for this function must be the same as
-			// CLBLAS_GEMM_TAIL below.
-			//
-			//vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype);
-            //
-            // PENDING: 16x16 works best on CYPRESS and 16x8 for Cayman
-            //
-			wgY = 8*subdimyFactor;
-			wgX = 8*subdimxFactor;
-			while((wgY * wgX) > maxWorkGroupSize)
-			{
-				if (count & 1)
-				{
-					wgY /= 2;
-					dims[0].y /= 2;
-			    } else {
-					wgX /= 2;
-					dims[0].x /= 2;
-			    }
-				count++;
-		    }
-		}
-
-   		if (step->funcID == CLBLAS_GEMM_TAIL)  {
-			//
-			// NOTE: wgY and wgX must be same as what is set for CLBLAS_GEMM2 above
-			//
-			vecLen = 1;
-
-				//
-				// PENDING: What if maxWorkGroupSize < 64 ????
-				//
-				wgY = 8;
-				wgX = 8;
-				dims[0].y = wgY ;
-				dims[0].x = wgX ;
-			}
-
-        if((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV))
-        {
-            wgY = 8;
-            wgX = 8;
-            dims[0].y = 64;
-            dims[0].x = 64;
-        }
-
-        dims[0].bwidth = 16 / bcoeff;
-    }
-
-    /*
-     * Prevent using more than 1/2 of LDS so as to have at least 2 work groups
-     * per compute unit
-     */
-    if (ldsSize && mempat->sops->isFitToLDS) {
-        ldsSize /= 2;
-
-        while (!mempat->sops->isFitToLDS(dims, dtype, ldsSize, &step->args)) {
-            /*
-             * decrease current component and setup this one to decrease
-             * on the next step; do not grow down block width below the
-             * value with which the block line takes size of a float4 vector
-             */
-            if (square) {
-                dims[0].x /= 2;
-                dims[0].y /= 2;
-                dims[0].bwidth /= 2;
-            }
-            else {
-                switch (component) {
-                case SDIM_X:
-                    dims[0].x /= 2;
-                    if (dims[0].bwidth * tsize == sizeof(cl_float4)) {
-                        component = SDIM_Y;
-                    }
-                    else {
-                        component = SDIM_BWIDTH;
-                    }
-                    break;
-                case SDIM_Y:
-                    dims[0].y /= 2;
-                    component = SDIM_X;
-                    break;
-                case SDIM_BWIDTH:
-                    dims[0].bwidth /= 2;
-                    component = SDIM_Y;
-                    break;
-                }
-            }
-        }
-
-        assert(dims[0].x > 0 && dims[0].y > 0 &&
-               dims[0].bwidth * tsize >= sizeof(cl_float4));
-    }
-
-    /*
-     * adjust local size if a subproblem is not divisible
-     * between all local threads
-     */
-    for (; (wgY > 1) && (dims[0].y < wgY); wgY /= 2) { }
-    for (; (wgX > 1) && (dims[0].x < wgX); wgX /= 2) { }
-
-    sflags = mempat->sops->getFlags();
-    if (sflags & SF_WSPACE_2D) {
-        pgran->wgDim = 2;
-        dims[0].itemY = dims[0].y;
-        pgran->wgSize[0] = (unsigned int)wgY;
-        pgran->wgSize[1] = (unsigned int)wgX;
-    }
-    else {
-        pgran->wgDim = 1;
-        pgran->wgSize[0] = (unsigned int)(wgX * wgY);
-        pgran->wgSize[1] = 1;
-    }
-
-    /*
-     * Divide the work between threads
-     */
-    dims[1].itemX = dims[0].x / wgX;
-    dims[1].itemY = dims[0].y / wgY;
-    dims[1].x = dims[1].itemX;
-    dims[1].y = dims[1].itemY;
-
-    if ((mempat->nrLevels == 1) && square) {
-        dims[1].bwidth = dims[1].y;
-    }
-    else {
-        i = fixedBw ? 4 : (8 / nrFloats);
-        dims[1].bwidth = szmin(i, dims[0].bwidth);
-    }
-
-    dims[0].itemX = dims[0].x;
-    dims[0].itemY = dims[0].y;
-
-    /*
-     * FIXME: Now, there are issues with generating kernels with non square
-     *        tiles in LDS less TRSM due to some fundamental restriction
-     *        of the core generator logic. Deprecate this kludge when
-     *        they will be eliminated
-     */
-#if 1
-    if ((step->funcID == CLBLAS_TRSM) && (step->patternID == 2)) {
-        dims[1].bwidth = dims[1].y;
-    }
-#endif
-    if (funcHasTriangMatrix(step->funcID) && (pgran->wgDim == 1)) {
-        dims[0].itemY = SUBDIM_UNUSED;
-        if (mempat->nrLevels == 1) {
-            dims[1].itemY = SUBDIM_UNUSED;
-        }
-    }
-
-    if (!(isLdsUsed(mempat) || (square && mempat->nrLevels == 2))) {
-        dims[0].bwidth = dims[1].bwidth;
-    }
-    /*
-     * Ensure decomposition size for vectors in case
-     * of level 2 routines equal to 1.
-     */
-    if (funcBlasLevel(step->funcID) == 2) {
-        size_t xBlocks;
-
-        xBlocks = dims[0].x / dims[1].x;
-        dims[0].x = 1;
-        dims[1].itemX = 1;
-        dims[1].x = 1;
-        dims[0].bwidth = dims[1].bwidth * xBlocks;
-    }
-
-    // fixup work group size in respect with desired work dispatch order
-    if ((pgran->wgDim == 2) && mempat->sops->innerDecompositionAxis) {
-        if (mempat->sops->innerDecompositionAxis(&step->args) ==
-            DECOMP_AXIS_X) {
-
-            unsigned int u;
-
-            u = pgran->wgSize[0];
-            pgran->wgSize[0] = pgran->wgSize[1];
-            pgran->wgSize[1] = u;
-        }
-    }
-    //printf("GDSG: suby = %lu, subx = %lu, bwidth0=%lu, bwidth1=%lu\n", dims[0].y, dims[0].x, dims[0].bwidth, dims[1].bwidth);
-}
-
-static bool
-avoidLoadFromStorage(SolutionStep *step)
-{
-    bool notDiv;
-    MemoryPattern *mempat =
-            &clblasSolvers[step->funcID].memPatterns[step->patternID];
-    bool bothCached = isMatrixCached(mempat, MATRIX_A) &&
-                      isMatrixCached(mempat, MATRIX_B);
-
-    if (bothCached) {
-        return false;
-    }
-
-    if ((step->funcID == CLBLAS_GEMM2) && ((step->args.pigFuncID == CLBLAS_SYMM) || (step->args.pigFuncID == CLBLAS_HEMM)) )
-    {
-        // FIXME: Assuming that returning "true" will load defaultDecomposition sizes
-        //        But the statement below on TRSM is a bit confusing.
-        //        Returning FALSE  here will load from storage in getStepGranulation()
-        return true;
-    }
-
-    /*
-     * don't load from storage data for LDS gemm,
-     * not integrally divisible
-     */
-    notDiv = (step->args.M % 64) || (step->args.N % 64) || (step->args.K % 64);
-
-    return ((step->funcID == CLBLAS_GEMM) && notDiv);
-}
-
-static bool
-getStepResources(SolutionStep *step)
-{
-    int i = 0;
-    size_t tsize;
-    unsigned int vecLen;
-    size_t minWidth, minHeight, bestHeight, minSize, bestSize;
-    MatrixRole mrole;
-    cl_device_id devID;
-    cl_context ctx;
-    MemoryPattern *mempat;
-    SubproblemDim probDim;
-    CLBlasKargs *kargs = &step->args;
-    bool ret = true;
-
-    tsize = dtypeSize(kargs->dtype);
-    vecLen = (unsigned int)(sizeof(cl_float4) / tsize);
-    kargsToProbDims(&probDim, step->funcID, &step->args, false);
-    getQueueContext(step->cmdQueue, &ctx);
-    getQueueDevice(step->cmdQueue, &devID);
-
-    mempat = &(clblasSolvers[step->funcID].memPatterns[step->patternID]);
-
-    for (mrole = MATRIX_A, i = 0; mrole < MATRIX_C; mrole++) {
-        if (isMatrixInImage(mempat, mrole)) {
-            if (step->funcID == CLBLAS_TRSM) {
-                //blocks
-                unsigned int packRate;
-                clblasOrder packOrder;
-                size_t pitch;
-                size_t matrWidth, matrHeight;
-                CLBLASKernExtra extra;
-
-                memset(&extra, 0, sizeof(extra));
-                extra.dtype = kargs->dtype;
-                extra.flags = step->extraFlags;
-
-                mempat->sops->imgPackMode(&extra,
-                                          step->subdims, mrole,
-                                          &packRate, &packOrder);
-
-                // minimal size parameters
-                pitch = matrBlockPitch(step->subdims, mrole, kargs->dtype,
-                                        kargs->side);
-                matrWidth = matrBlockPitch(&probDim, mrole, kargs->dtype,
-                                           kargs->side);
-                matrHeight = matrBlockHeight(&probDim, mrole, kargs->side);
-
-                //One panel should fit to image
-                if (packOrder == clblasRowMajor) {
-                    minWidth = divRoundUp(matrWidth, pitch) * pitch / vecLen;
-                    minHeight = packRate;
-
-                    minSize = minWidth * minHeight;
-                    // size of image to store all blocks
-                    bestSize = minHeight * (minWidth + pitch / vecLen) *
-                               divRoundUp(matrHeight, packRate) / 2;
-                }
-                else {
-                    minWidth = pitch / vecLen;
-                    minHeight = divRoundUp(matrHeight, packRate) * packRate;
-
-                    minSize = minWidth * minHeight;
-                    bestSize = minWidth * (minHeight + packRate) *
-                               divRoundUp(matrWidth, pitch) / 2;
-                }
-                minSize = bestSize;
-            }
-            else {
-                //panels
-                getSuitableImageSizes(&minWidth, &minHeight, &bestHeight,
-                                      mrole, kargs, vecLen, step->subdims);
-                minSize = minWidth * minHeight;
-                bestSize = minWidth * bestHeight;
-            }
-
-            kargs->scimage[i] = getSCImage(ctx, devID, bestSize,
-                                           minSize, minWidth);
-            if (kargs->scimage[i] == NULL) {
-                ret = false;
-                break;
-            }
-
-            i++;
-        }
-    }
-
-    return ret;
-}
-
-static void
-getSuitableImageSizes(
-    size_t *minWidth,
-    size_t *minHeight,
-    size_t *bestHeight,
-    MatrixRole mrole,
-    CLBlasKargs *kargs,
-    unsigned int vecLen,
-    SubproblemDim *subdims)
-{
-    size_t alignedM, alignedN, alignedK;
-    alignedM = divRoundUp(kargs->M, subdims->y);
-    alignedM *= subdims->y;
-    alignedN = divRoundUp(kargs->N, subdims->x);
-    alignedN *= subdims->x;
-    alignedK = divRoundUp(kargs->K, subdims->bwidth);
-    alignedK *= subdims->bwidth;
-    switch (mrole) {
-        case MATRIX_A:
-            *minWidth = alignedK / vecLen;
-            *bestHeight = alignedM;
-            *minHeight = subdims->y;
-             break;
-        case MATRIX_B:
-            *minWidth = alignedK / vecLen;
-            *bestHeight = alignedN;
-            *minHeight = subdims->x;
-            break;
-        case MATRIX_C:
-            *minWidth = alignedN / vecLen;
-            *bestHeight = alignedM;
-            *minHeight = subdims->y;
-            break;
-        default:
-            break;
-    }
-}
-
-/*
- * TRxM -> TRxM + GEMM + TRxM
- *
- * When talking about matrix A splitting the following numbering is used:
- *
- *     +---+---+
- *     | 1 | 2 |
- *     +---+---+
- *     | 3 | 4 |
- *     +---+---+
- */
-static ListNode*
-decomposeTRXMStep(SolutionStep *step)
-{
-    CLBlasKargs *kargs = &(step->args);
-    SolutionStep *trxm1 = NULL, *gemm = NULL, *trxm2 = NULL, *tmp;
-    clblasUplo position;
-    SubproblemDim size, offset;
-    int swap;
-    cl_float f;
-    cl_double d;
-    clblasImplementation impl = clblasDefaultGemm;
-    size_t offsetK = 0;
-
-    // skip decomposition for a trmm case which works faster without it
-    if (step->funcID == CLBLAS_TRMM && !isDoubleBasedType(step->args.dtype) &&
-        isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) {
-        return &(step->node);
-    }
-
-    /* Implementation specific checks */
-
-    if ((getGemmPreferredPattern() != clblasDefaultGemm) &&
-        (getGemmPreferredPattern() != clblasBlockGemmWithCaching)) {
-
-        return &(step->node);
-    }
-    if (step->funcID == CLBLAS_TRMM) {
-        impl = getTrmmPreferredPattern();
-        if ((impl != clblasDefaultTrmm) &&
-            (impl != clblasBlockTrmmWithCaching)) {
-
-            return &(step->node);
-        }
-    }
-    else {
-        impl = getTrsmPreferredPattern();
-        if ((impl != clblasDefaultTrsm) &&
-            (impl != clblasBlockTrsmWithCaching) &&
-            (impl != clblasBlockTrsmWithoutLds)) {
-
-            return &(step->node);
-        }
-    }
-
-    if ((kargs->side == clblasLeft) &&
-        (kargs->M < DECOMPOSITION_THRESHOLD(step->args.dtype))) {
-        return &(step->node);
-    }
-    if ((kargs->side == clblasRight) &&
-        (kargs->N < DECOMPOSITION_THRESHOLD(step->args.dtype))) {
-        return &(step->node);
-    }
-
-    trxm1 = calloc(1, sizeof(SolutionStep));
-    gemm = calloc(1, sizeof(SolutionStep));
-    trxm2 = calloc(1, sizeof(SolutionStep));
-    if ((trxm1 == NULL) || (gemm == NULL) || (trxm2 == NULL)) {
-        if (trxm1 != NULL) {
-            free(trxm1);
-        }
-        if (gemm != NULL) {
-            free(gemm);
-        }
-        if (trxm2 != NULL) {
-            free(trxm2);
-        }
-        return &(step->node);
-    }
-    memcpy(trxm1, step, sizeof(SolutionStep));
-    memcpy(gemm, step, sizeof(SolutionStep));
-    memcpy(trxm2, step, sizeof(SolutionStep));
-
-    gemm->funcID = CLBLAS_GEMM;
-    gemm->args.C = kargs->B;
-    gemm->args.ldc.matrix = kargs->ldb.matrix;
-    gemm->args.offCY = kargs->offBX;
-    switch (kargs->dtype) {
-    case TYPE_FLOAT:
-        if (step->funcID == CLBLAS_TRSM) {
-            if (gemm->args.alpha.argFloat != 0.0f) {
-                gemm->args.alpha.argFloat = -1 / gemm->args.alpha.argFloat;
-            }
-        }
-        gemm->args.beta.argFloat = 1.0f;
-        break;
-    case TYPE_DOUBLE:
-        if (step->funcID == CLBLAS_TRSM) {
-            if (gemm->args.alpha.argDouble != 0.0f) {
-                gemm->args.alpha.argDouble = -1 / gemm->args.alpha.argDouble;
-            }
-        }
-        gemm->args.beta.argDouble = 1.0f;
-        break;
-    case TYPE_COMPLEX_FLOAT:
-        if (step->funcID == CLBLAS_TRSM) {
-            f = CREAL(gemm->args.alpha.argFloatComplex) *
-                CREAL(gemm->args.alpha.argFloatComplex) +
-                CIMAG(gemm->args.alpha.argFloatComplex) *
-                CIMAG(gemm->args.alpha.argFloatComplex);
-            if (f != 0.0f) {
-                gemm->args.alpha.argFloatComplex = floatComplex(
-                    -CREAL(gemm->args.alpha.argFloatComplex) / f,
-                     CIMAG(gemm->args.alpha.argFloatComplex) / f);
-            }
-        }
-        gemm->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f);
-        break;
-    case TYPE_COMPLEX_DOUBLE:
-        if (step->funcID == CLBLAS_TRSM) {
-            d = CREAL(gemm->args.alpha.argDoubleComplex) *
-                CREAL(gemm->args.alpha.argDoubleComplex) +
-                CIMAG(gemm->args.alpha.argDoubleComplex) *
-                CIMAG(gemm->args.alpha.argDoubleComplex);
-            if (d != 0.0f) {
-                gemm->args.alpha.argDoubleComplex = doubleComplex(
-                    -CREAL(gemm->args.alpha.argDoubleComplex) / d,
-                     CIMAG(gemm->args.alpha.argDoubleComplex) / d);
-            }
-        }
-        gemm->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f);
-        break;
-    }
-
-    /* Actual position of matrix A's data to use */
-    if (kargs->transA == clblasNoTrans) {
-        position = kargs->uplo;
-    }
-    else {
-        position = (kargs->uplo == clblasUpper) ? clblasLower :
-                        clblasUpper;
-    }
-
-    /* Map trxm1 to A1 */
-    kargsToProbDims(&size, trxm1->funcID, &(trxm1->args), false);
-    size.y = align(size.y / 2, DIVISION_ALIGNMENT);
-    probDimsToKargs(&(trxm1->args), trxm1->funcID, &size, false);
-
-    /* Map trxm2 to A4 */
-    kargsToProbDims(&offset, trxm2->funcID, &(trxm2->args), true);
-    kargsToProbDims(&size, trxm2->funcID, &(trxm2->args), false);
-    offset.y += align(size.y / 2, DIVISION_ALIGNMENT);
-    size.y -= align(size.y / 2, DIVISION_ALIGNMENT);
-    probDimsToKargs(&(trxm2->args), trxm2->funcID, &offset, true);
-    probDimsToKargs(&(trxm2->args), trxm2->funcID, &size, false);
-
-
-    if (kargs->side == clblasLeft) {
-        trxm1->args.K = trxm1->args.M;
-        trxm2->args.K = trxm2->args.M;
-
-        gemm->args.transB = clblasNoTrans;
-
-        if (position == clblasUpper) {
-            /* Map gemm to A2 */
-            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
-            size.y = align(size.y / 2, DIVISION_ALIGNMENT);
-            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
-            offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
-            gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT);
-        }
-        else {
-            /* Map gemm to A3 */
-            kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true);
-            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
-            offset.y += align(size.y / 2, DIVISION_ALIGNMENT);
-            size.y -= align(size.y / 2, DIVISION_ALIGNMENT);
-            probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true);
-            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
-            gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
-        }
-    }
-    else {
-        trxm1->args.K = trxm1->args.N;
-        trxm2->args.K = trxm2->args.N;
-
-        gemm->args.transA = clblasNoTrans;
-        gemm->args.A = kargs->B;
-        gemm->args.lda.matrix = kargs->ldb.matrix;
-        gemm->args.offA = kargs->offBX;
-        gemm->args.transB = kargs->transA;
-        gemm->args.B = kargs->A;
-        gemm->args.ldb.matrix = kargs->lda.matrix;
-        gemm->args.offBX = kargs->offA;
-
-        if (position == clblasUpper) {
-            /* Map gemm to A2 */
-            kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true);
-            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
-            offset.x += align(size.x / 2, DIVISION_ALIGNMENT);
-            size.x -= align(size.x / 2, DIVISION_ALIGNMENT);
-            probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true);
-            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
-            gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
-        }
-        else {
-            /* Map gemm to A3 */
-            kargsToProbDims(&size, gemm->funcID, &(gemm->args), false);
-            size.x = align(size.x / 2, DIVISION_ALIGNMENT);
-            probDimsToKargs(&(gemm->args), gemm->funcID, &size, false);
-            offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT);
-            gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT);
-        }
-    }
-
-    trxm1->extraFlags = clblasArgsToKextraFlags(&(trxm1->args), trxm1->funcID);
-    gemm->extraFlags = clblasArgsToKextraFlags(&(gemm->args), gemm->funcID);
-    trxm2->extraFlags = clblasArgsToKextraFlags(&(trxm2->args), trxm2->funcID);
-
-    fixupGemmOffsets(&gemm->args, gemm->extraFlags, offsetK);
-
-    /* Swap trxm1 and trxm2 if needed. */
-
-    swap = 0;
-    if (kargs->side == clblasLeft) {
-        if ((step->funcID == CLBLAS_TRMM) && (position == clblasLower)) {
-            swap = 1;
-        }
-        if ((step->funcID == CLBLAS_TRSM) && (position == clblasUpper)) {
-            swap = 1;
-        }
-    }
-    else {
-        if ((step->funcID == CLBLAS_TRMM) && (position == clblasUpper)) {
-            swap = 1;
-        }
-        if ((step->funcID == CLBLAS_TRSM) && (position == clblasLower)) {
-            swap = 1;
-        }
-    }
-    if (swap) {
-        tmp = trxm1;
-        trxm1 = trxm2;
-        trxm2 = tmp;
-    }
-    /* Tie the sequence trmm1 - gemm - trmm2 together. */
-
-    trxm1->event = decomposeEventsAlloc();
-    trxm1->node.next = &(gemm->node);
-
-    gemm->numEventsInWaitList = 1;
-    gemm->eventWaitList = trxm1->event;
-    gemm->event = decomposeEventsAlloc();
-    gemm->node.prev = &(trxm1->node);
-    gemm->node.next = &(trxm2->node);
-
-    trxm2->numEventsInWaitList = 1;
-    trxm2->eventWaitList = gemm->event;
-    trxm2->node.prev = &(gemm->node);
-
-    /* Insert new sequence instead of current step */
-
-    trxm1->node.prev = step->node.prev;
-    (trxm1->node.prev)->next = &(trxm1->node);
-    step->node.prev = NULL;
-
-    trxm2->node.next = step->node.next;
-    (trxm2->node.next)->prev = &(trxm2->node);
-    step->node.next = NULL;
-
-    freeSolutionStep(&(step->node));
-
-    return &(trxm2->node);
-}
-
-/*
- *  Decompose a SYRK problem in order to evaluate the diagonal part
- *  separately. It's useful since the compiler allocates huge number
- *  of registers for a code processing the diagonal.
- */
-static ListNode*
-decomposeSYRKStep(SolutionStep *step)
-{
-    CLBlasKargs *kargs = &step->args;
-    SolutionStep *syrk2 = NULL;
-    size_t thresh;
-    ListNode *next;
-
-    /*
-     * Tail prediction. Believe that tile sizes will not exceed 8.
-     * Disable decomposition if there are not subproblem tails at
-     * the tile level because it can likely slowdown since diagonal
-     * update is optimized. Actual tail detection is done after
-     * the math decomposition. So the kludge is forced.
-     */
-    if ((kargs->M % 8 == 0) && (kargs->N % 8 == 0)) {
-        return &(step->node);
-    }
-
-    thresh = DECOMPOSITION_THRESHOLD(step->args.dtype);
-    if (kargs->M < thresh / 2) {
-        return &(step->node);
-    }
-
-    syrk2 = malloc(sizeof(SolutionStep));
-    if (syrk2 == NULL) {
-        return &(step->node);
-    }
-
-    step->extraFlags |= KEXTRA_SYRK_SEPARATE_DIAGONAL;
-    memcpy(syrk2, step, sizeof(SolutionStep));
-    syrk2->extraFlags |= KEXTRA_SYRK_EVALUATE_DIAGONAL;
-
-    next = step->node.next;
-
-    /* Synchronize the steps */
-
-    /*
-     * This is to not disturb synchronization between the current and the next
-     * step or to put the output user event to the tail of the chain if syrk2
-     * is the last step
-     */
-    syrk2->event = step->event;
-    step->event = decomposeEventsAlloc();
-    syrk2->numEventsInWaitList = 1;
-    syrk2->eventWaitList = step->event;
-
-    /* Insert the additional step to the list */
-    step->node.next = &syrk2->node;
-    syrk2->node.prev = &step->node;
-    syrk2->node.next = next;
-    next->prev = &syrk2->node;
-
-    return &(syrk2->node);
-}
-
-static ListNode*
-decomposeSYR2KStep(SolutionStep *step)
-{
-    CLBlasKargs *kargs = &(step->args);
-    SolutionStep *syrk1 = NULL, *syrk2 = NULL;
-    size_t thresh;
-    ListNode *node;
-
-    /* SYR2K implementation is done as blocked with cache-usage optimization
-     * only. Therefore, no implementation specific checks.
-     */
-
-    thresh = DECOMPOSITION_THRESHOLD(step->args.dtype);
-    if (kargs->M < thresh / 2) {
-        return &(step->node);
-    }
-
-    syrk1 = calloc(1, sizeof(SolutionStep));
-    syrk2 = calloc(1, sizeof(SolutionStep));
-    if ((syrk1 == NULL) || (syrk2 == NULL)) {
-        if (syrk1 != NULL) {
-            free(syrk1);
-        }
-        if (syrk2 != NULL) {
-            free(syrk2);
-        }
-        return &(step->node);
-    }
-    memcpy(syrk1, step, sizeof(SolutionStep));
-    memcpy(syrk2, step, sizeof(SolutionStep));
-
-    syrk2->args.A = kargs->B;
-    syrk2->args.lda.matrix = kargs->ldb.matrix;
-    syrk2->args.offA = kargs->offBX;
-    syrk2->args.B = kargs->A;
-    syrk2->args.ldb.matrix = kargs->lda.matrix;
-    syrk2->args.offBX = kargs->offA;
-    switch (kargs->dtype) {
-    case TYPE_FLOAT:
-        syrk2->args.beta.argFloat = 1.0f;
-        break;
-    case TYPE_DOUBLE:
-        syrk2->args.beta.argDouble = 1.0f;
-        break;
-    case TYPE_COMPLEX_FLOAT:
-        syrk2->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f);
-        break;
-    case TYPE_COMPLEX_DOUBLE:
-        syrk2->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f);
-        break;
-    }
-
-    syrk1->extraFlags = clblasArgsToKextraFlags(&(syrk1->args), syrk1->funcID);
-    syrk1->extraFlags &= ~KEXTRA_SYRK_2K_RANK;
-    syrk2->extraFlags = clblasArgsToKextraFlags(&(syrk2->args), syrk2->funcID);
-    syrk2->extraFlags &= ~KEXTRA_SYRK_2K_RANK;
-
-    /* Tie the sequence syrk1 - syrk2 together. */
-
-    syrk1->event = decomposeEventsAlloc();
-    syrk1->node.next = &(syrk2->node);
-
-    syrk2->numEventsInWaitList = 1;
-    syrk2->eventWaitList = syrk1->event;
-    syrk2->node.prev = &(syrk1->node);
-
-    /* Insert new sequence instead of current step */
-
-    syrk1->node.prev = step->node.prev;
-    (syrk1->node.prev)->next = &(syrk1->node);
-    step->node.prev = NULL;
-
-    syrk2->node.next = step->node.next;
-    (syrk2->node.next)->prev = &(syrk2->node);
-    step->node.next = NULL;
-
-    freeSolutionStep(&(step->node));
-
-    /*
-     * Now, decompose each of these steps to evaluate the diagonal
-     * part in a dedicated kernel
-     */
-    decomposeSYRKStep(syrk1);
-    node = decomposeSYRKStep(syrk2);
-    return node;
-}