summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/common/gens/dblock_kgen.c
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/common/gens/dblock_kgen.c')
-rw-r--r--external/clBLAS/src/library/common/gens/dblock_kgen.c1497
1 files changed, 0 insertions, 1497 deletions
diff --git a/external/clBLAS/src/library/common/gens/dblock_kgen.c b/external/clBLAS/src/library/common/gens/dblock_kgen.c
deleted file mode 100644
index b30b3919..00000000
--- a/external/clBLAS/src/library/common/gens/dblock_kgen.c
+++ /dev/null
@@ -1,1497 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-#include <string.h>
-#include <stdio.h>
-
-#include <dis_warning.h>
-#include <dblock_kgen.h>
-
-/*
- * TODO:
- * 1) barriers in the case when work group size is greater than the
- * wavefront size
- * 2) 2D dimensional work group size
- * 3) Try version with array indexing
- * 4) Option to avoid unaligned access to vector data (?)
- */
-
-// work performed by work items
-typedef struct ItemWork {
- // number of rows to be processed by single work item
- size_t nrRows;
- // number of columns to be processed by single work item
- size_t nrCols;
- // number of items processing the same row
- unsigned int itemsPerRow;
- // total number of items performing the work
- unsigned int nrItems;
- // reduced number of rows at the block tail
- size_t blockTail;
- // work size to be done with the row tail non packed in float4
- size_t tail;
-} ItemWork;
-
-/*
- * Private data for loop unrolling
- *
- * NOTE: lmemLD is not used if both
- * 'locLDName' is initialized
- */
-typedef struct GenPriv {
- DBlockCopyDirection dir;
- bool transp;
- bool packed;
- bool conjugate;
- bool notVectorize;
- // local memory block leading dimension
- size_t lmemLD;
- // local memory leading dimension variable name
- const char *locLDName;
- // global memory leading dimension variable name
- const char *globLDName;
- DataType dtype;
- unsigned int nfloats;
- unsigned int typeSize;
- const SubproblemDim *dim;
- const ItemWork *work;
- const char *srcName;
- const char *dstName;
- // variables names used while copying to images
- const char *imgXName;
- const char *imgYName;
- size_t cnt;
- // The block size used for copying.
- // The default is 4.
- unsigned int vecLen;
-
-} GenPriv;
-
-
-/*
- * 'ld' in the list of arguments is matrix leading dimension
- *
- * Common name forming rule:
- * (type prefix)(generic part)['Transp']['Conj']['Nvec'](src mem][dst mem][block height][block width]
- */
-const char *copyMemDBlockDecl =
- "void\n"
- "%ccopyDBlock%s%s%s%c%c%lu%lu(\n"
- " %cPtr dst,\n"
- " %cPtr src,\n"
- " uint startRow,\n"
- " uint startCol,\n"
- " uint ld)\n";
-
-const char *copyMemGImgDBlockDecl =
- "void\n"
- "%ccopyDBlock%sGI%lux%lu(\n"
- " __write_only image2d_t dst,\n"
- " int startX,\n"
- " int startY,\n"
- " GPtr src,\n"
- " uint startRow,\n"
- " uint startCol,\n"
- " uint ld)\n";
-
-const char *copyMemLImgDBlockDecl =
- "void\n"
- "%ccopyDBlock%sLI%lux%lu(\n"
- " __write_only image2d_t dst,\n"
- " int startX,\n"
- " int startY,\n"
- " LPtr src)\n";
-
-/*
- * declaration for function performing slow data block copying
- */
-const char *copyMemDBlockSlowDecl =
- "void\n"
- "%ccopyDBlock%s%s%s%c%c(\n"
- " %cPtr dst,\n"
- " %cPtr src,\n"
- " uint startRow,\n"
- " uint startCol,\n"
- " uint nrRows,\n"
- " uint nrCols,\n"
- " uint dstLD,\n"
- " uint srcLD)\n";
-
-/*
- * declaration for function performing slow data to image block copying
- */
-const char *copyMemGImgDBlockSlowDecl =
- "void\n"
- "%ccopyDBlock%sGI(\n"
- " __write_only image2d_t dst,\n"
- " int startX,\n"
- " int startY,\n"
- " GPtr src,\n"
- " uint startRow,\n"
- " uint startCol,\n"
- " uint nrRows,\n"
- " uint nrCols,\n"
- " uint srcLD)\n";
-
-const char *copyMemLImgDBlockSlowDecl =
- "void\n"
- "%ccopyDBlock%sLI(\n"
- " __write_only image2d_t dst,\n"
- " int startX,\n"
- " int startY,\n"
- " LPtr src,\n"
- " uint nrRows,\n"
- " uint nrCols,\n"
- " uint srcLD)\n";
-
-/*
- * local variables for slow copying between the global and
- * the local memory
- */
-
-const char *copyMemSlowLvars =
- "uint i, j, n;\n"
- /*
- * end counters for copying with vector blocks and just vectors
- * depending in copying type and direction
- */
- "uint jb, jv;\n"
- // end counter for coying with single data with size lesser than float4
- "%s"
- // temporaty float4 variable for the transposing version
- "%s"
- "%cPtr dst1;\n"
- "%cPtr src1;\n\n";
-
-/*
- * One version use passing over the rows, and the second one use
- * passing over the columns. The Second variant is used for transposed
- * copying from the local to the global memory.
- */
-const char *copyMemDBlockSlowStart[2] = {
- "if (nrRows %% lsize) {\n"
- " n = nrRows / lsize + 1;\n"
- "}\n"
- "else {\n"
- " n = nrRows / lsize;\n"
- "}\n"
- "\n"
- "jb = nrCols / %u;\n"
- "jv = (nrCols - jb * %u) / %u;\n"
- // set counter end for copying with data which size is lesser than float4
- "%s"
- // set pointers to initial position
- "%s"
- "%s"
- "n = (n * lid >= nrRows) ? 0 : n;\n"
- "n = (n * lid + n > nrRows) ? (n - 1) : n;\n"
- "\n",
-
- "if (nrCols %% lsize) {\n"
- " n = nrCols / lsize + 1;\n"
- "}\n"
- "else {\n"
- " n = nrCols / lsize;\n"
- "}\n"
- "\n"
- // set counters for vector copying
- "jb = nrRows / %u;\n"
- "jv = (nrRows - jb * %u) / %u;\n"
- // set counter end for copying with data which size is lesser than float4
- "%s"
- // set pointers to initial position
- "%s"
- "%s"
- "n = (n * lid >= nrCols) ? 0 : n;\n"
- "n = (n * lid + n > nrCols) ? (n - 1) : n;\n"
- "\n"
-};
-
-/*
- * declaration for function zeroing float4 aligned
- * block of data
- */
-const char *f4zeroDecl =
- "void\n"
- "%cf4zero%lu(%s float4 *data)\n";
-
-const char *fzeroSlowDecl = "void\n"
- "%cf4zero(%s float4 *buf, size_t cnt)\n";
-
-const char *copyMemImgDBlockSlow =
- "for (i = 0; i < n; i++) {\n"
- " int x1 = x;\n"
- " int y1 = y;\n"
- " %cPtr src1 = src;\n"
- "\n"
- " for (j = 0; j < jb; j++) {\n"
- " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
- " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
- " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
- " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
- " }\n"
- " for (j = 0; j < jv; j++) {\n"
- " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n"
- " }\n"
- "\n"
- " y++;\n"
- " src.%s += srcLD;\n"
- "}\n";
-
-
-const char *copyMemImgDBlockPackedSlow =
- "for (i = 0; i < n; i++) {\n"
- " %cPtr src1 = src;\n"
- " x = startX + ((index + i) %% nLines) * nrCols / %lu;\n"
- " y = startY + (index + i) / nLines;\n"
- "\n"
- " for (j = 0; j < jb; j++) {\n"
- " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
- " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
- " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
- " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
- " }\n"
- " for (j = 0; j < jv; j++) {\n"
- " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n"
- " }\n"
- "\n"
- " src.%s += srcLD;\n"
- "}\n";
-
-const char *setLoopBoundStmt =
- "if (lid > %u) {\n"
- " nrows = 0;\n"
- "}\n"
- "else {\n"
- " nrows = (lid == %u) ? %u : %u;\n"
- "}\n";
-
-const char *privatePtrs =
- "%cPtr src1;\n"
- "%cPtr dst1;\n";
-
-// loop bound variable name
-const char *lboundVarName = "nrows";
-// local id variable
-const char *lidVarName = "lid";
-
-
-/*
- * Partial initialization of the generator private information
- */
-static void
-initGenPriv(
- GenPriv *priv,
- DataType dtype,
- unsigned int typeSize,
- const SubproblemDim *dim,
- DBlockCopyDirection dir,
- const ItemWork *work,
- const PGranularity *pgran)
-{
- unsigned int gsize;
-
- priv->dtype = dtype;
- priv->typeSize = typeSize;
- priv->nfloats = typeSize / sizeof(float);
- priv->dim = dim;
- priv->dir = dir;
- priv->work = work;
- priv->cnt = 0;
- priv->vecLen = FLOAT4_VECLEN;
- if (dir == DBLOCK_GLOBAL_TO_LOCAL || dir == DBLOCK_LOCAL_TO_GLOBAL) {
- gsize = pgran->wgSize[0] * pgran->wgSize[1];
- priv->vecLen = (unsigned int)(dim->x * dim->y * priv->nfloats / gsize);
-
- if (priv->vecLen < 1) {
- priv->vecLen = 1;
- } else if (priv->vecLen > 4) {
- priv->vecLen = FLOAT4_VECLEN;
- }
- }
-
-}
-
-/*
- * get info about work to be done by the work group
- *
- * Resulting work data chunk for each item is float4 aligned.
- * Remaining data chunk presented as tail for which code is
- * generated just after the loop part getting deal with float4
- * aligned chunks.
- */
-static void
-getItemWork(ItemWork *work, const SubproblemDim *dim,
- const PGranularity *pgran, size_t nfloats,
- unsigned int vecLen)
-{
- size_t n;
- size_t gsize;
-
- memset(work, 0, sizeof(ItemWork));
- gsize = pgran->wgSize[0] * pgran->wgSize[1];
-
- if (dim->y < gsize) {
- // one work item processes a part of a row (or none at all)
- work->itemsPerRow = (unsigned int)(gsize / dim->y);
- work->nrCols = dim->x / work->itemsPerRow;
- work->nrRows = 1;
- if (work->itemsPerRow * dim->y < gsize) {
- work->nrItems = (unsigned int)(work->itemsPerRow * dim->y);
- }
- }
- else {
- // one work item processes typically several rows (or none at all)
- work->itemsPerRow = 1;
- work->nrCols = dim->x;
- work->nrRows = dim->y / gsize;
- if (dim->y % gsize) {
- work->nrRows++;
- work->nrItems = (unsigned int)(dim->y / work->nrRows);
- // remaining number of rows
- n = dim->y - work->nrItems * work->nrRows;
- if (n) {
- work->blockTail = n;
- // total number of work items needed for the transfer
- work->nrItems++;
- }
- }
- }
- work->nrCols -= (work->nrCols * nfloats % vecLen) / nfloats;
- work->tail = dim->x - work->nrCols * work->itemsPerRow;
-}
-
-/*
- * Prepare generator outer loop
- */
-static void
-prepareLoop(struct KgenContext *ctx, ItemWork *work, LoopCtl *loopCtl)
-{
- char tmp[1024];
-
- kgenAddStmt(ctx, "size_t n;\n");
- loopCtl->ocName = "n";
-
- if (work->nrItems) {
- sprintf(tmp, "size_t %s;\n\n", lboundVarName);
- kgenAddStmt(ctx, tmp);
-
- /*
- * set number of rows to be processed by the work item;
- * in the case it is not a constant
- */
- if (work->blockTail) {
- sprintf(tmp, setLoopBoundStmt, work->nrItems - 1, work->nrItems - 1,
- work->blockTail, work->nrRows);
- kgenAddStmt(ctx, tmp);
- }
- else {
- sprintf(tmp, "nrows = (%s >= %u) ? 0 : %lu;\n", lidVarName,
- work->nrItems, work->nrRows);
- kgenAddStmt(ctx, tmp);
- }
-
- loopCtl->outBound.name = lboundVarName;
- }
- else {
- loopCtl->outBound.val = (unsigned long)work->nrRows;
- loopCtl->obConst = true;
- }
-}
-
-static int
-getVecLen(struct KgenContext *ctx, void *priv)
-{
- GenPriv *gpriv = (GenPriv*)priv;
- (void) ctx;
- return gpriv->vecLen;
-}
-
-/*
- * common function for loop tail generating
- */
-static void
-addTailCode(
- struct KgenContext *ctx,
- GenPriv *gpriv,
- LoopUnrollGen genSingleVec,
- LoopUnrollGen genSingle)
-{
- char tmp[1024];
- const ItemWork *work = gpriv->work;
- LoopCtl loopCtl;
- LoopUnrollers unrollers;
-
- memset(&loopCtl, 0, sizeof(loopCtl));
- memset(&unrollers, 0, sizeof(unrollers));
-
- loopCtl.inBound = (unsigned long)work->tail;
-
- if (work->itemsPerRow > 1) {
- if (work->nrItems) {
- sprintf(tmp, "if ((%s %% %u == %u) && (%s < %u))",
- lidVarName, work->itemsPerRow, work->itemsPerRow - 1,
- lidVarName, work->nrItems);
- }
- else {
- sprintf(tmp, "if (%s %% %u == %u)",
- lidVarName, work->itemsPerRow, work->itemsPerRow - 1);
- }
- kgenBeginBranch(ctx, tmp);
- }
-
- unrollers.genSingleVec = genSingleVec;
- unrollers.genSingle = genSingle;
- unrollers.getVecLen = getVecLen;
-
- kgenLoopUnroll(ctx, &loopCtl, gpriv->dtype, &unrollers, gpriv);
-
- if (work->itemsPerRow > 1) {
- kgenEndBranch(ctx, NULL);
- }
-}
-
-static int
-copyMemPreUnroll(struct KgenContext *ctx, void *priv)
-{
- DUMMY_ARG_USAGE(priv);
-
- kgenAddStmt(ctx, "src1 = src;\n");
-
- return kgenAddStmt(ctx, "dst1 = dst;\n\n");
-}
-
-static int
-copyImgPreUnroll(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- GenPriv *gpriv = (GenPriv*)priv;
- if (gpriv->packed) {
- sprintf(tmp, "%s = startX + (index * %lu) %% pLine / %u;\n"
- "%s = startY + (index * %lu) / pLine;\n" "%s = src;\n\n",
- gpriv->imgXName, gpriv->dim->x, FLOAT4_VECLEN / gpriv->nfloats,
- gpriv->imgYName, gpriv->dim->x, gpriv->srcName);
- }
- else {
- sprintf(tmp, "%s = x;\n" "%s = y;\n" "%s = src;\n\n", gpriv->imgXName,
- gpriv->imgYName, gpriv->srcName);
- }
- return kgenAddStmt(ctx, tmp);
-}
-
-static int
-copyImgVec(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- GenPriv *gpriv = (GenPriv*)priv;
-
- dtypeUPtrField(gpriv->dtype);
- sprintf(tmp, "write_imageui(%s, (int2)(%s++,%s), as_uint4(*%s.f4v++));\n",
- gpriv->dstName, gpriv->imgXName, gpriv->imgYName, gpriv->srcName);
-
- return kgenAddStmt(ctx, tmp);
-}
-
-static int
-copyImgSingle(struct KgenContext *ctx, void *priv)
-{
- GenPriv *gpriv = (GenPriv*)priv;
- if (gpriv->dtype == TYPE_COMPLEX_DOUBLE) {
- return copyImgVec(ctx, priv);
- }
- else {
- return -EINVAL;
- }
-}
-
-static int
-copyMemVec(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- char vec[64];
- GenPriv *gpriv = (GenPriv*)priv;
-
- if (gpriv->vecLen == 1)
- sprintf(vec,"f");
- else
- sprintf(vec,"f%dv", gpriv->vecLen);
-
- if (gpriv->conjugate) {
- sprintf(tmp, "tmp = *%s.%s++;\n", gpriv->srcName, vec);
- kgenAddStmt(ctx, tmp);
- if (gpriv->dtype == TYPE_COMPLEX_FLOAT) {
- kgenAddStmt(ctx, "tmp.y = -tmp.y;\n"
- "tmp.w = -tmp.w;\n");
- }
- else {
- kgenAddStmt(ctx, "tmp.y = -tmp.y;\n");
- }
- sprintf(tmp, "*%s.%s++ = tmp;\n",
- gpriv->dstName, vec);
- }
- else {
- sprintf(tmp, "*%s.%s++ = *%s.%s++;\n", gpriv->dstName, vec,
- gpriv->srcName, vec);
- }
-
- return kgenAddStmt(ctx, tmp);
-}
-
-static int
-copyMemSingle(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- GenPriv *gpriv = (GenPriv*)priv;
- const char *vfield;
-
- vfield = dtypeUPtrField(gpriv->dtype);
-
- if (gpriv->conjugate) {
- sprintf(tmp, "*%s.%s = *%s.%s++;\n",
- gpriv->dstName, vfield, gpriv->srcName, vfield);
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "(*%s.%s).y = -(*%s.%s).y;\n",
- gpriv->dstName, vfield, gpriv->dstName, vfield);
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "%s.%s++;\n", gpriv->dstName, vfield);
- }
- else {
- sprintf(tmp, "*%s.%s++ = *%s.%s++;\n",
- gpriv->dstName, vfield, gpriv->srcName, vfield);
- }
-
- return kgenAddStmt(ctx, tmp);
-}
-
-static int
-copyMemVecTransp(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- size_t i;
- GenPriv *gpriv = (GenPriv*)priv;
- unsigned int n = gpriv->nfloats;
- const char *tmpSuff[2][4] = {
- {"x", "y", "z", "w"},
- {"xy", "zw", NULL, NULL}};
- const char *dstSuff[4] = {"f", "f2v", NULL, "f4v"};
- const char *vfield;
- const char *s;
-
- vfield = dtypeUPtrField(gpriv->dtype);
- kgenAddBlankLine(ctx);
-
- if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) {
- sprintf(tmp, "tmp = *%s.f4v++;\n", gpriv->srcName);
- kgenAddStmt(ctx, tmp);
-
- if (gpriv->conjugate) {
- /*
- * Only complex float element can be conjugated here,
- * those of double complex type are processed with no vectrized
- * function
- */
- kgenAddStmt(ctx, "tmp.y = -tmp.y;\n"
- "tmp.w = -tmp.w;\n");
- }
-
- for (i = 0; i < FLOAT4_VECLEN / n; i++) {
- if (gpriv->locLDName) {
- sprintf(tmp, "%s.%s[%s * %lu] = tmp.%s;\n",
- gpriv->dstName, dstSuff[n - 1],
- gpriv->locLDName, i, tmpSuff[n - 1][i]);
- }
- else {
- sprintf(tmp, "%s.%s[%lu] = tmp.%s;\n", gpriv->dstName,
- dstSuff[n - 1], gpriv->lmemLD * i, tmpSuff[n - 1][i]);
- }
- kgenAddStmt(ctx, tmp);
- }
- s = gpriv->dstName;
- }
- else {
- for (i = 0; i < FLOAT4_VECLEN / n; i++) {
- if (gpriv->locLDName) {
- sprintf(tmp, "tmp.%s = %s.%s[%s * %lu];\n", tmpSuff[n - 1][i],
- gpriv->srcName, dstSuff[n - 1], gpriv->locLDName, i);
- }
- else {
- sprintf(tmp, "tmp.%s = %s.%s[%lu];\n", tmpSuff[n - 1][i],
- gpriv->srcName, dstSuff[n - 1], gpriv->lmemLD * i);
- }
- kgenAddStmt(ctx, tmp);
- }
-
- sprintf(tmp, "*%s.f4v++ = tmp;\n", gpriv->dstName);
- kgenAddStmt(ctx, tmp);
-
- s = gpriv->srcName;
- }
-
- if (gpriv->locLDName) {
- sprintf(tmp, "%s.%s += %s * %lu;\n", s, vfield, gpriv->locLDName, i);
- }
- else {
- sprintf(tmp, "%s.%s += %lu;\n", s, vfield, gpriv->lmemLD * i);
- }
-
- return kgenAddStmt(ctx, tmp);
-}
-
-static int
-copyMemSingleTransp(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- GenPriv *gpriv = (GenPriv*)priv;
- const char *vfield;
-
- vfield = dtypeUPtrField(gpriv->dtype);
- kgenAddBlankLine(ctx);
-
- if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) {
- if (gpriv->locLDName) {
- sprintf(tmp, "*%s.%s = *%s.%s++;\n",
- gpriv->dstName, vfield,
- gpriv->srcName, vfield);
- kgenAddStmt(ctx, tmp);
-
- if (gpriv->conjugate) {
- sprintf(tmp, "(*%s.%s).y = -(*%s.%s).y;\n",
- gpriv->dstName, vfield, gpriv->dstName,
- vfield);
- kgenAddStmt(ctx, tmp);
- }
- sprintf(tmp, "%s.%s += %s;\n",
- gpriv->dstName, vfield, gpriv->locLDName);
- }
- else {
- sprintf(tmp, "%s.%s[%lu] = *%s.%s++;\n",
- gpriv->dstName, vfield,
- gpriv->lmemLD * gpriv->cnt, gpriv->srcName,
- vfield);
- if (gpriv->conjugate) {
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "%s.%s[%lu].y = -%s.%s[%lu].y;\n",
- gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt,
- gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt);
- }
- }
- }
- else {
- if (gpriv->locLDName) {
- sprintf(tmp, "*%s.%s++ = *%s.%s;\n"
- "%s.%s += %s;\n",
- gpriv->dstName, vfield,
- gpriv->srcName, vfield,
- gpriv->srcName, vfield, gpriv->locLDName);
- }
- else {
- sprintf(tmp, "*%s.%s++ = %s.%s[%lu];\n",
- gpriv->dstName, vfield, gpriv->srcName, vfield,
- gpriv->lmemLD * gpriv->cnt);
- }
- }
- gpriv->cnt++;
-
- return kgenAddStmt(ctx, tmp);
-}
-
-/*
- * transfer row tail elements being not packing in float4 vector
- * and zeroing row tail
- */
-static void
-addCopyTailCode(struct KgenContext *ctx, GenPriv *gpriv)
-{
- LoopUnrollGen singleVec;
- LoopUnrollGen single;
- bool image;
-
- image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE ||
- gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);
-
- if (image) {
- singleVec = copyImgVec;
- single = copyImgSingle;
- }
- else {
- if (gpriv->transp) {
- singleVec = copyMemVecTransp;
- single = copyMemSingleTransp;
- }
- else {
- singleVec = copyMemVec;
- single = copyMemSingle;
- }
- }
-
- if (gpriv->notVectorize) {
- singleVec = NULL;
- }
- addTailCode(ctx, gpriv, singleVec, single);
-}
-
-static int
-copyMemPostUnroll(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- const char *s[2] = {"src", "dst"};
- GenPriv *gpriv = (GenPriv*)priv;
- int gdir;
- const char *vfield;
-
- gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
-
- if (gpriv->work && gpriv->work->tail) {
- addCopyTailCode(ctx, gpriv);
- }
-
- if (!gpriv->transp) {
- kgenAddBlankLine(ctx);
- }
-
- // modify pointers
- vfield = dtypeUPtrField(gpriv->dtype);
- sprintf(tmp, "%s.%s += %s;\n", s[gdir], vfield, gpriv->globLDName);
- kgenAddStmt(ctx, tmp);
-
- if (gpriv->transp) {
- sprintf(tmp, "%s.%s++;\n", s[1 - gdir], vfield);
- }
- else {
- if (gpriv->locLDName) {
- sprintf(tmp, "%s.%s += %s;\n", s[1 - gdir],
- vfield, gpriv->locLDName);
- }
- else {
- sprintf(tmp, "%s.%s += %lu;\n", s[1 - gdir],
- vfield, gpriv->lmemLD);
- }
- }
-
- return kgenAddStmt(ctx, tmp);
-}
-
-static int
-copyImgPostUnroll(struct KgenContext *ctx, void *priv)
-{
- char tmp[1024];
- GenPriv *gpriv = (GenPriv*)priv;
- const char *vfield = dtypeUPtrField(gpriv->dtype);
-
- if (gpriv->work && gpriv->work->tail) {
- addCopyTailCode(ctx, gpriv);
- }
-
- kgenAddBlankLine(ctx);
-
- if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
- sprintf(tmp, "src.%s += %s;\n", vfield, gpriv->globLDName);
- }
- else if (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
- sprintf(tmp, "src.%s += %lu;\n", vfield, gpriv->lmemLD);
- }
- kgenAddStmt(ctx, tmp);
- if(gpriv->packed) {
- sprintf(tmp, "index++;\n");
- } else {
- sprintf(tmp, "y++;\n");
- }
- return kgenAddStmt(ctx, tmp);
-}
-
-// unrolling generator for the f4zero function
-static int
-f4zeroSingle(struct KgenContext *ctx, void *priv)
-{
- DUMMY_ARG_USAGE(priv);
-
- return kgenAddStmt(ctx, "*data++ = 0;\n");
-}
-
-/*
- * Add statement setting initial local pointer for the work item
- *
- * @ld: lead dimension for the local block in float words;
- * if it's zero, the "ld" argument of a generated function is
- * used instead
- */
-static void
-addSettingPtrCode(
- struct KgenContext *ctx,
- const char *ptrName,
- size_t ld,
- bool transpose,
- const PGranularity *pgran,
- GenPriv *gpriv)
-{
- char tmp[4096];
- const char *vfield;
- const SubproblemDim *dim = gpriv->dim;
- const ItemWork *work = gpriv->work;
- size_t gsize;
-
- vfield = dtypeUPtrField(gpriv->dtype);
- gsize = pgran->wgSize[0] * pgran->wgSize[1];
-
- if (ld) {
- // offset between two rows and two elements in each row
- size_t roff, eoff;
-
- if (transpose) {
- roff = 1;
- eoff = ld;
- }
- else {
- roff = ld;
- eoff = 1;
- }
-
- if (dim->y < gsize) {
- sprintf(tmp, "%s.%s += (%s / %u) * %lu + (%s %% %u * %lu) * %lu;\n",
- ptrName, vfield, lidVarName, work->itemsPerRow,
- roff, lidVarName, work->itemsPerRow, work->nrCols, eoff);
- }
- else {
- sprintf(tmp, "%s.%s += %s * %lu * %lu;\n",
- ptrName, vfield, lidVarName, work->nrRows, roff);
- }
- }
- else {
- if (dim->y < gsize) {
- sprintf(tmp, "%s.%s += (startRow + %s / %u) * %s + "
- "startCol + %s %% %u * %lu;\n",
- ptrName, vfield, lidVarName, work->itemsPerRow,
- gpriv->globLDName, lidVarName, work->itemsPerRow, work->nrCols);
- }
- else {
- sprintf(tmp, "%s.%s += (startRow + %s * %lu) * %s + startCol;\n",
- ptrName, vfield, lidVarName, work->nrRows, gpriv->globLDName);
- }
- }
-
- kgenAddStmt(ctx, tmp);
- kgenAddBlankLine(ctx);
-}
-
-/*
- * Add statement setting initial coordinates pointer for image
- *
- */
-static void
-addSettingImageXYCode(
- struct KgenContext *ctx,
- const char *xName,
- const char *yName,
- const PGranularity *pgran,
- GenPriv *gpriv)
-{
- char tmp[4096];
- const ItemWork *work = gpriv->work;
- size_t gsize = pgran->wgSize[0] * pgran->wgSize[1];
-
- if (gpriv->packed) {
- sprintf(tmp, "pLine = ((get_image_width(dst) - startX) * %d / %lu) * %lu;\n",
- FLOAT4_VECLEN / gpriv->nfloats, gpriv->dim->x, gpriv->lmemLD);
- kgenAddStmt(ctx, tmp);
- if (gpriv->dim->y < gsize) {
- sprintf(tmp, "index = %s / %u;\n", lidVarName,
- work->itemsPerRow);
- }
- else {
- sprintf(tmp, "index = %s * %lu;\n", lidVarName,
- work->nrRows);
- }
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "x = startX + (index * %lu) %% pLine / %u;\n", gpriv->dim->x,
- FLOAT4_VECLEN / gpriv->nfloats);
- kgenAddStmt(ctx, tmp);
- if (gpriv->dim->y < gsize) {
- sprintf(tmp, "x += (%s %% %u) * (%lu / %u / %u);\n", lidVarName,
- work->itemsPerRow, gpriv->dim->x,
- (FLOAT4_VECLEN / gpriv->nfloats), work->itemsPerRow);
- kgenAddStmt(ctx, tmp);
- }
- sprintf(tmp, "y = startY + (index * %lu) / pLine;\n", gpriv->dim->x);
- kgenAddStmt(ctx, tmp);
- }
- else {
- if (gpriv->dim->y < gsize) {
- sprintf(tmp, "%s = startX + %s %% %u * %lu / %d;\n",
- xName, lidVarName, work->itemsPerRow, work->nrCols,
- FLOAT4_VECLEN/gpriv->nfloats);
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "%s = startY + %s / %u;\n", yName, lidVarName,
- work->itemsPerRow);
- kgenAddStmt(ctx, tmp);
- }
- else {
- sprintf(tmp, "%s = startX;\n", xName);
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, "%s = startY + %s * %lu;\n", yName, lidVarName,
- gpriv->work->nrRows);
- kgenAddStmt(ctx, tmp);
- }
- }
-
- kgenAddBlankLine(ctx);
-}
-
-// generator working with subproblems of any dimension
-static int
-copyDBlockGenericGen(
- struct KgenContext *ctx,
- const PGranularity *pgran,
- GenPriv *gpriv)
-{
- char fpref;
- const char varPref[2] = {'G', 'L'};
- char tmp[1024];
- bool image;
- const char *s[3];
- int gdir;
- unsigned int i, n, gsize;
- const char *vfield;
- DataType dtype = gpriv->dtype;
-
- fpref = dtypeToPrefix(dtype);
- if (!fpref || (fpref == 'i')) {
- return -EINVAL;
- }
-
- image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE ||
- gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);
- s[0] = (gpriv->transp) ? "Transp" : "";
- vfield = dtypeUPtrField(dtype);
- n = FLOAT4_VECLEN / gpriv->nfloats;
- gsize = pgran->wgSize[0] * pgran->wgSize[1];
-
- if (image) {
- char srcStr[1024];
- s[1] = (gpriv->packed) ? "Pack" : "";
- if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
- sprintf(srcStr, "src.%s += (startRow + lid * n) *"
- " srcLD + startCol;\n", vfield);
- sprintf(tmp, copyMemGImgDBlockSlowDecl, fpref, s[1]);
- }
- else {
- sprintf(srcStr, "src.%s += srcLD * lid * n;\n", vfield);
- sprintf(tmp, copyMemLImgDBlockSlowDecl, fpref, s[1]);
- }
- kgenDeclareFunction(ctx, tmp);
- kgenBeginFuncBody(ctx);
- sprintf(tmp, "int x, y;\n"
- "uint i, j, n, jb, jv;\n"
- "int lsize = %u;\n", gsize);
- kgenAddStmt(ctx, tmp);
- kgenDeclareLocalID(ctx, "lid", pgran);
- if (gpriv->packed) {
- char nLinesStr[1024];
- sprintf(nLinesStr,
- "nLines = (get_image_width(dst) - startX) * %d / nrCols;\n"
- "index = lid * n;\n", FLOAT4_VECLEN / gpriv->nfloats);
- sprintf(tmp, "int nLines, index;\n");
- kgenAddStmt(ctx, tmp);
- sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n,"",
- nLinesStr, srcStr);
- }
- else {
- sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n, "",
- "x = startX;\n" "y = startY + lid * n;\n", srcStr);
- }
- kgenAddStmt(ctx, tmp);
-
- gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) ? 0 : 1;
- if (gpriv->packed) {
- sprintf(tmp, copyMemImgDBlockPackedSlow, varPref[gdir],
- FLOAT4_VECLEN / gpriv->nfloats, vfield);
- }
- else {
- sprintf(tmp, copyMemImgDBlockSlow, varPref[gdir], vfield);
- }
- kgenAddStmt(ctx, tmp);
- }
- else {
- LoopCtl loopCtl;
- LoopUnrollers unrollers;
- char buf[3][256];
-
- memset(&loopCtl, 0, sizeof(loopCtl));
- memset(&unrollers, 0, sizeof(unrollers));
-
- s[1] = (gpriv->conjugate) ? "Conj" : "";
- s[2] = (gpriv->notVectorize) ? "Nvec" : "";
- gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
- sprintf(tmp, copyMemDBlockSlowDecl,
- fpref, s[0], s[1], s[2], varPref[gdir], varPref[1 - gdir],
- varPref[1 - gdir], varPref[gdir]);
- kgenDeclareFunction(ctx, tmp);
- kgenBeginFuncBody(ctx);
- kgenDeclareLocalID(ctx, "lid", pgran);
- sprintf(tmp, "int lsize = %u;\n", gsize);
- kgenAddStmt(ctx, tmp);
-
- if (dtype == TYPE_COMPLEX_DOUBLE) {
- s[0] = "";
- s[1] = "";
- }
- else {
- s[0] = "uint js;\n";
- s[1] = (gpriv->transp || gpriv->conjugate) ? "float4 tmp;\n" : "";
- }
-
- // pass over rows or columns?
- i = (gpriv->transp && gdir) ? 1 : 0;
-
- if (dtype == TYPE_COMPLEX_DOUBLE) {
- buf[0][0] = '\0';
- }
- else {
- const char *boundName;
-
- // set counter bound to copy tail part, each work less than float4
- boundName = (i) ? "nrRows" : "nrCols";
-
- /*
- * FIXME: the kludge is introduced due to strange
- * runtime segfault at block transferring for another
- * data types. Verify it later. Now, for non float types
- * keep only simple loop.
- */
- if (i && (dtype != TYPE_FLOAT)) {
- gpriv->notVectorize = true;
- }
-
- if (gpriv->notVectorize) {
- sprintf(buf[0], "jb = 0;\n"
- "jv = 0;\n"
- "js = %s;\n",
- boundName);
- }
- else {
- sprintf(buf[0], "js = %s - jb * %u - jv * %u;\n",
- boundName, 4 * n, n);
- }
- }
-
- // set initial pointers
- if (!gdir) {
- sprintf(buf[1], "src.%s += (startRow + lid * n) * srcLD + "
- "startCol;\n", vfield);
- if (gpriv->transp) {
- sprintf(buf[2], "dst.%s += lid * n;\n", vfield);
- }
- else {
- sprintf(buf[2], "dst.%s += dstLD * lid * n;\n", vfield);
- }
- }
- else {
- if (gpriv->transp) {
- sprintf(buf[1], "src.%s += lid * n;\n", vfield);
- }
- else {
- sprintf(buf[1], "src.%s += srcLD * lid * n;\n", vfield);
- }
- sprintf(buf[2], "dst.%s += (startRow + lid * n) * dstLD + "
- "startCol;\n", vfield);
- }
-
- sprintf(tmp, copyMemSlowLvars, s[0], s[1],
- varPref[1 - gdir], varPref[gdir]);
- kgenAddStmt(ctx, tmp);
-
- sprintf(tmp, copyMemDBlockSlowStart[i],
- 4 * n, 4 * n, n, buf[0], buf[1], buf[2]);
- kgenAddStmt(ctx, tmp);
-
- // prepare to loop unrolling
- gpriv->srcName = "src1";
- gpriv->dstName = "dst1";
- if (gdir) {
- gpriv->locLDName = "srcLD";
- gpriv->globLDName = "dstLD";
- }
- else {
- gpriv->locLDName = "dstLD";
- gpriv->globLDName = "srcLD";
- }
-
- loopCtl.ocName = "j";
-
- if (gpriv->transp) {
- unrollers.genSingle = copyMemSingleTransp;
- if (dtype != TYPE_COMPLEX_DOUBLE) {
- unrollers.genSingleVec = copyMemVecTransp;
- }
- }
- else {
- unrollers.genSingle = copyMemSingle;
- if (dtype != TYPE_COMPLEX_DOUBLE) {
- unrollers.genSingleVec = copyMemVec;
- }
- }
-
- // external loop
- kgenBeginBranch(ctx, "for (i = 0; i < n; i++)");
- copyMemPreUnroll(ctx, gpriv);
-
- // finally, unroll all loops
- unrollers.getVecLen = getVecLen;
-
- // copying with 4 float4 words
- if (!gpriv->notVectorize) {
- loopCtl.outBound.name = "jb";
- loopCtl.inBound = 4 * n;
- kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
-
- // copying with float4 words
- loopCtl.outBound.name = "jv";
- loopCtl.inBound = n;
- kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
- }
-
- // copying the remaining tail
- if (dtype != TYPE_COMPLEX_DOUBLE) {
- unrollers.genSingleVec = NULL;
- loopCtl.outBound.name = "js";
- loopCtl.inBound = 1;
- kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
- }
-
- copyMemPostUnroll(ctx, gpriv);
- kgenEndBranch(ctx, NULL);
- }
-
- return kgenEndFuncBody(ctx);
-}
-
-// generator optimizing to a subproblem size
-static int
-copyDBlockOptimGen(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- const PGranularity *pgran,
- GenPriv *gpriv)
-{
- char fpref;
- const char varPref[2] = {'G', 'L'};
- char tmp[1024];
- // lead dimension for right and transposed local block in float words
- ItemWork work;
- LoopCtl loopCtl;
- LoopUnrollers unrollers;
- const char *s, *s1, *s2;
- bool image;
- SubproblemDim newDim;
- // copying direction within the memory or image related function group
- int gdir = 0;
- int r;
-
- fpref = dtypeToPrefix(gpriv->dtype);
- if (!fpref || (fpref == 'i')) {
- return -EINVAL;
- }
-
- image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE ||
- gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);
-
- memset(&unrollers, 0, sizeof(unrollers));
- memset(&loopCtl, 0, sizeof(loopCtl));
- memset(&newDim, 0, sizeof(newDim));
-
- gpriv->dim = &newDim;
- gpriv->work = (const ItemWork*)&work;
- gpriv->globLDName = "ld";
- s = (gpriv->transp) ? "Transp" : "";
- s1 = (gpriv->conjugate) ? "Conj" : "";
- s2 = (gpriv->notVectorize) ? "Nvec" : "";
-
- if ((gpriv->dir == DBLOCK_LOCAL_TO_GLOBAL) && gpriv->transp) {
- // pass over columns of the block stored in the local memory
- newDim.x = dim->y;
- newDim.y = dim->x;
- }
- else {
- // pass over rows
- newDim.x = dim->x;
- newDim.y = dim->y;
- }
-
- getItemWork(&work, &newDim, pgran, gpriv->nfloats, gpriv->vecLen);
-
- if (image) {
- s = (gpriv->packed) ? "Pack" : "";
- if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
- sprintf(tmp, copyMemGImgDBlockDecl, fpref, s, dim->y, dim->x);
- }
- else {
- sprintf(tmp, copyMemLImgDBlockDecl, fpref, s, dim->y, dim->x);
- }
-
- }
- else {
- gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
- sprintf(tmp, copyMemDBlockDecl, fpref, s, s1, s2, varPref[gdir],
- varPref[1 - gdir], dim->y, dim->x, varPref[1 - gdir],
- varPref[gdir]);
- }
-
- kgenDeclareFunction(ctx, tmp);
- kgenBeginFuncBody(ctx);
-
- kgenDeclareLocalID(ctx, lidVarName, pgran);
-
- if (image) {
- // data for loop unrolling
- if (work.nrRows > 1) {
- gpriv->srcName = "src1";
- gpriv->dstName = "dst";
- gpriv->imgXName="x1";
- gpriv->imgYName="y1";
- if(gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
- kgenAddStmt(ctx, "GPtr src1;\n");
- }
- else if(gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
- kgenAddStmt(ctx, "LPtr src1;\n");
- }
- kgenAddStmt(ctx, "int x1, y1;\n");
-
- unrollers.preUnroll = copyImgPreUnroll;
- unrollers.postUnroll = copyImgPostUnroll;
- }
- else {
- gpriv->srcName = "src";
- // dst has image2d_t type here
- gpriv->dstName = "dst";
- gpriv->imgXName="x";
- gpriv->imgYName="y";
- }
- }
- else {
- if ((gpriv->nfloats != FLOAT4_VECLEN) &&
- (gpriv->transp || gpriv->conjugate)) {
-
- /*
- * temporary variable to transpose or conjugate non double
- * complex elements
- */
- kgenAddStmt(ctx, "float4 tmp;\n");
- }
-
- if (work.nrRows > 1) {
- sprintf(tmp, privatePtrs, varPref[gdir], varPref[1 - gdir]);
- kgenAddStmt(ctx, tmp);
-
- // data for loop unrolling
- unrollers.preUnroll = copyMemPreUnroll;
- unrollers.postUnroll = copyMemPostUnroll;
- gpriv->srcName = "src1";
- gpriv->dstName = "dst1";
- }
- else {
- gpriv->srcName = "src";
- gpriv->dstName = "dst";
- }
- }
-
- if ((work.nrRows > 1) || work.nrItems) {
- prepareLoop(ctx, &work, &loopCtl);
- }
- kgenAddBlankLine(ctx);
- loopCtl.inBound = (unsigned long)work.nrCols;
-
- // now, prepare all needed for loop unrolling
-
- if (image) {
- kgenAddStmt(ctx, "int x, y;\n");
- if (gpriv->packed) {
- kgenAddStmt(ctx, "int pLine, index;\n");
- }
- gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) *
- FLOAT4_VECLEN / gpriv->nfloats;
- // set up starting x and y in image
- addSettingImageXYCode(ctx, "x", "y", pgran, gpriv);
-
- if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
- // set initial global pointer
- addSettingPtrCode(ctx, "src", 0, false, pgran, gpriv);
- }
- else if (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
- // set initial local pointer
- addSettingPtrCode(ctx, "src", gpriv->lmemLD, gpriv->transp,
- pgran, gpriv);
- }
-
- unrollers.genSingleVec = copyImgVec;
- unrollers.genSingle = copyImgSingle;
- }
- else {
- // set initial global pointer
- s = (gdir) ? "dst" : "src";
- addSettingPtrCode(ctx, s, 0, false, pgran, gpriv);
-
- s = (gdir) ? "src" : "dst";
-
- if (!gdir && gpriv->transp) {
- gpriv->lmemLD = fl4RowWidth(dim->y, gpriv->typeSize) *
- FLOAT4_VECLEN / gpriv->nfloats;
- }
- else {
- gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) *
- FLOAT4_VECLEN / gpriv->nfloats;
- }
-
- if (gpriv->transp) {
- unrollers.genSingleVec = (gpriv->notVectorize) ? NULL :
- copyMemVecTransp;
- unrollers.genSingle = copyMemSingleTransp;
- }
- else {
- unrollers.genSingleVec = (gpriv->notVectorize) ? NULL : copyMemVec;
- unrollers.genSingle = copyMemSingle;
- }
-
- addSettingPtrCode(ctx, s, gpriv->lmemLD, gpriv->transp,
- pgran, gpriv);
- }
- unrollers.getVecLen = getVecLen;
-
- // unroll for float4 aligned data chunk
- kgenLoopUnroll(ctx, &loopCtl, gpriv->dtype, &unrollers, gpriv);
-
- /*
- * Unroll for remaining data tail.
- * Block tail reading/writing is done separately
- * when many work items process single row
- * because the compiler don't like any conditional
- * branches in loops
- */
- if ((unrollers.postUnroll == NULL) && work.tail) {
- addCopyTailCode(ctx, gpriv);
- }
-
- r = kgenEndFuncBody(ctx);
-
- return r ? -EOVERFLOW : 0;
-}
-
-int
-copyDataBlockGen(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- const PGranularity *pgran,
- DataType dtype,
- DBlockCopyDirection dir,
- DBlockCopyFlags flags)
-{
- int r;
- GenPriv gpriv;
- unsigned int tsize;
-
- tsize = dtypeSize(dtype);
-
- if (dir == DBLOCK_LOCAL_TO_IMAGE ||
- dir == DBLOCK_GLOBAL_TO_IMAGE) {
- size_t rowSize;
-
- if (dim != NULL) {
- rowSize = tsize * dim->x;
- if (rowSize % sizeof(cl_float4) != 0) {
- // only float4 aligned rows are supported
- return -EINVAL;
- }
- }
- if (flags & DBLOCK_COPY_TRANSPOSE) {
- return -EINVAL;
- }
- }
-
- memset(&gpriv, 0, sizeof(gpriv));
- gpriv.transp = (flags & DBLOCK_COPY_TRANSPOSE);
- gpriv.packed = (flags & DBLOCK_COPY_PACKED_IMAGE);
- if (dtype != TYPE_COMPLEX_DOUBLE) {
- gpriv.notVectorize = (flags & DBLOCK_COPY_NOT_VECTORIZE);
- }
- if ((flags & DBLOCK_COPY_CONJUGATE) && isComplexType(dtype)) {
- gpriv.conjugate = true;
- }
- initGenPriv(&gpriv, dtype, tsize, dim ,dir, NULL, pgran);
-
- if (dim) {
- r = copyDBlockOptimGen(ctx, dim, pgran, &gpriv);
- }
- else {
- r = copyDBlockGenericGen(ctx, pgran, &gpriv);
- }
- return r;
-}
-
-int
-f4zeroBlockGen(
- struct KgenContext *ctx,
- const SubproblemDim *dim,
- const PGranularity *pgran,
- const char *memPrefix)
-{
- char tmp[1024];
- ItemWork work;
- LoopCtl loopCtl;
- GenPriv priv;
- char pref;
- LoopUnrollers unrollers;
-
- if (!strcmp(memPrefix, "__local")) {
- pref = 'l';
- }
- else if (!strcmp(memPrefix, "__global")) {
- pref = 'g';
- }
- else {
- return -EINVAL;
- }
-
- if (dim->y != 1) {
- return -EINVAL;
- }
-
- memset(&loopCtl, 0, sizeof(loopCtl));
- memset(&unrollers, 0, sizeof(unrollers));
- memset(&priv, 0, sizeof(GenPriv));
- initGenPriv(&priv, TYPE_COMPLEX_DOUBLE, FLOAT4_VECLEN * sizeof(cl_float),
- dim, 0, (const ItemWork*)&work, pgran);
- getItemWork(&work, dim, pgran, priv.nfloats, priv.vecLen);
-
- sprintf(tmp, f4zeroDecl, pref, dim->x, memPrefix);
- kgenDeclareFunction(ctx, tmp);
- kgenBeginFuncBody(ctx);
-
- // declare local ID variable and set data offset
- kgenDeclareLocalID(ctx, lidVarName, pgran);
- sprintf(tmp, "\ndata += %s * %lu;\n\n",
- lidVarName, work.nrCols);
- kgenAddStmt(ctx, tmp);
-
- unrollers.genSingle = f4zeroSingle;
- loopCtl.inBound = (unsigned int)work.nrCols;
- unrollers.getVecLen = getVecLen;
-
- kgenLoopUnroll(ctx, &loopCtl, TYPE_COMPLEX_DOUBLE, &unrollers, &priv);
- if (work.tail) {
- addTailCode(ctx, &priv, NULL, f4zeroSingle);
- }
-
- return kgenEndFuncBody(ctx);
-}