summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/clTemplates/trmv.cl
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/clTemplates/trmv.cl')
-rw-r--r--external/clBLAS/src/library/blas/gens/clTemplates/trmv.cl931
1 files changed, 0 insertions, 931 deletions
diff --git a/external/clBLAS/src/library/blas/gens/clTemplates/trmv.cl b/external/clBLAS/src/library/blas/gens/clTemplates/trmv.cl
deleted file mode 100644
index 21af6ce5..00000000
--- a/external/clBLAS/src/library/blas/gens/clTemplates/trmv.cl
+++ /dev/null
@@ -1,931 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-
-// Column-Major Upper Case
-static const char *trmv_CU_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-#ifdef PACKED
- #define A( row, col) (*( A + ((col*(col+1))/2 + row)))
-#else
- #define A( row, col) A[ row + col * lda]
-#endif
-
-#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
-#define TARGET_WIDTH ((%BLOCKSIZE)/(TARGET_ROWS_BY_VEC))
-
-__kernel void %PREFIXtrmv_CU_kernel( __global %TYPE const* restrict _A, __global %TYPE * _xnew, __global %TYPE const* restrict _x_vector, uint N,
- int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
-#ifdef HEMV_ONLY
-, int incy, uint offy, %TYPE alpha, %TYPE beta
-#endif
- )
-{
- __global %TYPE const* x_vector;
- __global %TYPE* xnew;
- __global %TYPE const* restrict A;
-
- A = _A + offa;
- if ( incx < 0 ) // Goto end of vector
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx + ( N - 1) * abs(incx);
- #else
- x_vector = _x_vector + ( N - 1) * abs(incx);
- xnew = _xnew + (N - 1) * abs(incx) + offx;
- #endif
- }
- else
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx;
- #else
- x_vector = _x_vector;
- xnew = _xnew + offx;
- #endif
- }
-
- #ifdef HEMV_ONLY
- if(incy < 0)
- xnew = _xnew + offy + ( N - 1) * abs(incy);
- else
- xnew = _xnew + offy;
- #endif
-
-
- __local %TYPE sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
-
- const int gIdx = get_global_id(0);
- const int bIdx = get_group_id(0);
- const int threadIdx = get_local_id(0);
- const int TARGET_ROWS = %TARGET_ROWS;
-
- // Last block always targets the top rows
- // which may be less than or equal to 64
- int nBlocks = (N-1)/ %TARGET_ROWS + 1;
-
- if( bIdx == (nBlocks-1))
- {
- // Variables that don't change while looping
- int startRow = bIdx * %TARGET_ROWS;
- int destRow = (startRow + threadIdx) ;
- if( destRow >= N)
- {
- return;
- }
-
- //float acc = 0.0f;
- %TYPE acc = %MAKEVEC( 0.0);
- %TYPE accTemp = %MAKEVEC( 0.0);
-
- for ( int j= ( N - 1 ) ; j > destRow ; j--)
- {
- //acc += A( destRow, j) * x_vector[ j * incx];
- accTemp = A( destRow, j);
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ j * incx]);
- }
-
- if ( isUnity )
- {
- #ifdef HEMV_ONLY
- %TYPE acc1, temp;
- %MUL(acc1, acc, alpha);
- temp = xnew[ destRow * incy];
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- %ADD(xnew[ destRow * incx] , acc, x_vector[ destRow * incx]);
- #endif
- }
- else
- {
- //xnew[ destRow * incx] = acc + A( destRow , destRow) * x_vector[ destRow * incx];
- accTemp = A( destRow, destRow);
-
- #ifdef HEMV_ONLY
- #ifndef SPMV_ONLY
- // accTemp.odd = 0.0f;
- %CLEAR_IMAGINARY( accTemp );
- #endif
- #endif
-
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ destRow * incx]);
-
- #ifdef HEMV_ONLY
- %TYPE temp, acc1;
- %MUL(temp, xnew[ destRow * incy], beta);
- %MUL(acc1, acc, alpha);
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- xnew[ destRow * incx] = acc;
- #endif
- }
- }
- else
- {
- %TYPE sumTemp= %MAKEVEC( 0.0);
- %TYPE%V sum = %VMAKEVEC( sumTemp);
-
- // Variables that don't change while looping
- int startRow = bIdx * %TARGET_ROWS;
- //int rowShift = ((threadIdx & ( TARGET_ROWS_BY_VEC -1 )) * %V);
- int rowShift = ((threadIdx % (TARGET_ROWS_BY_VEC)) * %V);
- int colShift = threadIdx / TARGET_ROWS_BY_VEC;
-
- int row = startRow + rowShift;
-
- // gIdx is not destination row.
-
- // startRow may be less than 4
- // So nLoops will be negative
- // and the FOR loop doesn't execute
- int nLoops = (( N - (startRow + %TARGET_ROWS))/ TARGET_WIDTH) - 1;
-
- for( int j=0; j <= (nLoops); j++)
- {
- int startCol = N - (j + 1)* TARGET_WIDTH;
- int col = startCol + colShift;
-
- //
- // Only TARGET_WIDTH threads points are to be read from X-vector
- // We dont't use VLOAD here because incx could be > 1
- // Minimal prototyping shows that having separate loading code
- // for incx value of 1 does not change anything in performance
- // In fact, the extra IF costs us.
- //
- barrier(CLK_LOCAL_MEM_FENCE);
- if (threadIdx < TARGET_WIDTH)
- {
- sXData[threadIdx] = x_vector[(startCol + threadIdx) * incx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // TARGET_ROWS_BY_VEC way bank-conflict : May broadcast if TARGET_ROWS = BLOCKSIZE, which reduces occupancy
- // And we loose performance as we don't have enough blocks to hide memory access and compute latenties per MP
- %TYPE xData = sXData[colShift];
-
- //sum += vload4(0, &A( row, col)) * ((float4)( xData, xData, xData, xData));
- // ((float4)( xData, xData, xData, xData));
- %TYPE%V loadedA = %VLOAD(0, (&A( row, col)));
- %CONJUGATE(doConj, loadedA);
-
- %TYPE%V xDataTemp = %VMAKEVEC(xData);
- %VMAD( sum, loadedA, xDataTemp);
- }
-
-
- volatile __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
- volatile __local %TYPE* sData = sDataTemp;
- //sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Reduce each block by DTARGET_ROWS threads to generate DTARGET_ROWS acc values
- if ( threadIdx < %TARGET_ROWS)
- {
- //float acc = 0.0f;
- %TYPE acc = %MAKEVEC( 0.0);
- %TYPE accTemp = %MAKEVEC( 0.0);
- int desRow = (bIdx * %TARGET_ROWS)+ threadIdx;
-
- //#pragma unroll TARGET_WIDTH
- for( int j=0; j < TARGET_WIDTH; j++)
- {
- //acc += sData[ threadIdx + j * FTARGET_ROWS];
- %ADD(acc, acc, sData[ threadIdx + j * TARGET_ROWS]);
- }
-
- for ( int j= (N - (nLoops+1)* TARGET_WIDTH - 1) ; j > desRow; j--)
- {
- //acc += A( desRow, j) * x_vector[ j * incx];
- accTemp = A( desRow, j);
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ j * incx]);
- }
-
- if ( isUnity )
- {
- //%ADD(xnew[ desRow * incx], acc, x_vector[ desRow * incx]);
- #ifdef HEMV_ONLY
- %TYPE acc1, temp;
- %MUL(acc1, acc, alpha);
- temp = xnew[ desRow * incy];
- %ADD(xnew[ desRow * incy], temp, acc1);
- #else
- %ADD(xnew[ desRow * incx] , acc, x_vector[ desRow * incx]);
- #endif
- }
- else
- {
- // xnew[ desRow * incx] = acc + A( desRow, desRow) * x_vector[ desRow * incx];
- accTemp = A( desRow, desRow );
-
- #ifdef HEMV_ONLY
- #ifndef SPMV_ONLY
- //accTemp.odd = 0.0f;
- %CLEAR_IMAGINARY( accTemp );
- #endif
- #endif
-
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ desRow * incx]);
-
- #ifdef HEMV_ONLY
- %TYPE temp, acc1;
- %MUL(temp, xnew[ desRow * incy], beta);
- %MUL(acc1, acc, alpha);
- %ADD(xnew[ desRow * incy], temp, acc1);
- #else
- xnew[ desRow * incx] = acc;
- #endif
- }
- }
- barrier(CLK_GLOBAL_MEM_FENCE);
- }
-}";
-
-// Column-Major Lower Case
-
-static const char *trmv_CL_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-#ifdef PACKED
- #define A( row, col) (*( A + ((( col *((2*N) + 1 - col)) / 2) + (row - col))))
-#else
- #define A( row, col) A[ row + col * lda]
-#endif
-
-#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
-#define TARGET_WIDTH ((%BLOCKSIZE)/(TARGET_ROWS_BY_VEC))
-__kernel void %PREFIXtrmv_CL_kernel( __global %TYPE const* restrict _A, __global %TYPE* _xnew, __global %TYPE const* restrict _x_vector,
- uint N, int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
-#ifdef HEMV_ONLY
-, int incy, uint offy, %TYPE alpha, %TYPE beta
-#endif
- )
-{
- __global %TYPE* x_vector;
- __global %TYPE* xnew;
- __global %TYPE const * restrict A;
-
- A = _A + offa;
- if ( incx < 0 ) // Goto end of vector
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx + ( N - 1) * abs(incx);
- #else
- x_vector = _x_vector + ( N - 1) * abs(incx);
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- #endif
- }
- else
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx;
- #else
- x_vector = _x_vector;
- xnew = _xnew + offx;
- #endif
- }
-
-
- #ifdef HEMV_ONLY
- if(incy < 0)
- xnew = _xnew + offy + ( N - 1) * abs(incy);
- else
- xnew = _xnew + offy;
- #endif
-
- __local %TYPE sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
-
- size_t gIdx = get_global_id(0);
- size_t bIdx = get_group_id(0);
- size_t threadIdx = get_local_id(0);
- int TARGET_ROWS = %TARGET_ROWS;
-
- // Last block always targets the top rows
- // which may be less than or equal to 64
- size_t nBlocks = (N-1)/ %TARGET_ROWS + 1;
-
-
- if( bIdx == (nBlocks-1))
- {
- // Target row of xNew is given by threadIdx
- size_t lastRow = (N - (nBlocks -1) * %TARGET_ROWS) -1;
-
- if( threadIdx > lastRow )
- {
- return;
- }
-
- //float acc = 0.0f;
- %TYPE acc = %MAKEVEC( 0.0);
- %TYPE accTemp = %MAKEVEC( 0.0);
-
- for ( int j= 0 ; j < threadIdx; j++)
- {
- //acc += A(threadIdx, j) * x_vector[ j * incx];
- accTemp = A(threadIdx, j);
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ j * incx]);
- }
-
- if ( isUnity )
- {
- #ifdef HEMV_ONLY
- %TYPE acc1, temp;
- %MUL(acc1, acc, alpha);
- temp = xnew[ threadIdx * incy];
- %ADD(xnew[ threadIdx * incy], temp, acc1);
- #else
- %ADD(xnew[ threadIdx * incx] , acc, x_vector[ threadIdx * incx]);
- #endif
- }
- else
- { //xnew[ threadIdx * incx] = acc + A(threadIdx, threadIdx) * x_vector[ threadIdx * incx];
- accTemp = A(threadIdx, threadIdx);
-
- #ifdef HEMV_ONLY
- #ifndef SPMV_ONLY
- //accTemp.odd = 0.0f;
- %CLEAR_IMAGINARY( accTemp );
- #endif
- #endif
-
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ threadIdx * incx]);
-
- #ifdef HEMV_ONLY
- %TYPE temp, acc1;
- %MUL(temp, xnew[ threadIdx * incy], beta);
- %MUL(acc1, acc, alpha);
- %ADD(xnew[ threadIdx * incy], temp, acc1);
- #else
- xnew[ threadIdx * incx] = acc;
- #endif
- }
- }
- else
- {
- %TYPE sumTemp= %MAKEVEC( 0.0);
- %TYPE%V sum = %VMAKEVEC( sumTemp);
-
- // Variables that don't change while looping
- size_t startRow = N - (bIdx + 1)* %TARGET_ROWS;
- //size_t rowShift = ((threadIdx & ( TARGET_ROWS_BY_VEC -1 )) * %V);
- size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
- size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
-
- size_t row = startRow + rowShift;
-
- // gIdx is not destination row.
- size_t desRow = startRow + threadIdx;
-
- // startRow may be less than 4
- // So nLoops will be negative
- // and the FOR loop doesn't execute
- int nLoops = ( startRow / TARGET_WIDTH) - 1;
-
- for( int j=0; j <= (nLoops); j++)
- {
- size_t startCol = j * TARGET_WIDTH;
- size_t col = startCol + colShift;
-
- //
- // Only TARGET_WIDTH threads points are to be read from X-vector
- // We dont't use VLOAD here because incx could be > 1
- // Minimal prototyping shows that having separate loading code
- // for incx value of 1 does not change anything in performance
- // In fact, the extra IF costs us.
- //
- barrier(CLK_LOCAL_MEM_FENCE);
- if (threadIdx < TARGET_WIDTH)
- {
- sXData[threadIdx] = x_vector[(startCol + threadIdx) * incx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // TARGET_ROWS_BY_VEC way bank-conflict : May broadcast if TARGET_ROWS = BLOCKSIZE, which reduces occupancy
- // And we loose performance as we don't have enough blocks to hide memory access and compute latenties per MP
- %TYPE xData = sXData[colShift];
-
- //sum += vload4(0, &A( row, col)) * ((float4)( xData, xData, xData, xData));
- // ((float4)( xData, xData, xData, xData));
- %TYPE%V loadedA = %VLOAD(0, (&A( row, col)));
- %CONJUGATE(doConj, loadedA);
-
- %TYPE%V xDataTemp = %VMAKEVEC(xData);
- %VMAD(sum, loadedA, xDataTemp);
- }
-
-
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
- __local %TYPE* sData = sDataTemp;
- //sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Reduce each block by DTARGET_ROWS threads to generate DTARGET_ROWS acc values
- if ( threadIdx < %TARGET_ROWS)
- {
- //float acc = 0.0f;
- %TYPE acc = %MAKEVEC( 0.0);
- %TYPE accTemp = %MAKEVEC( 0.0);
-
- //#pragma unroll TARGET_WIDTH
- for( int j=0; j < TARGET_WIDTH; j++)
- {
- //acc += sData[ threadIdx + j * FTARGET_ROWS];
- %ADD(acc, acc, sData[ threadIdx + j * TARGET_ROWS]);
- }
-
- for ( int j= ((nLoops+1)* TARGET_WIDTH) ; j < desRow; j++)
- {
- //acc += A(desRow, j) * x_vector[ j * incx];
- accTemp = A(desRow, j);
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ j * incx]);
- }
-
- if ( isUnity )
- {
- #ifdef HEMV_ONLY
- %TYPE acc1, temp;
- %MUL(acc1, acc, alpha);
- temp = xnew[ desRow * incy];
- %ADD(xnew[ desRow * incy], temp, acc1);
- #else
- %ADD(xnew[ desRow * incx] , acc, x_vector[ desRow * incx]);
- #endif
- }
- else
- {
- // xnew[ desRow * incx] = acc + A(desRow, desRow) * x_vector[ desRow * incx];
- accTemp = A(desRow, desRow);
-
- #ifdef HEMV_ONLY
- #ifndef SPMV_ONLY
- //accTemp.odd = 0.0f;
- %CLEAR_IMAGINARY( accTemp );
- #endif
- #endif
-
- %CONJUGATE(doConj, accTemp);
- %MAD(acc, accTemp, x_vector[ desRow * incx]);
-
- #ifdef HEMV_ONLY
- %TYPE temp, acc1;
- %MUL(temp, xnew[ desRow * incy], beta);
- %MUL(acc1, acc, alpha);
- %ADD(xnew[ desRow * incy], temp, acc1);
- #else
- xnew[ desRow * incx] = acc;
- #endif
- }
- }
- }
-}";
-
-// Column-Major Lower Transpose
-// Threads : %PREFIXBLOCKSIZET, Blocks launched = (N -1) / %PREFIXTARGET_ROWST + 1
-/*
-#define %PREFIXVECTOR_SIZET %V
-#define %PREFIXTARGET_WIDTH_BY_VECT ( %PREFIXBLOCKSIZET / %PREFIXTARGET_ROWST )
-#define %PREFIXTARGET_WIDTHT ( %PREFIXTARGET_WIDTH_BY_VECT * %PREFIXVECTOR_SIZET )
-*/
-
-static const char *trmv_CLT_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-#ifdef PACKED
- #define A( row, col) (*( A + ((( col *((2*N) + 1 - col)) / 2) + (row - col))))
-#else
- #define A( row, col) A[ row + col * lda]
-#endif
-
-#define TARGET_WIDTH_BY_VEC ((%BLOCKSIZE) / (%TARGET_ROWS) )
-#define TARGET_WIDTH ((TARGET_WIDTH_BY_VEC) * (%V))
-__kernel void %PREFIXtrmv_CLT_kernel( __global %TYPE const* restrict _A, __global %TYPE * _xnew, __global %TYPE const* restrict _x_vector,
- uint N, int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
-#ifdef HEMV_ONLY
-, int incy, uint offy, %TYPE alpha, %TYPE beta
-#endif
- )
-{
- __global %TYPE* x_vector;
- __global %TYPE* xnew;
- __global %TYPE const * restrict A = _A + offa;
-
- if ( incx < 0 ) // Goto end of vector
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx + ( N - 1) * abs(incx);
- #else
- x_vector = _x_vector + ( N - 1) * abs(incx);
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- #endif
- }
- else
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx;
- #else
- x_vector = _x_vector;
- xnew = _xnew + offx;
- #endif
- }
-
-
- #ifdef HEMV_ONLY
- if(incy < 0)
- xnew = _xnew + offy + ( N - 1) * abs(incy);
- else
- xnew = _xnew + offy;
- #endif
-
- int gIdx = get_global_id(0);
- int blockIdx = get_group_id(0);
- int blockDim = get_local_size(0);
- int threadIdx = get_local_id(0);
-
- __local %TYPE xShared[TARGET_WIDTH];
-
- int startCol = blockIdx * %TARGET_ROWS;
-
- %TYPE accTemp= %INIT( 0.0);
- %TYPE%V acc = %VMAKEVEC( accTemp);
-
- //size_t rowShift = ((threadIdx & ( TARGET_WIDTH_BY_VEC -1 )) * %V);
- size_t rowShift = ((threadIdx % ( TARGET_WIDTH_BY_VEC )) * %V);
- size_t colShift = threadIdx / TARGET_WIDTH_BY_VEC;
- size_t col = startCol + colShift;
- int startRow;
-
- for( startRow = (N - TARGET_WIDTH); ( startCol + %TARGET_ROWS - 1 ) < startRow; startRow = (startRow - TARGET_WIDTH))
- {
- // Load X data into Shared memory
- barrier(CLK_LOCAL_MEM_FENCE);
- if (threadIdx < TARGET_WIDTH)
- {
- xShared[threadIdx] = x_vector[ (startRow + threadIdx) * incx];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
- %TYPE%V xData;
- __local %TYPE%V* xSharedTemp = (xShared + rowShift);
- xData = *(xSharedTemp);
-
- int row = startRow + rowShift;
- //acc += vload4(0, &A(row, col)) * xData;
- %TYPE%V loadedA = %VLOAD( 0, (&A(row, col)) );
- %CONJUGATE(doConj, loadedA);
- %VMAD(acc, loadedA, xData);
- }
- // Restore startRow
- startRow += TARGET_WIDTH;
-
- __local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
- __local %TYPE* sData = sDataTemp;
-
- // blocks that did vectorLoads
- bool vectorBlocks = ( startRow != N);
- if ( vectorBlocks )
- {
-
- //sDataTemp[ ( threadIdx & ( TARGET_WIDTH_BY_VEC -1 ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
- sDataTemp[ ( threadIdx % ( TARGET_WIDTH_BY_VEC ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- }
-
- %TYPE sum = %MAKEVEC( 0.0);
- %TYPE loadedA = %MAKEVEC( 0.0);
-
- if( threadIdx < %TARGET_ROWS && ( (startCol + threadIdx) < N))
- {
- if ( vectorBlocks )
- {
- //#pragma unroll TARGET_WIDTH
- for( int i=0 ; i < TARGET_WIDTH; i++)
- {
- %ADD(sum, sum, sData[i + (threadIdx * TARGET_WIDTH )]);
- }
-
- }
-
- int destRow = blockIdx * %TARGET_ROWS + threadIdx;
-
- // Loop from startRow - 1 till destRow
- for( int i= ( startRow - 1); i > destRow; i--)
- {
- loadedA = A(i, destRow);
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, x_vector[ i * incx]);
- }
- if ( isUnity)
- {
- #ifdef HEMV_ONLY
- %TYPE acc1, temp;
- %MUL(acc1, sum, alpha);
- temp = xnew[ destRow * incy];
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- %ADD(xnew[ destRow * incx] , sum, x_vector[ destRow * incx]);
- #endif
- }
- else
- {
- loadedA = A(destRow, destRow);
-
- #ifdef HEMV_ONLY
- #ifndef SPMV_ONLY
- //loadedA.odd = 0.0f;
- %CLEAR_IMAGINARY( loadedA );
- #endif
- #endif
-
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, x_vector[ destRow * incx]);
-
- #ifdef HEMV_ONLY
- %TYPE temp, acc1;
- %MUL(temp, xnew[ destRow * incy], beta);
- %MUL(acc1, sum, alpha);
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- xnew[ destRow * incx] = sum;
- #endif
- }
- }
-}";
-
-
-
-// Column-Major Upper Transpose
-// Threads : %PREFIXBLOCKSIZET, Blocks launched = (N -1) / %PREFIXTARGET_ROWST + 1
-static const char *trmv_CUT_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-#ifdef PACKED
- #define A( row, col) (*( A + ((col*(col+1))/2 + row)))
-#else
- #define A( row, col) A[ row + col * lda]
-#endif
-
-#define TARGET_WIDTH_BY_VEC ((%BLOCKSIZE) / (%TARGET_ROWS) )
-#define TARGET_WIDTH ((TARGET_WIDTH_BY_VEC) * (%V))
-
-__kernel void %PREFIXtrmv_CUT_kernel( __global %TYPE const* restrict _A, __global %TYPE * _xnew, __global %TYPE const* restrict _x_vector,
- uint N, int incx, int isUnity, uint lda, int doConj, uint offa, uint offx
-#ifdef HEMV_ONLY
-, int incy, uint offy, %TYPE alpha, %TYPE beta
-#endif
- )
-{
- __global %TYPE* x_vector;
- __global %TYPE* xnew;
- __global %TYPE const* restrict A = _A + offa;
-
- if ( incx < 0 ) // Goto end of vector
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx + ( N - 1) * abs(incx);
- #else
- x_vector = _x_vector + ( N - 1) * abs(incx);
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- #endif
- }
- else
- {
- #ifdef HEMV_ONLY
- x_vector = _x_vector + offx;
- #else
- x_vector = _x_vector;
- xnew = _xnew + offx;
- #endif
- }
-
-
- #ifdef HEMV_ONLY
- if(incy < 0)
- xnew = _xnew + offy + ( N - 1) * abs(incy);
- else
- xnew = _xnew + offy;
- #endif
-
- int gIdx = get_global_id(0);
- int blockIdx = get_group_id(0);
- int blockDim = get_local_size(0);
- int threadIdx = get_local_id(0);
-
- __local %TYPE xShared[TARGET_WIDTH];
-
- int startRow = 0;
- int startCol = N - (blockIdx + 1)* %TARGET_ROWS;
-
- // Do scalar if this condition is true
- if ( (startRow + TARGET_WIDTH - 1 ) >= startCol)
- {
- int destRow = (startCol + threadIdx) ;
-
- if( (threadIdx < %TARGET_ROWS) && ( destRow >= 0))
- {
- %TYPE sum = %MAKEVEC(0.0);
- %TYPE accTemp = %MAKEVEC(0.0);
-
- // Loop from (startRow - 1) till destRow
- for( int i= 0; i < destRow; i++)
- {
- accTemp = A(i, destRow);
- %CONJUGATE(doConj, accTemp);
- %MAD(sum, accTemp, x_vector[ i * incx]);
- }
- if ( isUnity)
- {
- #ifdef HEMV_ONLY
- %TYPE acc1, temp;
- %MUL(acc1, sum, alpha);
- temp = xnew[ destRow * incy];
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- %ADD(xnew[ destRow * incx] , sum, x_vector[ destRow * incx]);
- #endif
- }
- else
- {
- accTemp = A(destRow, destRow);
-
- #ifdef HEMV_ONLY
- #ifndef SPMV_ONLY
- //accTemp.odd = 0.0f;
- %CLEAR_IMAGINARY( accTemp );
- #endif
- #endif
-
- %CONJUGATE(doConj, accTemp);
- %MAD(sum, accTemp, x_vector[ destRow * incx]);
-
- #ifdef HEMV_ONLY
- %TYPE temp, acc1;
- %MUL(temp, xnew[ destRow * incy], beta);
- %MUL(acc1, sum, alpha);
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- xnew[ destRow * incx] = sum;
- #endif
- }
- }
- }
- else
- {
- // float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
- %TYPE accTemp = %MAKEVEC( 0.0);
- %TYPE%V acc = %VMAKEVEC(accTemp);
-
- //size_t rowShift = ((threadIdx & ( TARGET_WIDTH_BY_VEC -1 )) * %V);
- size_t rowShift = ((threadIdx % ( TARGET_WIDTH_BY_VEC )) * %V);
- size_t colShift = threadIdx / TARGET_WIDTH_BY_VEC;
-
- int col = startCol + colShift;
-
- for( int i=0; ; i++)
- {
- // Load X data into Shared memory
- barrier(CLK_LOCAL_MEM_FENCE);
- if (threadIdx < TARGET_WIDTH)
- {
- xShared[threadIdx] = x_vector[ (startRow + threadIdx) * incx];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
- %TYPE%V xData;
- __local %TYPE%V* xSharedTemp = (xShared + rowShift);
- xData = *(xSharedTemp);
-
- int row = startRow + rowShift;
- // acc += vload4(0, &A(row,col)) * xData;
- %TYPE%V loadedA = %VLOAD( 0, (&A(row, col)));
- %CONJUGATE(doConj, loadedA);
- %VMAD(acc, loadedA, xData);
-
- startRow = startRow + TARGET_WIDTH;
- if ( (startRow + TARGET_WIDTH - 1) >= startCol)
- {
- break;
- }
- }
-
- //__local float4 sData[16][4];
- //sData[(threadIdx & 15)][colShift] = acc;
- //barrier(CLK_LOCAL_MEM_FENCE);
- __local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
- __local %TYPE* sData = sDataTemp;
-
- //sDataTemp[ ( threadIdx & ( TARGET_WIDTH_BY_VEC -1 ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
- sDataTemp[ ( threadIdx % ( TARGET_WIDTH_BY_VEC ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
- %TYPE sum = %MAKEVEC( 0.0);
- %TYPE loadedA = %MAKEVEC( 0.0);
-
-
- if( threadIdx < %TARGET_ROWS )
- {
- //#pragma unroll TARGET_WIDTH
- for( int i=0 ; i < TARGET_WIDTH; i++)
- {
- %ADD(sum, sum, sData[i + (threadIdx * TARGET_WIDTH )]);
- }
-
- int destRow = (startCol + threadIdx) ;
-
- // Loop from startRow - 1 till destRow
- for( int i= startRow; i < destRow; i++)
- {
- loadedA = A(i, destRow);
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, x_vector[ i * incx]);
- }
- if ( isUnity)
- {
- #ifdef HEMV_ONLY
- %TYPE acc1, temp;
- %MUL(acc1, sum, alpha);
- temp = xnew[ destRow * incy];
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- %ADD(xnew[ destRow * incx] , sum, x_vector[ destRow * incx]);
- #endif
- }
- else
- {
- loadedA = A(destRow, destRow);
-
- #ifdef HEMV_ONLY
- #ifndef SPMV_ONLY
- //loadedA.odd = 0.0f;
- %CLEAR_IMAGINARY( loadedA );
- #endif
- #endif
-
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, x_vector[ destRow * incx]);
-
- #ifdef HEMV_ONLY
- %TYPE temp, acc1;
- %MUL(temp, xnew[ destRow * incy], beta);
- %MUL(acc1, sum, alpha);
- %ADD(xnew[ destRow * incy], temp, acc1);
- #else
- xnew[ destRow * incx] = sum;
- #endif
- }
- }
- }
-}";
-
-