summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/clTemplates/trsv_gemv.cl
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/clTemplates/trsv_gemv.cl')
-rw-r--r--external/clBLAS/src/library/blas/gens/clTemplates/trsv_gemv.cl1487
1 files changed, 0 insertions, 1487 deletions
diff --git a/external/clBLAS/src/library/blas/gens/clTemplates/trsv_gemv.cl b/external/clBLAS/src/library/blas/gens/clTemplates/trsv_gemv.cl
deleted file mode 100644
index f1f6cd4f..00000000
--- a/external/clBLAS/src/library/blas/gens/clTemplates/trsv_gemv.cl
+++ /dev/null
@@ -1,1487 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-// Compute Rectangle + Traingle
-
-const char * trsv_CU_ComputeRectangle_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
-#ifdef PACKED
- #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
-#else
- #define A( row, col) A[ (row) + (col) * lda]
-#endif
-
-__kernel void %PREFIXtrsv_CU_ComputeRectangle_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
-{
- __global %TYPE* xnew;
- __global %TYPE* A = _A + offa;
-
- if ( incx < 0 ) // Goto end of vector
- {
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- }
- else
- {
- xnew = _xnew + offx;
- }
-
- size_t bIdx = get_group_id(0);
- size_t threadIdx= get_local_id(0);
-
- // Get total blocks launched
- size_t nBlocks = ((rowsLeft - 1) / %TARGET_ROWS) + 1;
-
- %TYPE sum = %MAKEVEC( 0.0);
- %TYPE loadedA = %MAKEVEC( 0.0);
-
- // First Block does scalar stuff...
- // Only this gets executed if nBlocks == 1
- if ( bIdx == 0)
- {
- int targetCol = startCol;
- int targetRow = threadIdx;
- int lastRow = rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
-
- if ( nBlocks > 1)
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol--;
- }
-
- %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
- }
- }
- else // Solve the traingle -- no more kernel launches required
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol--;
- }
- }
-
- // Change targetCol to point to Triangle last column for all threads
- // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
- targetCol = startCol - %TARGET_ROWS;
-
- __local %TYPE xShared; // To share solved x value with other threads..
-
- for( int i=0; i < (lastRow + 1); i++)
- {
- if ( targetRow == targetCol)
- {
- %TYPE xVal = xnew[ targetRow * incx];
- %SUB(sum, xVal, sum);
- xShared = sum;
- xnew[ targetRow * incx ] = xShared;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( targetRow < targetCol)
- {
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
- }
-
- // Avoid Race
- barrier(CLK_LOCAL_MEM_FENCE);
- targetCol--;
- }
- }
- }
- else
- {
- size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
- size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
-
- int rowStart = rowsLeft - ( %TARGET_ROWS * (nBlocks - bIdx) );
- int row = rowStart + rowShift;
-
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sum = %VMAKEVEC(sumTemp);
-
- __local %TYPE xData[ %TARGET_WIDTH];
-
- //#pragma unroll
- for( int i=1; i <= %NLOOPS; i++)
- {
- // Put startCol to start of BLOCKSIZE Block
- int startColp = startCol - (%TARGET_WIDTH * i) + 1;
-
- if ( threadIdx < %TARGET_WIDTH)
- {
- xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int col = startColp + colShift;
-
- %TYPE xDataVal = xData[ colShift ];
- %TYPE%V xDataVec= %VMAKEVEC( xDataVal);
-
- %TYPE%V loadedA = %VLOAD( 0, &A((row), (col)));
- %CONJUGATE(doConj, loadedA);
- %VMAD(sum, loadedA, xDataVec);
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
- sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //int TARGET_ROWS = %TARGET_ROWS;
-
- // Last Block
- // Do Scalar reduction for last block
- // Followed by solving the triangle
- if ( bIdx == ( nBlocks - 1))
- {
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sumVec = %VMAKEVEC(sumTemp);
- %TYPE%V loadedAVec = %VMAKEVEC(sumTemp);
-
- //int targetRow = rowStart + threadIdx;
- int targetCol = startCol- %TARGET_ROWS; // Col where triangle last col overlaps
-
- // Do vector reduction
- if ( threadIdx < TARGET_ROWS_BY_VEC )
- {
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
- }
-
- __local %TYPE xShared[%V];
-
- int targetRowTemp = rowStart + threadIdx * %V;
- int VECTOR_SIZE = %V;
-
- //#pragma unroll
- for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
- {
- if ( threadIdx == (TARGET_ROWS_BY_VEC - 1 - i))
- {
- // Read X-vector
- %TYPE xVal[%V];
- //#pragma unroll
- for( int j = 0; j < %V; j++)
- {
- xVal[j] = xnew[ (targetRowTemp + j)* incx];
- }
-
- // Read A %Vx%V region into reg
- %TYPE reg[%V][%V];
- //#pragma unroll
- for( int idx = 0; idx < ( %V * %V); idx++)
- {
- int m = idx / ( %V ); // Row : Col-Major idx...
- int n = idx % ( %V ); // Col
- if ( n > m )
- {
- reg[m][n] = A( (targetRowTemp + m), (targetCol -( %V - 1 - n)));
- %CONJUGATE(doConj, reg[m][n]);
- }
- }
-
- %TYPE sumVecReg[%V];
- %VSTOREWITHINCX(sumVecReg, sumVec, 1);
-
- // Solve for first x - Do the rest in loop
- %TYPE x[%V];
- %SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
- xShared[%V - 1] = x[%V - 1];
- xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
-
- //#pragma unroll
- for(int m= ( %V - 2); m >=0; m--)
- {
- %SUB(x[m], xVal[m], sumVecReg[m]);
- }
-
- //#pragma unroll
- for( int idx = (( ( %V * %V) - 1) - %V); idx > 0; idx--)
- {
- int m = idx / %V; // Row : Row-Major idx, x[3] is solved before x[2]
- int n = idx % ( %V );// Col
- if ( n > m)
- {
- //x[m] = x[m] - reg[m][n] * x[n];
- %MAD(x[m], reg[m][n], (-x[n]));
- }
- }
-
- // Store results
- //#pragma unroll
- for(int m = 0; m < %V; m++)
- {
- xShared[m] = x[m];
- xnew[ (targetRowTemp + m)* incx ] = x[m];
- }
- }
-
-
- // Sync so that xShared it available to all threads
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
- {
- //#pragma unroll
- for( int j=0; j < %V; j++)
- {
- //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
- %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
- %CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
- }
- }
-
- targetCol = targetCol - %V;
- // Avoid Race...
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- }
- else
- {
- // Do Vector Reduction on each block except the last Block
- if ( threadIdx < TARGET_ROWS_BY_VEC)
- {
- %TYPE accTemp = %MAKEVEC(0.0);
- %TYPE%V acc = %VMAKEVEC(accTemp);
-
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
-
- // Store the result
- int targetRow = rowStart + threadIdx * %V;
-
- __global %TYPE* xNewPtr = xnew + targetRow * incx;
- //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
- %TYPE%V value;
- %VLOADWITHINCX(value, xNewPtr, incx);
-
- // Compute result
- %SUB(value, value, acc);
-
- // Store results
- //VSTOREWITHINCX( xNewPtr, value, incx);
- %VSTOREWITHINCX(xNewPtr, value, incx);
- }
- }
- }
-}
-";
-
-const char *trsv_CU_ComputeRectangle_NonUnity_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
-#ifdef PACKED
- #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
-#else
- #define A( row, col) A[ (row) + (col) * lda]
-#endif
-// Compute Rectangle + Traingle
-__kernel void %PREFIXtrsv_CU_ComputeRectangle_NonUnity_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
-{
- __global %TYPE* xnew;
- __global %TYPE* A = _A + offa;
-
- if ( incx < 0 ) // Goto end of vector
- {
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- }
- else
- {
- xnew = _xnew + offx;
- }
-
- size_t bIdx = get_group_id(0);
- size_t threadIdx= get_local_id(0);
-
- // Get total blocks launched
- size_t nBlocks = (rowsLeft - 1) / %TARGET_ROWS + 1;
-
- %TYPE sum = %MAKEVEC( 0.0);
- %TYPE loadedA = %MAKEVEC( 0.0);
-
- // First Block does scalar stuff...
- // Only this gets executed if nBlocks == 1
- if ( bIdx == 0)
- {
- int targetCol = startCol;
- int targetRow = threadIdx;
- int lastRow = rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
-
- if ( nBlocks > 1)
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol--;
- }
-
- %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
- }
- }
- else // Solve the traingle -- no more kernel launches required
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol--;
- }
- }
-
- // Change targetCol to point to Triangle last column for all threads
- // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
- targetCol = startCol - %TARGET_ROWS;
-
- __local %TYPE xShared; // To share solved x value with other threads..
-
- for( int i=0; i < (lastRow + 1); i++)
- {
- if ( targetRow == targetCol)
- {
- %TYPE xVal = xnew[ targetRow * incx];
- sum = xVal - sum;
-
- // Handle diagonal element
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
-
- xnew[ targetRow * incx ] = xShared;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( targetRow < targetCol)
- {
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
- }
-
- // Avoid Race
- barrier(CLK_LOCAL_MEM_FENCE);
- targetCol--;
- }
- }
- }
- else
- {
- size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
- size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
-
- int rowStart = rowsLeft - ( %TARGET_ROWS * (nBlocks - bIdx) );
- int row = rowStart + rowShift;
-
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sum = %VMAKEVEC(sumTemp);
-
- __local %TYPE xData[ %TARGET_WIDTH];
-
- //#pragma unroll
- for( int i=1; i <= %NLOOPS; i++)
- {
- // Put startCol to start of BLOCKSIZE Block
- int startColp = startCol - (%TARGET_WIDTH * i) + 1;
-
- if ( threadIdx < %TARGET_WIDTH)
- {
- xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int col = startColp + colShift;
-
- %TYPE xDataVal = xData[ colShift ];
- %TYPE%V xDataVec= %VMAKEVEC( xDataVal);
-
- %TYPE%V loadedA = %VLOAD( 0, &A((row), (col)));
- %CONJUGATE(doConj, loadedA);
- %VMAD(sum, loadedA, xDataVec);
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
- sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //int TARGET_ROWS = %TARGET_ROWS;
-
- // Last Block
- // Do Scalar reduction for last block
- // Followed by solving the triangle
- if ( bIdx == ( nBlocks - 1))
- {
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sumVec = %VMAKEVEC(sumTemp);
- %TYPE%V loadedAVec = %VMAKEVEC(sumTemp);
-
- //int targetRow = rowStart + threadIdx;
- int targetCol = startCol- %TARGET_ROWS; // Col where triangle last col overlaps
-
- // Do vector reduction
- if ( threadIdx < TARGET_ROWS_BY_VEC )
- {
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
- }
-
- __local %TYPE xShared[%V];
-
- int targetRowTemp = rowStart + threadIdx * %V;
- int VECTOR_SIZE = %V;
-
- //#pragma unroll
- for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
- {
- if ( threadIdx == (TARGET_ROWS_BY_VEC - 1 - i))
- {
- // Read X-vector
- %TYPE xVal[%V];
- //#pragma unroll
- for( int j = 0; j < %V; j++)
- {
- xVal[j] = xnew[ (targetRowTemp + j)* incx];
- }
-
- // Read A %Vx%V region into reg
- %TYPE reg[%V][%V];
- //#pragma unroll
- for( int idx = 0; idx < ( %V * %V); idx++)
- {
- int m = idx % ( %V ); // Row : Col-Major idx...
- int n = idx / ( %V ); // Col
- if ( n >= m )
- {
- reg[m][n] = A((targetRowTemp + m), (targetCol -( %V - 1 - n)));
- %CONJUGATE(doConj, reg[m][n]);
- }
- }
-
- %TYPE sumVecReg[%V];
- %VSTOREWITHINCX(sumVecReg, sumVec, 1);
-
- // Solve for first x - Do the rest in loop
- %TYPE x[%V];
- %SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
- %DIV(sumVecReg[VECTOR_SIZE - 1], x[VECTOR_SIZE -1], reg[VECTOR_SIZE - 1][VECTOR_SIZE - 1]);
- x[VECTOR_SIZE -1] = sumVecReg[VECTOR_SIZE - 1];
- xShared[%V - 1] = x[%V - 1];
- xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
-
- //#pragma unroll
- for(int m = ( %V - 2); m >=0; m--)
- {
- %SUB(x[m], xVal[m], sumVecReg[m]);
- }
-
- //#pragma unroll
- for( int idx = (( ( %V * %V) - 1) - %V); idx >= 0; idx--)
- {
- int m = idx / %V; // Row : Row-Major idx, x[3] is solved before x[2]
- int n = idx % ( %V );// Col
- if ( n > m)
- {
- //x[m] = x[m] - reg[m][n] * x[n];
- %MAD(x[m], reg[m][n], (-x[n]));
- }
- else if ( m == n)
- {
- %DIV(sumVecReg[m], x[m], reg[m][m]);
- x[m] = sumVecReg[m];
- }
- }
-
- // Store results
- //#pragma unroll
- for(int m = 0; m < %V; m++)
- {
- xShared[m] = x[m];
- xnew[ (targetRowTemp + m)* incx ] = x[m];
- }
- }
-
-
- // Sync so that xShared it available to all threads
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
- {
- //#pragma unroll
- for( int j=0; j < %V; j++)
- {
- //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
- %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
- %CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
- }
- }
-
- targetCol = targetCol - %V;
- // Avoid Race...
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- }
- else
- {
- // Do Vector Reduction on each block except the last Block
- if ( threadIdx < TARGET_ROWS_BY_VEC)
- {
- %TYPE accTemp = %MAKEVEC(0.0);
- %TYPE%V acc = %VMAKEVEC(accTemp);
-
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
-
- // Store the result
- int targetRow = rowStart + threadIdx * %V;
-
- __global %TYPE* xNewPtr = xnew + targetRow * incx;
- //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
- %TYPE%V value;
- %VLOADWITHINCX(value, xNewPtr, incx);
-
- // Compute result
- %SUB(value, value, acc);
-
- // Store results
- //VSTOREWITHINCX( xNewPtr, value, incx);
- %VSTOREWITHINCX(xNewPtr, value, incx);
- }
- }
- }
-
-}
-";
-
-
-const char *trsv_CL_ComputeRectangle_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
-#ifdef PACKED
- #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
-#else
- #define A(row, col) A[ (row) + (col) * lda]
-#endif
-// Compute Rectangle + Traingle
-__kernel void %PREFIXtrsv_CL_ComputeRectangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
-{
- __global %TYPE* xnew;
- __global %TYPE* A = _A + offa;
-
- if ( incx < 0 ) // Goto end of vector
- {
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- }
- else
- {
- xnew = _xnew + offx;
- }
-
- size_t bIdx = get_group_id(0);
- size_t threadIdx= get_local_id(0);
-
- // Get total blocks launched
- size_t nBlocks = (rowsLeft - 1) / %TARGET_ROWS + 1;
-
- %TYPE sum = %MAKEVEC( 0.0);
- %TYPE loadedA = %MAKEVEC( 0.0);
-
- // Last Block does scalar stuff...
- // Only this gets executed if nBlocks == 1
- if ( bIdx == (nBlocks - 1))
- {
- int targetCol = startCol;
- int startRow = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
- int targetRow = startRow + threadIdx;
- int lastRow = startRow + rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
-
- if ( nBlocks > 1)
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol++;
- }
-
- %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
- }
- }
- else // Solve the traingle -- no more kernel launches required
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol++;
- }
- }
-
- // Change targetCol to point to Triangle last column for all threads
- // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
- targetCol = startCol + %TARGET_ROWS;
-
- __local %TYPE xShared; // To share solved x value with other threads..
-
- for( int i=0; i < ((lastRow -startRow) + 1); i++)
- {
- if ( targetRow == targetCol)
- {
- %TYPE xVal = xnew[ targetRow * incx];
- sum = xVal - sum;
-
- if( isUnity)
- {
- xShared = sum;
- }
- else // Handle diagonal element
- {
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
- }
-
- xnew[ targetRow * incx ] = xShared;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( targetRow <= lastRow)
- {
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
- }
-
- // Avoid Race
- barrier(CLK_LOCAL_MEM_FENCE);
- targetCol++;
- }
- }
- }
- else
- {
- size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
- size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
-
- int rowStart = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
- int row = rowStart + rowShift;
-
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sum = %VMAKEVEC(sumTemp);
-
- __local %TYPE xData[ %TARGET_WIDTH];
-
- //#pragma unroll
- for( int i=1; i <= %NLOOPS; i++)
- {
- // Put startCol to start of BLOCKSIZE Block
- int startColp = startCol + (%TARGET_WIDTH * (i - 1));
-
- if ( threadIdx < %TARGET_WIDTH)
- {
- xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int col = startColp + colShift;
-
- %TYPE xDataVal = xData[ colShift ];
- %TYPE%V xDataVec= %VMAKEVEC( xDataVal);
-
- %TYPE%V loadedA = %VLOAD( 0, &A((row), (col)));
- %CONJUGATE(doConj, loadedA);
- %VMAD(sum, loadedA, xDataVec);
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
- sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //int TARGET_ROWS = %TARGET_ROWS;
-
- // Last Block
- // Do Scalar reduction for last block
- // Followed by solving the triangle
- if ( bIdx == 0 )
- {
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sumVec = %VMAKEVEC(sumTemp);
- %TYPE%V loadedAVec = %VMAKEVEC(sumTemp);
-
- //int targetRow = rowStart + threadIdx;
- int targetCol = startCol + %TARGET_ROWS; // Col where triangle last col overlaps
-
- // Do vector reduction
- if ( threadIdx < TARGET_ROWS_BY_VEC )
- {
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
- }
-
- __local %TYPE xShared[%V];
-
- int targetRowTemp = rowStart + threadIdx * %V;
- int VECTOR_SIZE = %V;
-
- //#pragma unroll
- for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
- {
- if ( threadIdx == i )
- {
- // Read X-vector
- %TYPE xVal[%V];
- //#pragma unroll
- for( int j = 0; j < %V; j++)
- {
- xVal[j] = xnew[ (targetRowTemp + j)* incx];
- }
-
- // Read A %Vx%V region into reg
- %TYPE reg[%V][%V];
- //#pragma unroll
- for( int idx = 0; idx < ( %V * %V); idx++)
- {
- int m = idx % ( %V ); // Row : Col-Major idx...
- int n = idx / ( %V ); // Col
- if ( m > n )
- {
- reg[m][n] = A((targetRowTemp + m), (targetCol + n));
- %CONJUGATE(doConj, reg[m][n]);
- }
- }
-
- %TYPE sumVecReg[%V];
- %VSTOREWITHINCX(sumVecReg, sumVec, 1);
-
- // Solve for first x - Do the rest in loop
- %TYPE x[%V];
- %SUB(x[0], xVal[0], sumVecReg[0]);
- xShared[0] = x[0];
- xnew[ (targetRowTemp)* incx ] = x[0];
-
- //#pragma unroll
- for(int m = 1; m < %V; m++)
- {
- %SUB(x[m], xVal[m], sumVecReg[m]);
- }
-
- //#pragma unroll
- for( int idx = %V; idx < (( %V * %V) - 1); idx++)
- {
- int m = idx / %V; // Row : Row-Major idx, x[1] is solved before x[2]
- int n = idx % ( %V );// Col
- if ( m > n)
- {
- //x[m] = x[m] - reg[m][n] * x[n];
- %MAD(x[m], reg[m][n], (-x[n]));
- }
- }
-
- // Store results
- //#pragma unroll
- for(int m = 0; m < %V; m++)
- {
- xShared[m] = x[m];
- xnew[ (targetRowTemp + m)* incx ] = x[m];
- }
- }
-
-
- // Sync so that xShared it available to all threads
- barrier(CLK_LOCAL_MEM_FENCE);
- if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
- {
- //#pragma unroll
- for( int j=0; j < %V; j++)
- {
- %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
- %CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[j]);
- }
- }
-
- targetCol = targetCol + %V;
- // Avoid Race...
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- }
- else
- {
- // Do Vector Reduction on each block except the last Block
- if ( threadIdx < TARGET_ROWS_BY_VEC)
- {
- %TYPE accTemp = %MAKEVEC(0.0);
- %TYPE%V acc = %VMAKEVEC(accTemp);
-
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
-
- // Store the result
- int targetRow = rowStart + threadIdx * %V;
-
- __global %TYPE* xNewPtr = xnew + targetRow * incx;
- //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
- %TYPE%V value;
- %VLOADWITHINCX(value, xNewPtr, incx);
-
- // Compute result
- %SUB(value, value, acc);
-
- // Store results
- %VSTOREWITHINCX(xNewPtr, value, incx);
- }
- }
- }
-}
-";
-
-const char *trsv_CL_ComputeRectangle_NonUnity_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-#define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V))
-#ifdef PACKED
- #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
-#else
- #define A(row, col) A[ (row) + (col) * lda]
-#endif
-// Compute Rectangle + Traingle
-__kernel void %PREFIXtrsv_CL_ComputeRectangle_NonUnity_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx)
-{
- __global %TYPE* xnew;
- __global %TYPE* A = _A + offa;
-
- if ( incx < 0 ) // Goto end of vector
- {
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- }
- else
- {
- xnew = _xnew + offx;
- }
-
- size_t bIdx = get_group_id(0);
- size_t threadIdx= get_local_id(0);
-
- // Get total blocks launched
- size_t nBlocks = (rowsLeft - 1) / %TARGET_ROWS + 1;
-
- %TYPE sum = %MAKEVEC( 0.0);
- %TYPE loadedA = %MAKEVEC( 0.0);
-
- // Last Block does scalar stuff...
- // Only this gets executed if nBlocks == 1
- if ( bIdx == (nBlocks - 1))
- {
- int targetCol = startCol;
- int startRow = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
- int targetRow = startRow + threadIdx;
- int lastRow = startRow + rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1;
-
- if ( nBlocks > 1)
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol++;
- }
-
- %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum);
- }
- }
- else // Solve the traingle -- no more kernel launches required
- {
- if ( targetRow <= lastRow)
- {
- for( int i=0; i < %TARGET_ROWS; i++)
- {
- // All threads look at same xnew
- // Should use Shared Memory ..
- %TYPE xVal = xnew[ targetCol * incx];
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xVal);
- targetCol++;
- }
- }
-
- // Change targetCol to point to Triangle last column for all threads
- // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
- targetCol = startCol + %TARGET_ROWS;
-
- __local %TYPE xShared; // To share solved x value with other threads..
-
- for( int i=0; i < ((lastRow -startRow) + 1); i++)
- {
- if ( targetRow == targetCol)
- {
- %TYPE xVal = xnew[ targetRow * incx];
- sum = xVal - sum;
-
- // Handle diagonal element
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
- xnew[ targetRow * incx ] = xShared;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( targetRow <= lastRow)
- {
- loadedA = A((targetRow), (targetCol));
- %CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
- }
-
- // Avoid Race
- barrier(CLK_LOCAL_MEM_FENCE);
- targetCol++;
- }
- }
- }
- else
- {
- size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V);
- size_t colShift = threadIdx / TARGET_ROWS_BY_VEC;
-
- int rowStart = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS;
- int row = rowStart + rowShift;
-
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sum = %VMAKEVEC(sumTemp);
-
- __local %TYPE xData[ %TARGET_WIDTH];
-
- //#pragma unroll
- for( int i=1; i <= %NLOOPS; i++)
- {
- // Put startCol to start of BLOCKSIZE Block
- int startColp = startCol + (%TARGET_WIDTH * (i - 1));
-
- if ( threadIdx < %TARGET_WIDTH)
- {
- xData[threadIdx] = xnew[ (startColp + threadIdx) * incx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int col = startColp + colShift;
-
- %TYPE xDataVal = xData[ colShift ];
- %TYPE%V xDataVec= %VMAKEVEC( xDataVal);
-
- %TYPE%V loadedA = %VLOAD( 0, &A((row), (col)));
- %CONJUGATE(doConj, loadedA);
- %VMAD(sum, loadedA, xDataVec);
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
- sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //int TARGET_ROWS = %TARGET_ROWS;
-
- // Last Block
- // Do Scalar reduction for last block
- // Followed by solving the triangle
- if ( bIdx == 0 )
- {
- %TYPE sumTemp = %MAKEVEC(0.0);
- %TYPE%V sumVec = %VMAKEVEC(sumTemp);
- %TYPE%V loadedAVec = %VMAKEVEC(sumTemp);
-
- //int targetRow = rowStart + threadIdx;
- int targetCol = startCol + %TARGET_ROWS; // Col where triangle last col overlaps
-
- // Do vector reduction
- if ( threadIdx < TARGET_ROWS_BY_VEC )
- {
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
- }
-
- __local %TYPE xShared[%V];
-
- int targetRowTemp = rowStart + threadIdx * %V;
- int VECTOR_SIZE = %V;
-
- //#pragma unroll
- for( int i=0; i < (TARGET_ROWS_BY_VEC); i++)
- {
- if ( threadIdx == i )
- {
- // Read X-vector
- %TYPE xVal[%V];
- //#pragma unroll
- for( int j = 0; j < %V; j++)
- {
- xVal[j] = xnew[ (targetRowTemp + j)* incx];
- }
-
- // Read A %Vx%V region into reg
- %TYPE reg[%V][%V];
- //#pragma unroll
- for( int idx = 0; idx < ( %V * %V); idx++)
- {
- int m = idx % ( %V ); // Row : Col-Major idx...
- int n = idx / ( %V ); // Col
- if ( m >= n )
- {
- reg[m][n] = A((targetRowTemp + m), (targetCol + n));
- %CONJUGATE(doConj, reg[m][n]);
- }
- }
-
- %TYPE sumVecReg[%V];
- %VSTOREWITHINCX(sumVecReg, sumVec, 1);
-
- // Solve for first x - Do the rest in loop
- %TYPE x[%V];
- %SUB(x[0], xVal[0], sumVecReg[0]);
- %DIV(sumVecReg[0], x[0], reg[0][0]);
- x[0] = sumVecReg[0];
- xShared[0] = sumVecReg[0];
- xnew[ (targetRowTemp)* incx ] = sumVecReg[0];
-
- //#pragma unroll
- for(int m = 1; m < %V; m++)
- {
- %SUB(x[m], xVal[m], sumVecReg[m]);
- }
-
- //#pragma unroll
- for( int idx = %V; idx < (%V * %V); idx++)
- {
- int m = idx / %V; // Row : Row-Major idx, x[1] is solved before x[2]
- int n = idx % ( %V );// Col
- if ( m > n)
- {
- //x[m] = x[m] - reg[m][n] * x[n];
- %MAD(x[m], reg[m][n], (-x[n]));
- }
- else if ( m == n)
- {
- %DIV(sumVecReg[m], x[m], reg[m][m]);
- x[m] = sumVecReg[m];
- }
- }
-
- // Store results
- //#pragma unroll
- for(int m = 1; m < %V; m++)
- {
- xShared[m] = x[m];
- xnew[ (targetRowTemp + m)* incx ] = x[m];
- }
- }
-
-
- // Sync so that xShared it available to all threads
- barrier(CLK_LOCAL_MEM_FENCE);
- if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
- {
- //#pragma unroll
- for( int j=0; j < %V; j++)
- {
- %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
- %CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[j]);
- }
- }
-
- targetCol = targetCol + %V;
- // Avoid Race...
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- }
- else
- {
- // Do Vector Reduction on each block except the last Block
- if ( threadIdx < TARGET_ROWS_BY_VEC)
- {
- %TYPE accTemp = %MAKEVEC(0.0);
- %TYPE%V acc = %VMAKEVEC(accTemp);
-
- //#pragma unroll
- for( int j=0; j < %TARGET_WIDTH; j++)
- {
- %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]);
- }
-
- // Store the result
- int targetRow = rowStart + threadIdx * %V;
-
- __global %TYPE* xNewPtr = xnew + targetRow * incx;
- //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]);
- %TYPE%V value;
- %VLOADWITHINCX(value, xNewPtr, incx);
-
- // Compute result
- %SUB(value, value, acc);
-
- // Store results
- %VSTOREWITHINCX(xNewPtr, value, incx);
- }
- }
- }
-}
-";
-
-
-const char *trsv_CUT_ComputeRectangle_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-#ifdef PACKED
- #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row))))
-#else
- #define A( row, col) A[ (row) + (col) * lda]
-#endif
-__kernel void %PREFIXtrsv_CUT_ComputeRectangle_kernel(__global const %TYPE* _A, __global %TYPE* _xnew,
- uint N,
- int incx,
- int isUnity,
- uint lda,
- int doConj,
- int startRow, int endRow, uint offa, uint offx)
-{
- __global %TYPE* xnew;
- __global %TYPE* A = _A + offa;
- if ( incx < 0 ) // Goto end of vector
- {
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- }
- else
- {
- xnew = _xnew + offx;
- }
-
- int threadID = get_local_id(0);
- int threadID_Y, threadID_X;
- int blockSize = %BLOCKSIZE, blockSize_x, blockSize_y;
- int blkid = get_group_id(0);
- int V= %V;
-
- __local %TYPE solved[%TRIANGLE_HEIGHT];
- __local %TYPE reduce[%TARGET_HEIGHT][ %BLOCKSIZE / %TARGET_HEIGHT];
- __local %TYPE%V *solved_vec;
- int blockStartRow;
- int triangleHeight;
- %TYPE%V acc;
- %TYPE%V loadedAVec;
- %TYPE sacc;
- %TYPE accTemp;
-
- triangleHeight = endRow - startRow;
-/*
- if ((triangleHeight != %TRIANGLE_HEIGHT) || ((triangleHeight % V) != 0))
- {
- // throw -1;
-
- //
- // It is the caller's responsibility to solve triangle whose width
- // is a multiple of VECTOR SIZE before calling this routine.
- // This makes the width of the rectangle to be multiple of VECTOR SIZE.
- // Thus threads can iterate without looking out for vector-unfriendly
- // dimensions.
- // This condition can be maintained for any dimension of the input matrix
- // So, generality is not broken here.
- //
- *(__global int*)0 = 0;
- }
-
- if (( %BLOCKSIZE % %TARGET_HEIGHT) != 0)
- {
- // throw -1;
-
- //
- // Awkward Block Size. Impossible to write neat code.
- // The set of threads belonging to the last threadID_X will not have
- // blockSize_Y number of threads.
- //
- *(__global int*)0 = 0;
- }
-*/
- blockSize_y = %TARGET_HEIGHT;
- blockSize_x = %BLOCKSIZE / %TARGET_HEIGHT;
-
- threadID_Y = threadID % %TARGET_HEIGHT;
- threadID_X = threadID / %TARGET_HEIGHT;
-
- blockStartRow = endRow + (blkid * blockSize_x);
- blockStartRow += threadID_X;
-
- for(int i=threadID; i< %TRIANGLE_HEIGHT; i+=blockSize)
- {
- solved[i] = xnew[(startRow + i)*incx];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- solved_vec = solved;
- accTemp = %INIT(0.0);
- acc = %VMAKEVEC( accTemp);
-
- if (blockStartRow < N)
- {
- for(int i=threadID_Y; i<(triangleHeight/V); i+=blockSize_y)
- {
- loadedAVec = %VLOAD(0, &A((startRow + i*V), (blockStartRow)));
- %CONJUGATE(doConj, loadedAVec);
- %VMAD(acc, solved_vec[i], loadedAVec); //startRow == startCol as well.
- }
- sacc = %REDUCE_SUM(acc);
-
- // Put stuff in shared memory for final reduction
- reduce[threadID_Y][threadID_X] = sacc;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( threadID < blockSize_x)
- {
- sacc = %INIT(0.0);
- //#pragma unroll
- for( int i=0; i < %TARGET_HEIGHT; i++)
- {
- %ADD(sacc, sacc, reduce[i][threadID]);
- }
-
- blockStartRow = endRow + (blkid * blockSize_x);
- blockStartRow += threadID;
- if ( blockStartRow < N)
- {
- %SUB(xnew[(blockStartRow)*incx], xnew[(blockStartRow)*incx], sacc);
- }
- }
-}
-";
-
-const char *trsv_CLT_ComputeRectangle_kernel="
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-#ifdef PACKED
- #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col)))))
-#else
- #define A(row, col) A[ (row) + (col) * lda]
-#endif
-__kernel void %PREFIXtrsv_CLT_ComputeRectangle_kernel( __global const %TYPE* _A,
- __global %TYPE* _xnew,
- uint N,
- int incx,
- int isUnity,
- uint lda,
- int doConj,
- int startRow, int endRow, uint offa, uint offx)
-{
-
- __global %TYPE* xnew;
- __global %TYPE* A = _A + offa;
- if ( incx < 0 ) // Goto end of vector
- {
- xnew = _xnew + offx + ( N - 1) * abs(incx);
- }
- else
- {
- xnew = _xnew + offx;
- }
-
- int threadID = get_local_id(0);
- int threadID_Y, threadID_X;
- int blockSize = %BLOCKSIZE, blockSize_x, blockSize_y;
- int blkid = get_group_id(0);
- int V= %V;
-
- __local %TYPE solved[%TRIANGLE_HEIGHT];
- __local %TYPE reduce[%TARGET_HEIGHT][ %BLOCKSIZE / %TARGET_HEIGHT];
- __local %TYPE%V *solved_vec;
- int blockStartRow;
- int triangleHeight;
- %TYPE%V acc;
- %TYPE%V loadedAVec;
- %TYPE sacc;
- %TYPE accTemp;
-
- triangleHeight = endRow - startRow;
-
- blockSize_y = %TARGET_HEIGHT;
- blockSize_x = %BLOCKSIZE / %TARGET_HEIGHT;
-
- threadID_Y = threadID % %TARGET_HEIGHT;
- threadID_X = threadID / %TARGET_HEIGHT;
-
- blockStartRow = startRow - 1 - (blkid * blockSize_x);
- blockStartRow -= threadID_X;
-
- for(int i=threadID; i< %TRIANGLE_HEIGHT; i+=blockSize)
- {
- solved[i] = xnew[(startRow + i)*incx];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- solved_vec = solved;
- accTemp = %INIT(0.0);
- acc = %VMAKEVEC( accTemp);
-
- if (blockStartRow >= 0)
- {
- for(int i=threadID_Y; i<(triangleHeight/V); i+=blockSize_y)
- {
- loadedAVec = %VLOAD(0, &A((startRow+ i*V) , (blockStartRow)));
- %CONJUGATE(doConj, loadedAVec);
- %VMAD(acc, solved_vec[i], loadedAVec); //startRow == startCol as well.
- }
- sacc = %REDUCE_SUM(acc);
-
- // Put stuff in shared memory for final reduction
- reduce[threadID_Y][threadID_X] = sacc;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if ( threadID < blockSize_x)
- {
- sacc = %INIT(0.0);
- //#pragma unroll
- for( int i=0; i < %TARGET_HEIGHT; i++)
- {
- %ADD(sacc, sacc, reduce[i][threadID]);
- }
-
- blockStartRow = startRow - 1 - (blkid * blockSize_x);
- blockStartRow -= threadID;
- if ( blockStartRow < N)
- {
- %SUB(xnew[(blockStartRow)*incx], xnew[(blockStartRow)*incx], sacc);
- }
- }
-}
-";
-