summaryrefslogtreecommitdiff
path: root/external/clBLAS/src/library/blas/gens/clTemplates/gbmv.cl
diff options
context:
space:
mode:
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/clTemplates/gbmv.cl')
-rw-r--r--external/clBLAS/src/library/blas/gens/clTemplates/gbmv.cl292
1 files changed, 0 insertions, 292 deletions
diff --git a/external/clBLAS/src/library/blas/gens/clTemplates/gbmv.cl b/external/clBLAS/src/library/blas/gens/clTemplates/gbmv.cl
deleted file mode 100644
index 3cdadae7..00000000
--- a/external/clBLAS/src/library/blas/gens/clTemplates/gbmv.cl
+++ /dev/null
@@ -1,292 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-// Row-Major Non-transpose case
-static const char *gbmv_RNT_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-#define H ( %DEF_H )
-#define TARGET_ROWS ( %DEF_TARGET_ROWS )
-
-__kernel void %PREFIXgbmv_RNT_kernel( __global const %TYPE * _A, __global %TYPE * _y_vector, __global %TYPE const* restrict _x_vector,
- uint M, uint N, uint KL, uint KU, uint lda, int incx, int incy, uint offa, uint offx, uint offy
-#ifndef TBMV_ONLY
- ,%TYPE alpha, %TYPE beta
-#endif
- )
-{
- __global const %TYPE * A;
- __global %TYPE const* restrict X;
- __global %TYPE* Y;
- __local %TYPE localRed[ (H+1) * TARGET_ROWS ]; // To avoid bank-conflict
-
- A = _A + offa;
- if ( incx < 0 ) // Goto end of vector
- X = _x_vector + offx + ( N - 1 ) * abs(incx);
- else
- X = _x_vector + offx;
-
- if( incy < 0 )
- Y = _y_vector + offy + ( M - 1 ) * abs(incy);
- else
- Y = _y_vector + offy;
-
- int gId = get_group_id( 0 );
- int lId = get_local_id( 0 );
- int threadRow = (lId / H);
- int threadCol = (lId % H);
- int row = ( gId * TARGET_ROWS ) + threadRow;
- int AStartColIndex = max( (int)(KL-row), 0 );
- int XStartIndex = ( row <= KL )? 0: (int)(row-KL);
- bool diagPresent = ( row < N ) ? true: false;
- int numSubDiags = min( row, max( 0, min( ((int)min( KL, N )), (int)(N+KL-row) ) ) );
- int numSupDiags = max( 0, min( (int)KU, (int)(N-1-row) ) );
- %TYPE reg1, reg2, sum;
-
- if( row < M )
- {
- sum = %MAKEVEC(0.0);
- localRed[ lId ] = %MAKEVEC(0.0);
- // Sub-diagonal iteration
- #ifdef GIVEN_SHBMV_UPPER
- int symmStartRow = max( 0, (row - (int)KU) ); // row - (BW-1) = KU
- int symmStartCol = min( (int)KU, row ); // row - (BW-1) = KU
- #endif
- for( int i=threadCol; i<numSubDiags; i+= H )
- {
- #ifdef GIVEN_SHBMV_UPPER
- reg1 = A[ ((symmStartRow+i) * lda) + (symmStartCol - i) ];
- %CONJUGATE(1 , reg1); // Hermitian transpose- will be ignored for real cases
- #else
- reg1 = A[ (row * lda) + (AStartColIndex + i) ];
- #endif
-
- #ifdef DO_CONJ
- %CONJUGATE(1 , reg1);
- #endif
- reg2 = X[ (XStartIndex + i) * incx ];
- %MAD( sum, reg1, reg2 );
- }
- #ifdef GIVEN_SHBMV_UPPER
- AStartColIndex = 0;
- #else
- AStartColIndex += numSubDiags;
- #endif
- XStartIndex += numSubDiags;
-
- // Calculate diagonal component -- only by first thread of the row
- if( diagPresent )
- {
- if( threadCol == 0 )
- {
- reg2 = X[ XStartIndex * incx ];
- #ifndef UNIT_DIAG
- reg1 = A[ (row * lda) + AStartColIndex ];
- #ifdef DO_CONJ
- %CONJUGATE(1 , reg1);
- #endif
- #ifdef HBMV_ONLY
- reg1.odd = 0.0; // Imaginary part of diagonal is assumed to be zero
- #endif
- %MAD( sum, reg1, reg2 );
- #else
- sum += reg2;
- #endif
- }
- AStartColIndex ++;
- XStartIndex ++;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if( row < M )
- {
- // Super-diagonal iteration
- #ifdef GIVEN_SHBMV_LOWER
- AStartColIndex = (KL+1) - 2; // KL+1 is BW
- #endif
-
- for( int i=threadCol; i<numSupDiags; i+= H )
- {
- #ifdef GIVEN_SHBMV_LOWER
- reg1 = A[ ((row+i+1) * lda) + (AStartColIndex - i) ];
- %CONJUGATE(1 , reg1); // Hermitian transpose- will be ignored for real cases
- #else
- reg1 = A[ (row * lda) + (AStartColIndex + i) ];
- #endif
-
- #ifdef DO_CONJ
- %CONJUGATE(1 , reg1);
- #endif
- reg2 = X[ (XStartIndex + i) * incx ];
- %MAD( sum, reg1, reg2 );
- }
- localRed[ (threadRow * (H+1)) + threadCol ] = sum;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // Update the Y vector
- if( (threadCol == 0) && (row < M) )
- {
- sum = %MAKEVEC(0.0);
- for( int i=0; i<H; i++ )
- {
- %ADD( sum, sum, localRed[ (threadRow * (H+1)) + i ] );
- }
- #ifndef TBMV_ONLY
- %MUL( reg1, alpha, sum );
- %MUL( reg2, beta, Y [ row * incy ] );
- %ADD( Y[ row * incy ], reg1, reg2 );
- #else
- Y[ row * incy ] = sum;
- #endif
- }
-
-}
-";
-
-
-
-// Row-Major Transpose case
-static const char *gbmv_RT_kernel = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-#define TARGET_ROWS ( %DEF_TARGET_ROWS )
-#define HEIGHT ( %DEF_H)
-#pragma OPENCL EXTENSION cl_amd_printf : enable
-__kernel void %PREFIXgbmv_RT_kernel( __global const %TYPE * _A, __global %TYPE * _y_vector, __global %TYPE const* restrict _x_vector,
- uint M, uint N, uint KL, uint KU, uint lda, int incx, int incy, uint offa, uint offx, uint offy
-#ifndef TBMV_ONLY
- ,%TYPE alpha, %TYPE beta
-#endif
- )
-{
- __global const %TYPE * A = _A + offa;
- __global %TYPE const * restrict X;
- __global %TYPE *Y;
-
- if ( incx < 0 ) // Goto end of X vector
- {
- X = _x_vector + offx + ( M - 1) * abs(incx);
- }
- else
- {
- X = _x_vector + offx;
- }
-
- if( incy < 0 ) // Goto end of Y vector
- {
- Y = _y_vector + offy + ( N - 1) * abs(incy);
- }
- else
- {
- Y = _y_vector + offy;
- }
-
- int blkID, thrID;
- int blkColIndx, blkStrtCol, blkStrtRow, blkOffset;
- int thrRow, thrCol;
- int bandWidth = KL + KU + 1;
-
- blkID = get_group_id(0);
- thrID = get_local_id(0);
-
- //Find the block start column and start row.
- blkOffset = blkID * HEIGHT;
- blkColIndx = (blkOffset) + KL;
- blkStrtCol = (blkColIndx >= bandWidth) ? (bandWidth - 1) : blkColIndx;
- blkStrtRow = ((blkColIndx - (bandWidth - 1)) < 0) ? 0 : (blkColIndx - (bandWidth - 1));
- %TYPE thrSum = %MAKEVEC(0.0); //Private sum for each thread
- %TYPE reg1, reg2;
-
- if(((blkOffset) + (thrID % HEIGHT)) < N)
- {
- thrRow = blkStrtRow + ((int)thrID / (HEIGHT));
- thrCol = blkStrtCol + ((int)thrID % (HEIGHT)) - ((int)thrID / (HEIGHT));
- while((thrRow < M) && (thrCol >= 0))
- {
- if(thrCol < bandWidth)
- {
- reg2 = X[ thrRow * incx];
- #ifdef UNIT_DIAG
- if(thrCol == ((int)KL))
- {
- thrSum += reg2;
- }
- else
- {
- reg1 = A[(thrRow*lda) + thrCol];
- #ifdef DO_CONJ
- %CONJUGATE(1 , reg1);
- #endif
- %MAD(thrSum, reg1, reg2);
- }
- #else
- reg1 = A[(thrRow*lda) + thrCol];
- #ifdef DO_CONJ
- %CONJUGATE(1 , reg1);
- #endif
- %MAD(thrSum, reg1, reg2);
- #endif
- //thrSum += A[(thrRow*lda) + thrCol ] * X[ thrRow ];
- }
- thrRow += TARGET_ROWS;
- thrCol -= TARGET_ROWS;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //Store the results in a temporary local buffer and accumulate the same.
- __local %TYPE sum[(TARGET_ROWS * HEIGHT)];
- sum[thrID] = thrSum;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if((thrID < HEIGHT) && ((blkOffset + (thrID % HEIGHT)) < N))
- {
- int iY = (blkOffset) + thrID;
- %TYPE tempSum = %MAKEVEC(0.0);
- for(int i = 0; i < TARGET_ROWS; i++)
- {
- reg1 = sum[thrID + (i * HEIGHT)];
- %ADD(tempSum, tempSum, reg1);
- //tempSum += sum[thrID + (i * HEIGHT)];
- }
- #ifndef TBMV_ONLY
- %MUL(reg1, alpha, tempSum);
- %MUL(reg2, beta, Y[iY * incy]);
- %ADD(Y[iY * incy], reg1, reg2);
- #else
- Y[ iY * incy ] = tempSum;
- #endif
- //Y[iY] = ((alpha * tempSum) + (beta * Y[iY]));
- }
-}
-";
-
-