diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/clTemplates/trsv.cl')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/clTemplates/trsv.cl | 437 |
1 files changed, 0 insertions, 437 deletions
diff --git a/external/clBLAS/src/library/blas/gens/clTemplates/trsv.cl b/external/clBLAS/src/library/blas/gens/clTemplates/trsv.cl deleted file mode 100644 index 56da9c86..00000000 --- a/external/clBLAS/src/library/blas/gens/clTemplates/trsv.cl +++ /dev/null @@ -1,437 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - -// TRSV Column-Major Upper Kernel -//#include <TRSV.h> - - -const char * trsv_CU_SolveTriangle_kernel = " -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif - -#ifdef PACKED - #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row)))) -#elif defined(BANDED) - #define A( row, col) A[ (row) * lda + (col)] -#else - #define A( row, col) A[ (row) + (col) * lda] -#endif -// Only one workgroup of threads launched -__kernel void %PREFIXtrsv_CU_SolveTriangle_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, - uint lda, int doConj, int startRow, int startCol, uint offa, uint offx -#ifdef BANDED - , uint KU -#endif - ) -{ - __global %TYPE* xnew; - __global %TYPE const * restrict A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - xnew = _xnew + offx + ( N - 1) * abs(incx); - } - else - { - xnew = _xnew + offx; - } - - __local %TYPE xShared; // To share solved x value with other threads.. - - size_t gIdx = get_global_id(0); - size_t bIdx = get_group_id(0); - size_t threadIdx= get_local_id(0); - - %TYPE sum = %MAKEVEC(0.0); - %TYPE xVal = %MAKEVEC(0.0); - %TYPE loadedA = %MAKEVEC(0.0); - - int targetCol = startCol; - int targetRow = startRow + threadIdx; - int loops = (startCol - startRow) + 1; - -#ifdef BANDED - int bandCol = (loops - 1) - threadIdx; -#endif - - for( int i=0; i < loops; i++) - { - if ( targetRow == targetCol) - { - xVal = xnew[ targetRow * incx]; - %SUB(sum, xVal, sum); - - if( isUnity) - { - xShared = sum; - } - else // Handle diagonal element - { -#ifdef BANDED - loadedA = A((targetRow), (bandCol)); -#else - loadedA = A((targetRow), (targetCol)); -#endif - %CONJUGATE(doConj, loadedA); - %DIV(xShared, sum, loadedA); - } - xnew[ targetRow * incx ] = xShared; - } - // Sync so that xShared it available to all threads - barrier(CLK_LOCAL_MEM_FENCE); - - if ( targetRow < targetCol) - { -#ifdef BANDED - loadedA = A((targetRow), (bandCol)); -#else - loadedA = A((targetRow), (targetCol)); -#endif - %CONJUGATE(doConj, loadedA); - %MAD(sum, loadedA, xShared); - } - - // Avoid Race... - barrier(CLK_LOCAL_MEM_FENCE); - targetCol--; -#ifdef BANDED - bandCol--; -#endif - } -}"; - - -const char * trsv_CL_SolveTriangle_kernel = " -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif -#ifdef PACKED - #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col))))) -#elif defined(BANDED) - #define A(row, col) A[ (row) * lda + (col)] -#else - #define A(row, col) A[ (row) + (col) * lda] -#endif -#pragma OPENCL EXTENSION cl_amd_printf : enable -// Only one block of threads launched -__kernel void %PREFIXtrsv_CL_SolveTriangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, - uint lda, int doConj, int startCol, int endRow, uint offa, uint offx -#ifdef BANDED - , uint KL -#endif - ) -{ - __global %TYPE* xnew; - __global %TYPE* A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - xnew = _xnew + offx + ( N - 1) * abs(incx); - } - else - { - xnew = _xnew + offx; - } - - __local %TYPE xShared; // To share solved x value with other threads.. - - size_t gIdx = get_global_id(0); - size_t bIdx = get_group_id(0); - size_t threadIdx= get_local_id(0); - - %TYPE sum = %MAKEVEC(0.0); - %TYPE xVal = %MAKEVEC(0.0); - %TYPE loadedA = %MAKEVEC(0.0); - - int targetCol = startCol; - int targetRow = endRow - threadIdx; - int loops = (endRow - startCol) + 1; -#ifdef BANDED - int bandCol = (KL + 1) - loops + threadIdx; -#endif - -// printf(\"%u : bandCol %d targetCol %d targetRow %d loops %d KL %d\\n\", threadIdx, bandCol, targetCol, targetRow, loops, KL); - - for( int i=0; i < loops; i++) - { - if ( targetRow == targetCol) - { - xVal = xnew[ targetRow * incx]; - //printf(\"Before1 %u : xShared %f, sum %f\\n\", threadIdx, xShared, sum); - %SUB(sum, xVal, sum); - //printf(\"Before2 %u : xShared %f, sum %f XvAL %f, targetRow %d\\n\", threadIdx, xShared, sum, xVal, targetRow); - - if( isUnity) - { - xShared = sum; - } - else // Handle diagonal element - { -#ifndef BANDED - loadedA = A((targetRow), (targetCol)); -#else - loadedA = A((targetRow), (bandCol)); -#endif - %CONJUGATE(doConj, loadedA); - %DIV(xShared, sum, loadedA); - } - //printf(\"After %u : xShared %f, sum %f\\n\", threadIdx, xShared, sum); - xnew[ targetRow * incx ] = xShared; - } - // Sync so that xShared it available to all threads - barrier(CLK_LOCAL_MEM_FENCE); - - if ( targetRow > targetCol) - { -#ifndef BANDED - loadedA = A((targetRow), (targetCol)); -#else - loadedA = A((targetRow), (bandCol)); -#endif - %CONJUGATE(doConj, loadedA); - //printf(\"%u : xShared %f, sum %f loadedA %f\\n\", threadIdx, xShared, sum, loadedA); - %MAD(sum, loadedA, xShared); - } - - // Avoid Race... - barrier(CLK_LOCAL_MEM_FENCE); - targetCol++; -#ifdef BANDED - bandCol++; -#endif - } -} -"; - -const char * trsv_CUT_SolveTriangle_kernel = " -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif -#ifdef PACKED - #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row)))) -#elif defined(BANDED) - #define A(row, col) A[ (row) * lda + (col)] -#else - #define A( row, col) A[ (row) + (col) * lda] -#endif -#pragma OPENCL EXTENSION cl_amd_printf : enable -__kernel void %PREFIXtrsv_CUT_SolveTriangle_kernel( __global const %TYPE* _A, - __global %TYPE* _xnew, - uint N, - int incx, - int isUnity, - uint lda, - int doConj, - int startRow, int endRow, uint offa, uint offx -#ifdef BANDED - , uint KU -#endif - ) -{ - __global %TYPE* xnew; - __global const %TYPE* A = _A + offa; - if ( incx < 0 ) // Goto end of vector - { - xnew = _xnew + offx + ( N - 1) * abs(incx); - } - else - { - xnew = _xnew + offx; - } - - int blockSize = get_local_size(0); - int threadID = get_local_id(0); - int targetRow; -#ifdef BANDED - int bandRow = startRow; - int bandCol = threadID; -// printf(\"threadID %d, bandRow %d bandCol %d\\n\",threadID, bandRow, bandCol); -#endif - __local volatile %TYPE saccShared[%TRIANGLE_HEIGHT]; - - targetRow = startRow + threadID; - //#pragma unroll - for( int idx = threadID; (idx < %TRIANGLE_HEIGHT) && ((startRow + idx) < endRow); idx += blockSize) - { - saccShared[idx] = xnew[ (startRow + idx) * incx]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - %TYPE diagA = %INIT(0.0); - if(targetRow < endRow) - { -#ifndef BANDED - diagA = A((targetRow), (targetRow)); -#else - diagA = A((startRow + threadID), (0)); -#endif - %CONJUGATE(doConj, diagA); - } - %TYPE tempA, tempS; - for(int i = 0; i < %TRIANGLE_HEIGHT; i++) - { - if((i <= threadID) && (i > 0) && (targetRow < endRow)) - { -#ifndef BANDED - tempA = A((startRow + i - 1), (targetRow)); -#else - tempA = A((bandRow - 1), (bandCol + 1)); - // printf(\"threadID %d, bandRow %d bandCol %d A %f\\n\",threadID, bandRow, bandCol, tempA); -#endif - %CONJUGATE(doConj, tempA); - %MUL(tempS, tempA, saccShared[i-1]); - %SUB(saccShared[threadID], saccShared[threadID], tempS); - } - if((i == threadID) && (targetRow < endRow) && (!isUnity)) - { - tempS = saccShared[threadID]; - // printf(\"threadID %d, saccShared %f, diagA %f\\n\", threadID, tempS, diagA); - %DIV(saccShared[threadID], tempS, diagA); - } - barrier(CLK_LOCAL_MEM_FENCE); -#ifdef BANDED - bandRow++; bandCol--; -#endif - } - barrier(CLK_LOCAL_MEM_FENCE); - if(targetRow < endRow) - { - xnew[(targetRow * incx)] = saccShared[threadID]; - } -} -"; - -const char * trsv_CLT_SolveTriangle_kernel = " -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif -#ifdef PACKED - #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col))))) -#elif defined(BANDED) - #define A(row, col) A[ (row) * lda + (col)] -#else - #define A(row, col) A[ (row) + (col) * lda] -#endif -#pragma OPENCL EXTENSION cl_amd_printf : enable - -// Column-Major Lower Non-Unity case -// StartRow points to actual Row to start from( absolute Column number) -// endRow points to actual Row to stop + 1( absolute Column number) -__kernel void %PREFIXtrsv_CLT_SolveTriangle_kernel( __global const %TYPE* _A, - __global %TYPE* _xnew, - uint N, - int incx, - int isUnity, - uint lda, - int doConj, - int startRow, int endRow, uint offa, uint offx -#ifdef BANDED - ,uint KL -#endif - ) -{ - __global %TYPE* xnew; - __global const %TYPE *A = _A + offa; - if ( incx < 0 ) // Goto end of vector - { - xnew = _xnew + offx + ( N - 1) * abs(incx); - } - else - { - xnew = _xnew + offx; - } - - int blockSize = get_local_size(0); - int threadID = get_local_id(0); - __local volatile %TYPE saccShared[%TRIANGLE_HEIGHT]; - int targetRow; - targetRow = (endRow - 1) - threadID; - -#ifdef BANDED - int bandRow = (endRow - 1); - int bandCol = (KL) - threadID; -#endif - - //#pragma unroll - for( int idx = threadID; (idx < %TRIANGLE_HEIGHT) && (((endRow - 1) - idx) >= startRow); idx += blockSize) - { - saccShared[idx] = xnew[((endRow - 1) - idx) * incx]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - %TYPE diagA = %INIT(0.0); - if(targetRow >= startRow) - { -#ifndef BANDED - diagA = A((targetRow), (targetRow)); -#else - diagA = A((bandRow - threadID), (KL)); -// printf(\"ThreadID %d, bandRow %d bandCol %d\\n\", threadID, bandRow, bandCol); -#endif - %CONJUGATE(doConj, diagA); - } - %TYPE tempA, tempS; - - for( int i = (endRow - 1); i >= startRow; i--) - { - if((targetRow == i) && (!isUnity)) - { - tempS = saccShared[threadID]; - %DIV(saccShared[threadID], tempS, diagA); - } - barrier(CLK_LOCAL_MEM_FENCE); - if((targetRow < i) && (targetRow >= startRow)) - { -#ifndef BANDED - tempA = A((i), (targetRow)); -#else - tempA = A((bandRow), (bandCol)); -#endif - %CONJUGATE(doConj, tempA); - %MUL(tempS, tempA, saccShared[(endRow - 1) - i]); - %SUB(saccShared[threadID], saccShared[threadID], tempS); - } - barrier(CLK_LOCAL_MEM_FENCE); -#ifdef BANDED - bandRow--; bandCol++; -#endif - } - barrier(CLK_LOCAL_MEM_FENCE); - if(targetRow >= startRow) - { - xnew[(targetRow * incx)] = saccShared[threadID]; - } -} -"; - |