diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/clTemplates/her2.cl')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/clTemplates/her2.cl | 662 |
1 files changed, 0 insertions, 662 deletions
diff --git a/external/clBLAS/src/library/blas/gens/clTemplates/her2.cl b/external/clBLAS/src/library/blas/gens/clTemplates/her2.cl deleted file mode 100644 index ae8f92df..00000000 --- a/external/clBLAS/src/library/blas/gens/clTemplates/her2.cl +++ /dev/null @@ -1,662 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - -/************************************************/ -//NOTE: THIS FILE IS NOT USED. SEE SYR2_HER2.CLT -// THIS FILE IS FOR LEGACY PURPOSES. - -//Column Major Lower -static const char *her2_CL_kernel = " - -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif - -#define TARGET_ROWS_BY_VEC (%TARGET_ROWS / %V) -#define TARGET_WIDTH (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -#define TARGET_HEIGHT (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -// Column-Major Lower -// nBlocks = (N - 1)/ TR + 1 -// totalBlocks = (nBlocks * ( nBlocks + 1)) / 2 -__kernel void %PREFIXher2_CL_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) -{ - - __global const %TYPE* X; - __global const %TYPE* Y; - __global %TYPE* A; - - __local %TYPE xShared[%TARGET_ROWS]; - __local %TYPE yShared[%TARGET_ROWS]; - __local %TYPE xSharedConj[%TARGET_ROWS]; - __local %TYPE ySharedConj[%TARGET_ROWS]; - - if( (alpha.even == 0.0) && (alpha.odd == 0.0) ) - return; - - int nBlocks = ((N - 1) / %TARGET_ROWS) + 1; - - A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - X = _X + offx - ( N - 1) * incx; - } - else - { - X = _X + offx; - } - - if ( incy < 0 ) // Goto end of vector - { - Y = _Y + offy - ( N - 1) * incy; - } - else - { - Y = _Y + offy; - } - - int blockID = get_group_id(0); - int threadID = get_local_id(0); - - __local int iShared; - __local int jShared; - - // Get (i,j) of Block - if ( threadID == 0) - { - int _i = 0, _j = 0; - for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) - { - _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; - if ( _i < nBlocks && ( _i >= 0) ) - { - break; - } - } - - iShared = _i; - jShared = _j; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int i = iShared; - int j = jShared; - - - int ref_x = i * %TARGET_ROWS; - int ref_y = j * %TARGET_ROWS; - %TYPE conjAlpha = alpha; - %CONJUGATE( 1, conjAlpha ); - - // Load data into xShared and yShared - // Not a common task among blocks in the present implementation.. - // Diagonal Blocks : Should handle not reading diagonal element complex value - // Diagonal blocks : Should handle the last block as well - // Scalar code in Present implementation - if ( i == j) - { - int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y); - int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x); - int nElements = ((nrows) * ((ncols) + 1)) >> 1; - nrows -= 1; - ncols -= 1; - for(i = threadID; i < nElements; i += get_local_size(0)) - { - int r = -1, c = -1; - for(int k = 1; (k <= %TARGET_ROWS); k ++) - { - int temp = ((k - 1) * k) >> 1; - r = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : r; - } - c = i - (((r + 1) * r) >> 1); - - r = ref_x + r; - c = ref_y + c; - - %TYPE res1, res2, res3, res4, res5; - res1 = X[r * incx]; - res2 = X[c * incx]; - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, res1); - #endif - %MUL( res5, alpha, res1 ); - res1 = Y[c * incx]; - res3 = Y[r * incx]; - #ifndef HER2_ROWMAJOR - %CONJUGATE(1, res1); - #endif - %MUL( res4, res5, res1 ); - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, res3); - #else - %CONJUGATE(1, res2); - #endif - %MUL( res5, conjAlpha, res3 ); - %MAD( res4, res5, res2 ); - res1 = A[r + c * lda]; - %ADD( res2, res1, res4 ); - /* HER2 defn: On output, if alpha not equal to 0.0, then imaginary part of A is set to zero. */ - - res2.odd = (r == c) ? 0.0 : res2.odd; - - - A[r + c * lda] = res2; - } - } - else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region) - { - - %TYPE%V loadedA; - - // Populating xShared: May not fit into target region - for( int i = (ref_x + threadID); i < N; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xShared[i - ref_x] = loadedX; - yShared[i - ref_x] = loadedY; - } - - // Populating yShared: Always fits well.. - for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifndef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xSharedConj[(i - ref_y) ] = loadedX; - ySharedConj[(i - ref_y) ] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V; - - - int startRow = ref_x + rowShift; - %TYPE%V loadedX, loadedY; - - if ( startRow < (N - (%V - 1)) ) - { - loadedX= *((__local %TYPE%V*)( xShared + rowShift)); - loadedY= *((__local %TYPE%V*)( yShared + rowShift)); - } - - for( int i= 1; i <= nLoops; i++) - { - int startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH; - - if ( ( startRow < N ) && ( startCol < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region - { - if(( startRow + %V) > N )// Loop serially as can't do VLOAD - { - %TYPE yValueConj = ySharedConj[ startCol - ref_y]; - %TYPE xValueConj = xSharedConj[ startCol - ref_y]; - - for(int row = startRow; row < N; row++) - { - %TYPE xValue = xShared[ row - ref_x]; - %TYPE yValue = yShared[ row - ref_x]; - - %TYPE res1, res2; - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL( res2, res1, xValue); - - // Y * X(H) - %MUL(res1, conjAlpha, xValueConj); - %MAD( res2, res1, yValue); - A[ row + startCol * lda] += res2; - } - } - else - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - - %TYPE loadedYConj = ySharedConj[ startCol - ref_y]; - %TYPE loadedXConj = xSharedConj[ startCol - ref_y]; - %TYPE res; - - // X * Y(H) - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, conjAlpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - } - } - else // blocks that fit exactly. - { - - %TYPE%V loadedA; - - // Populating xShared - for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xShared[i - ref_x] = loadedX; - yShared[i - ref_x] = loadedY; - } - - // Populating yShared - for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifndef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xSharedConj[i - ref_y] = loadedX; - ySharedConj[i - ref_y] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V; - - int startRow = ref_x + rowShift; - int startCol = ref_y + colShift; - %TYPE%V loadedX, loadedY; - - if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region - { - loadedX = *((__local %TYPE%V*)( xShared + rowShift)); - loadedY = *((__local %TYPE%V*)( yShared + rowShift)); - } - - //#pragma unroll - for( int i= 1; i <= nLoops; i++) - { - startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH; - - if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - %TYPE loadedYConj = ySharedConj[ startCol - ref_y]; - %TYPE loadedXConj = xSharedConj[ startCol - ref_y]; - - // X * Y(H) - %TYPE res; - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, conjAlpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - } - -} -\n"; - -//Column Major Upper -static const char *her2_CU_kernel = " - -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif - -#define TARGET_ROWS_BY_VEC (%TARGET_ROWS / %V) -#define TARGET_WIDTH (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -#define TARGET_HEIGHT (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -// Column-Major Upper -// nBlocks = (N - 1)/ TR + 1 -// totalBlocks = (nBlocks * ( nBlocks + 1)) / 2 -__kernel void %PREFIXher2_CU_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) -{ - - __global const %TYPE* X; - __global const %TYPE* Y; - __global %TYPE* A; - - __local %TYPE xShared[%TARGET_ROWS]; - __local %TYPE yShared[%TARGET_ROWS]; - __local %TYPE xSharedConj[%TARGET_ROWS]; - __local %TYPE ySharedConj[%TARGET_ROWS]; - - if( (alpha.even == 0.0) && (alpha.odd == 0.0) ) - return; - - int nBlocks = ((N - 1) / %TARGET_ROWS) + 1; - - A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - X = _X + offx - ( N - 1) * incx; - } - else - { - X = _X + offx; - } - - if ( incy < 0 ) // Goto end of vector - { - Y = _Y + offy - ( N - 1) * incy; - } - else - { - Y = _Y + offy; - } - - int blockID = get_group_id(0); - int threadID = get_local_id(0); - - __local int iShared; - __local int jShared; - - // Get (i,j) of Block - if ( threadID == 0) - { - int _i = 0, _j = 0; - //for ( _j = 0; _j < nBlocks; _j++) - for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) - { - _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; - if ( _i < nBlocks && ( _i >= 0) ) - { - break; - } - } - - iShared = _i; - jShared = _j; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int i = iShared; - int j = jShared; - %TYPE conjAlpha = alpha; - %CONJUGATE( 1, conjAlpha ); - - int ref_x = (N- 1) - i * %TARGET_ROWS; - int ref_y = (N- 1) - j * %TARGET_ROWS; - - // Load data into xShared and yShared - // Not a common task among blocks in the present implementation.. - - // Diagonal Blocks : Should handle not reading diagonal element complex value - // Diagonal blocks : Should handle the last block as well - // Scalar code in Present implementation - if ( i == j) - { - int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1); - int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1); - int nElements = ((nrows) * ((ncols) + 1)) >> 1; - nrows -= 1; - ncols -= 1; - for(i = threadID; i < nElements; i += get_local_size(0)) - { - int r, c = -1; - for(int k = 1; (k <= %TARGET_ROWS); k ++) - { - int temp = ((k - 1) * k) >> 1; - c = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : c; - } - r = i - (((c + 1) * c) >> 1); - - r = ref_x - (nrows) + r; - c = ref_y - (ncols) + c; - - %TYPE res1, res2, res3, res4, res5; - res1 = X[r * incx]; - res2 = X[c * incx]; - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, res1); - #endif - %MUL( res5, alpha, res1 ); - res1 = Y[c * incx]; - res3 = Y[r * incx]; - #ifndef HER2_ROWMAJOR - %CONJUGATE(1, res1); - #endif - %MUL( res4, res5, res1 ); - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, res3); - #else - %CONJUGATE(1, res2); - #endif - %MUL( res5, conjAlpha, res3 ); - %MAD( res4, res5, res2 ); - res1 = A[r + c * lda]; - %ADD( res2, res1, res4 ); - /* HER2 defn: On output, if alpha not equal to 0.0, then imaginary part of A is set to zero. */ - - res2.odd = (r == c) ? 0.0 : res2.odd; - - - A[r + c * lda] = res2; - } - } - else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region) - { - - %TYPE%V loadedA; - - // Populating xShared: May not fit into target region - for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xShared[(%TARGET_ROWS - 1) - threadID] = loadedX; - yShared[(%TARGET_ROWS - 1) - threadID] = loadedY; - } - - // Populating yShared: Always fits well.. - for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifndef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xSharedConj[(ref_y - i)] = loadedX; - ySharedConj[(ref_y - i)] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1); - - int startRow = ref_x - rowShift; - %TYPE%V loadedX, loadedY; - - if ( startRow >= 0 ) - { - loadedX= *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift])); - loadedY= *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1) - rowShift])); - } - - for( int i= 1; i <= nLoops; i++) - { - int startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH; - - // threads that fall into target region - if( ( startRow > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) ) - { - if( startRow < 0 )// Loop serially as can't do VLOAD - { - %TYPE yValueConj = ySharedConj[ ref_y - startCol]; - %TYPE xValueConj = xSharedConj[ ref_y - startCol]; - - for(int row = startRow + (%V - 1); row >= 0; row--) - { - %TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)]; - %TYPE yValue = yShared[ %TARGET_ROWS - 1 - (ref_x - row)]; - - %TYPE res1, res2; - - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL( res2, res1, xValue); - - // Y * X(H) - %MUL(res1, conjAlpha, xValueConj); - %MAD( res2, res1, yValue); - A[ row + startCol * lda] += res2; - } - } - else - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - - %TYPE loadedXConj = xSharedConj[ ref_y - startCol]; - %TYPE loadedYConj = ySharedConj[ ref_y - startCol]; - %TYPE res; - // X * Y(H) - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, conjAlpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - } - } - else // blocks that fit exactly. - { - %TYPE%V loadedA; - - // Populating xShared - for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifdef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = loadedX; - yShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = loadedY; - } - - // Populating yShared - for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - #ifndef HER2_ROWMAJOR - %CONJUGATE(1, loadedX); //taking conjugate while loading - %CONJUGATE(1, loadedY); - #endif - xSharedConj[(ref_y - i)] = loadedX; - ySharedConj[(ref_y - i)] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1); - - - int startRow = ref_x - rowShift; - int startCol = ref_y - colShift; - %TYPE%V loadedX, loadedY; - // Not all threads should do this.. - // Depends on whether blocksize width is > target_rows - if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region - { - loadedX= *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift])); - loadedY= *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1)- rowShift])); - } - - for( int i= 1; i <= nLoops; i++) - { - startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH; - - if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - %TYPE loadedYConj = ySharedConj[ ref_y - startCol]; - %TYPE loadedXConj = xSharedConj[ ref_y - startCol]; - %TYPE res; - // X * Y(H) - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, conjAlpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - - } -} -\n"; |