diff options
Diffstat (limited to 'external/clBLAS/src/library/blas/gens/clTemplates/syr2.cl')
-rw-r--r-- | external/clBLAS/src/library/blas/gens/clTemplates/syr2.cl | 1209 |
1 files changed, 0 insertions, 1209 deletions
diff --git a/external/clBLAS/src/library/blas/gens/clTemplates/syr2.cl b/external/clBLAS/src/library/blas/gens/clTemplates/syr2.cl deleted file mode 100644 index 0846b2d8..00000000 --- a/external/clBLAS/src/library/blas/gens/clTemplates/syr2.cl +++ /dev/null @@ -1,1209 +0,0 @@ -/* ************************************************************************ - * Copyright 2013 Advanced Micro Devices, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ************************************************************************/ - -/************************************************/ -//NOTE: THIS FILE IS NOT USED. SEE SYR2_HER2.CLT -// THIS FILE IS FOR LEGACY PURPOSES. - -//Column Major Lower -static const char *syr2_CL_kernel = " - -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif - -#define TARGET_ROWS_BY_VEC (%TARGET_ROWS / %V) -#define TARGET_WIDTH (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -#define TARGET_HEIGHT (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -// Column-Major Lower -// nBlocks = (N - 1)/ TR + 1 -// totalBlocks = (nBlocks * ( nBlocks + 1)) / 2 -__kernel void %PREFIXsyr2_CL_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) -{ - - __global const %TYPE* X; - __global const %TYPE* Y; - __global %TYPE* A; - - __local %TYPE xShared[%TARGET_ROWS]; - __local %TYPE yShared[%TARGET_ROWS]; - __local %TYPE xSharedConj[%TARGET_ROWS]; - __local %TYPE ySharedConj[%TARGET_ROWS]; - - int nBlocks = ((N - 1) / %TARGET_ROWS) + 1; - - A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - X = _X + offx - ( N - 1) * incx; - } - else - { - X = _X + offx; - } - - if ( incy < 0 ) // Goto end of vector - { - Y = _Y + offy - ( N - 1) * incy; - } - else - { - Y = _Y + offy; - } - - int blockID = get_group_id(0); - int threadID = get_local_id(0); - - __local int iShared; - __local int jShared; - - // Get (i,j) of Block - if ( threadID == 0) - { - int _i = 0, _j = 0; - for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) - { - _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; - if ( _i < nBlocks && ( _i >= 0) ) - { - break; - } - } - - iShared = _i; - jShared = _j; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int i = iShared; - int j = jShared; - - - int ref_x = i * %TARGET_ROWS; - int ref_y = j * %TARGET_ROWS; - - // Load data into xShared and yShared - // Not a common task among blocks in the present implementation.. - // Diagonal Blocks : Should handle not reading diagonal element complex value - // Diagonal blocks : Should handle the last block as well - // Scalar code in Present implementation - if ( i == j) - { - int ncols = ((ref_y + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_y); - int nrows = ((ref_x + %TARGET_ROWS) < N) ? %TARGET_ROWS : (N-ref_x); - int nElements = ((nrows) * ((ncols) + 1)) >> 1; - nrows -= 1; - ncols -= 1; - for(i = threadID; i < nElements; i += get_local_size(0)) - { - int r = -1, c = -1; - for(int k = 1; (k <= %TARGET_ROWS); k ++) - { - int temp = ((k - 1) * k) >> 1; - r = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : r; - } - c = i - (((r + 1) * r) >> 1); - - r = ref_x + r; - c = ref_y + c; - - %TYPE res1, res2; - res1 = alpha * X[c * incx]; - res2 = alpha * X[r * incx]; - res1 = res1 * Y[r * incx]; - res2 = res2 * Y[c * incx]; - - A[r + c * lda] += (res1 + res2); - } - } - else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region) - { - - %TYPE%V loadedA; - - // Populating xShared: May not fit into target region - for( int i = (ref_x + threadID); i < N; i += get_local_size(0)) - { - xShared[i - ref_x] = X[ i * incx]; - yShared[i - ref_x] = Y[ i * incy]; - } - - // Populating yShared: Always fits well.. - for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[(i - ref_y) ] = loadedX; - ySharedConj[(i - ref_y) ] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V; - - - int startRow = ref_x + rowShift; - %TYPE%V loadedX, loadedY; - - if ( startRow < (N - (%V - 1)) ) - { - loadedX= *((__local %TYPE%V*)( xShared + rowShift)); - loadedY= *((__local %TYPE%V*)( yShared + rowShift)); - } - - for( int i= 1; i <= nLoops; i++) - { - int startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH; - - if ( ( startRow < N ) && ( startCol < (ref_y + %TARGET_ROWS ) ) )// threads that fall into target region - { - if(( startRow + %V) > N )// Loop serially as can't do VLOAD - { - %TYPE yValueConj = ySharedConj[ startCol - ref_y]; - %TYPE xValueConj = xSharedConj[ startCol - ref_y]; - - for(int row = startRow; row < N; row++) - { - %TYPE xValue = xShared[ row - ref_x]; - %TYPE yValue = yShared[ row - ref_x]; - - %TYPE res1, res2; - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL( res2, res1, xValue); - - // Y * X(H) - %MUL(res1, alpha, xValueConj); - %MAD( res2, res1, yValue); - A[ row + startCol * lda] += res2; - } - } - else - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - - %TYPE loadedYConj = ySharedConj[ startCol - ref_y]; - %TYPE loadedXConj = xSharedConj[ startCol - ref_y]; - %TYPE res; - - // X * Y(H) - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - } - } - else // blocks that fit exactly. - { - - %TYPE%V loadedA; - - // Populating xShared - for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0)) - { - xShared[i - ref_x] = X[ i * incx]; - yShared[i - ref_x] = Y[ i * incy]; - } - - // Populating yShared - for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[i - ref_y] = loadedX; - ySharedConj[i - ref_y] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V; - - int startRow = ref_x + rowShift; - int startCol = ref_y + colShift; - %TYPE%V loadedX, loadedY; - - if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region - { - loadedX = *((__local %TYPE%V*)( xShared + rowShift)); - loadedY = *((__local %TYPE%V*)( yShared + rowShift)); - } - - //#pragma unroll - for( int i= 1; i <= nLoops; i++) - { - startCol = ref_y + colShift + ( i - 1 ) * TARGET_WIDTH; - - if ( startCol < ( ref_y + %TARGET_ROWS) ) // threads that fall into target region - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - %TYPE loadedYConj = ySharedConj[ startCol - ref_y]; - %TYPE loadedXConj = xSharedConj[ startCol - ref_y]; - - // X * Y(H) - %TYPE res; - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - } - -} -\n"; - -//Column Major Upper -static const char *syr2_CU_kernel = " - -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif - -#define TARGET_ROWS_BY_VEC (%TARGET_ROWS / %V) -#define TARGET_WIDTH (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -#define TARGET_HEIGHT (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -// Column-Major Upper -// nBlocks = (N - 1)/ TR + 1 -// totalBlocks = (nBlocks * ( nBlocks + 1)) / 2 -__kernel void %PREFIXsyr2_CU_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) -{ - - __global const %TYPE* X; - __global const %TYPE* Y; - __global %TYPE* A; - - __local %TYPE xShared[%TARGET_ROWS]; - __local %TYPE yShared[%TARGET_ROWS]; - __local %TYPE xSharedConj[%TARGET_ROWS]; - __local %TYPE ySharedConj[%TARGET_ROWS]; - - int nBlocks = ((N - 1) / %TARGET_ROWS) + 1; - - A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - X = _X + offx - ( N - 1) * incx; - } - else - { - X = _X + offx; - } - - if ( incy < 0 ) // Goto end of vector - { - Y = _Y + offy - ( N - 1) * incy; - } - else - { - Y = _Y + offy; - } - - int blockID = get_group_id(0); - int threadID = get_local_id(0); - - __local int iShared; - __local int jShared; - - // Get (i,j) of Block - if ( threadID == 0) - { - int _i = 0, _j = 0; - //for ( _j = 0; _j < nBlocks; _j++) - for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) - { - _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; - if ( _i < nBlocks && ( _i >= 0) ) - { - break; - } - } - - iShared = _i; - jShared = _j; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int i = iShared; - int j = jShared; - - int ref_x = (N- 1) - i * %TARGET_ROWS; - int ref_y = (N- 1) - j * %TARGET_ROWS; - - // Load data into xShared and yShared - // Not a common task among blocks in the present implementation.. - - // Diagonal Blocks : Should handle not reading diagonal element complex value - // Diagonal blocks : Should handle the last block as well - // Scalar code in Present implementation - if ( i == j) - { - int ncols = ((ref_y - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_y+1); - int nrows = ((ref_x - %TARGET_ROWS) >= 0) ? %TARGET_ROWS : (ref_x+1); - int nElements = ((nrows) * ((ncols) + 1)) >> 1; - nrows -= 1; - ncols -= 1; - for(i = threadID; i < nElements; i += get_local_size(0)) - { - int r, c = -1; - for(int k = 1; (k <= %TARGET_ROWS); k ++) - { - int temp = ((k - 1) * k) >> 1; - c = ((i >= temp) && (i <= (temp + k - 1))) ? k - 1 : c; - } - r = i - (((c + 1) * c) >> 1); - - r = ref_x - (nrows) + r; - c = ref_y - (ncols) + c; - - %TYPE res1, res2; - res1 = alpha * X[c * incx]; - res2 = alpha * X[r * incx]; - res1 = res1 * Y[r * incy]; - res2 = res2 * Y[c * incy]; - A[r + c * lda] += (res1 + res2); - } - } - else if ( i == (nBlocks - 1)) // Last Row Strip blocks ( May not fit into target region) - { - - %TYPE%V loadedA; - - // Populating xShared: May not fit into target region - for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0)) - { - xShared[(%TARGET_ROWS - 1) - threadID] = X[ i * incx]; - yShared[(%TARGET_ROWS - 1) - threadID] = Y[ i * incy]; - } - - // Populating yShared: Always fits well.. - for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[(ref_y - i)] = loadedX; - ySharedConj[(ref_y - i)] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1); - - int startRow = ref_x - rowShift; - %TYPE%V loadedX, loadedY; - - if ( startRow >= 0 ) - { - loadedX= *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1) - rowShift])); - loadedY= *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1) - rowShift])); - } - - for( int i= 1; i <= nLoops; i++) - { - int startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH; - - // threads that fall into target region - if( ( startRow > -(%V) ) && (startCol > (ref_y - %TARGET_ROWS)) ) - { - if( startRow < 0 )// Loop serially as can't do VLOAD - { - %TYPE yValueConj = ySharedConj[ ref_y - startCol]; - %TYPE xValueConj = xSharedConj[ ref_y - startCol]; - - for(int row = startRow + (%V - 1); row >= 0; row--) - { - %TYPE xValue = xShared[ %TARGET_ROWS - 1 - (ref_x - row)]; - %TYPE yValue = yShared[ %TARGET_ROWS - 1 - (ref_x - row)]; - - %TYPE res1, res2; - - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL( res2, res1, xValue); - - // Y * X(H) - %MUL(res1, alpha, xValueConj); - %MAD( res2, res1, yValue); - A[ row + startCol * lda] += res2; - } - } - else - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - - %TYPE loadedXConj = xSharedConj[ ref_y - startCol]; - %TYPE loadedYConj = ySharedConj[ ref_y - startCol]; - %TYPE res; - // X * Y(H) - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - } - } - else // blocks that fit exactly. - { - %TYPE%V loadedA; - - // Populating xShared - for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0)) - { - xShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = X[ i * incx]; - yShared[ (%TARGET_ROWS - 1) - (ref_x - i)] = Y[ i * incy]; - } - - // Populating yShared - for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[(ref_y - i)] = loadedX; - ySharedConj[(ref_y - i)] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_WIDTH) + 1; - - int colShift = threadID / TARGET_ROWS_BY_VEC; - int rowShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1); - - - int startRow = ref_x - rowShift; - int startCol = ref_y - colShift; - %TYPE%V loadedX, loadedY; - // Not all threads should do this.. - // Depends on whether blocksize width is > target_rows - if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region - { - loadedX= *((__local %TYPE%V*)( &xShared[ (%TARGET_ROWS - 1)- rowShift])); - loadedY= *((__local %TYPE%V*)( &yShared[ (%TARGET_ROWS - 1)- rowShift])); - } - - for( int i= 1; i <= nLoops; i++) - { - startCol = ref_y - colShift - ( i - 1 ) * TARGET_WIDTH; - - if ( startCol > ( ref_y - %TARGET_ROWS) ) // threads that fall into target region - { - loadedA = %VLOAD( 0, (&A[ startRow + startCol * lda])); - %TYPE loadedYConj = ySharedConj[ ref_y - startCol]; - %TYPE loadedXConj = xSharedConj[ ref_y - startCol]; - %TYPE res; - // X * Y(H) - %MUL(res, loadedYConj, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedX, resVec); - - // Y * X(H) - %MUL(res, loadedXConj, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedY, resVec); - %VSTORE( loadedA, 0, (&A[ startRow + startCol * lda])); - } - } - - } -} -"; - -/* -//Row Major Lower -static const char *syr2_RL_kernel = " - -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif - -#define TARGET_ROWS_BY_VEC (%TARGET_ROWS / %V) -#define TARGET_WIDTH (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -#define TARGET_HEIGHT (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -// -//nBlocks = 4 -// -//Blocks: 9 -// 7 8 -// 4 5 6 -// 0 1 2 3 -// - -// Row-Major Lower -// nBlocks = (N - 1)/ TR + 1 -// totalBlocks = (nBlocks * ( nBlocks + 1)) / 2 -__kernel void %PREFIXsyr2_RL_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) -{ - - __global const %TYPE* X; - __global const %TYPE* Y; - __global %TYPE* A; - - __local %TYPE xShared[%TARGET_ROWS]; - __local %TYPE yShared[%TARGET_ROWS]; - __local %TYPE xSharedConj[%TARGET_ROWS]; - __local %TYPE ySharedConj[%TARGET_ROWS]; - - int nBlocks = ((N - 1) / %TARGET_ROWS) + 1; - - A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - X = _X + offx - ( N - 1) * incx; - } - else - { - X = _X + offx; - } - - if ( incy < 0 ) // Goto end of vector - { - Y = _Y + offy - ( N - 1) * incy; - } - else - { - Y = _Y + offy; - } - - int blockID = get_group_id(0); - int threadID = get_local_id(0); - - __local int iShared; - __local int jShared; - - // Get (i,j) of Block - if ( threadID == 0) - { - int _i = 0, _j = 0; - for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) - { - _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; - if ( _i < nBlocks && ( _i >= 0) ) - { - break; - } - } - - iShared = _i; - jShared = _j; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int i = iShared; - int j = jShared; - - - int ref_x = ((nBlocks - 1) * %TARGET_ROWS) - (j * %TARGET_ROWS); - int ref_y = (i -j) * %TARGET_ROWS; - - // Load data into xShared and yShared - // Not a common task among blocks in the present implementation.. - - // Diagonal Blocks : Should handle not reading diagonal element complex value - // Diagonal blocks : Should handle the last block as well - // Scalar code in Present implementation - if ( ref_x == ref_y ) - { - // Need only xShared, not using yShared - for( int i = (ref_x + threadID); (i < N && (i - ref_x) < %TARGET_ROWS); i += get_local_size(0)) - { - xShared[(i - ref_x)] = X[ i * incx]; - yShared[(i - ref_x)] = Y[ i * incy]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int rowShift = threadID / %TARGET_ROWS; - int colShift = (threadID & ( %TARGET_ROWS - 1)); - - int target_height = %BLOCKSIZE / %TARGET_ROWS; - - int nLoops = ((%TARGET_ROWS - 1)/ target_height) + 1; - int startRow = ref_x + rowShift; - - %TYPE yValue, xValue, yValueRead, xValueRead, yValueConj, xValueConj; - - // startCol remains constant on looping - // Therefore, following code is based on startCol - int startCol = ref_y + colShift; - - bool readXY = false; - for( int i= 1; i <= nLoops; i++) - { - startRow = ref_x + rowShift + ( i - 1 ) * target_height; - - bool activeThread = ( startCol < N) && ( startRow < N) && ((startRow - ref_x) < %TARGET_ROWS); - - if (activeThread) - { - // Avoid reading Y again if already read using readXY - if (( startRow >= startCol) && (!readXY)) - { - yValueRead = yShared[ startCol - ref_y]; - xValueRead = xShared[ startCol - ref_y]; - readXY = true; - } - - if ( startRow > startCol ) - { - %TYPE res1, res2; - yValueConj = yValueRead; - xValueConj = xValueRead; - - xValue = xShared[ startRow - ref_x ]; - yValue = yShared[ startRow - ref_x ]; - - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL( res2, res1, xValue); - - // Y * X(H) - %MUL(res1, alpha, xValueConj); - %MAD( res2, res1, yValue); - A[ startRow * lda + startCol] += res2; - } - else if ( startRow == startCol) // Diagonal - { - yValueConj = yValueRead; - xValueConj = xValueRead; - yValue = yValueRead; - xValue = xValueRead; - - %TYPE res1, res2; - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL( res2, res1, xValue); - - // Y * X(H) - %MUL(res1, alpha, xValueConj); - %MAD( res2, res1, yValue); - - // Discard the imaginary component of A - %ADD(A[ startRow * lda + startCol], A[ startRow * lda + startCol], res2); - } - } - } - } - else if ( ref_x == ((nBlocks - 1) * %TARGET_ROWS)) // Last Row Strip blocks ( May not fit into target region) - { - - %TYPE%V loadedA; - - // Populating xShared: May not fit into target region - for( int i = (ref_x + threadID); i < N; i += get_local_size(0)) - { - xShared[i - ref_x] = X[ i * incx]; - yShared[i - ref_x] = Y[ i * incy]; - } - - // Populating yShared: Always fits well.. - for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[(i - ref_y) ] = loadedX; - ySharedConj[(i - ref_y) ] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT) + 1; - - int rowShift = threadID / TARGET_ROWS_BY_VEC; - int colShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V; - - - int startRow = ref_x + rowShift; - int startCol = ref_y + colShift; // Remains fixed - %TYPE%V loadedYConj, loadedXConj; - - if ( startRow < N ) - { - loadedYConj = *((__local %TYPE%V*)( ySharedConj + colShift)); - loadedXConj = *((__local %TYPE%V*)( xSharedConj + colShift)); - } - - for( int i= 1; i <= nLoops; i++) - { - int startRow = ref_x + rowShift + ( i - 1 ) * TARGET_HEIGHT; - - if ( startRow < N )// threads that fall into target region - { - loadedA = %VLOAD( 0, (&A[ startRow * lda + startCol])); - - %TYPE loadedX= xShared[ startRow - ref_x]; - %TYPE loadedY= yShared[ startRow - ref_x]; - %TYPE res; - // X * Y(H) - %MUL(res, loadedX, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedYConj, resVec); - - // Y * X(H) - %MUL(res, loadedY, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedXConj, resVec); - %VSTORE( loadedA, 0, (&A[ startRow * lda + startCol])); - } - } - } - else // blocks that fit exactly. - { - - %TYPE%V loadedA; - - // Populating xShared - for( int i = (ref_x + threadID); (i - ref_x) < %TARGET_ROWS; i += get_local_size(0)) - { - xShared[i - ref_x] = X[ i * incx]; - yShared[i - ref_x] = Y[ i * incy]; - } - - // Populating yShared - for( int i = (ref_y + threadID); (i - ref_y) < %TARGET_ROWS; i += get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[i - ref_y] = loadedX; - ySharedConj[i - ref_y] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT) + 1; - - int rowShift = threadID / TARGET_ROWS_BY_VEC; - int colShift = (threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V; - - int startRow = ref_x + rowShift; - int startCol = ref_y + colShift; - %TYPE%V loadedYConj, loadedXConj; - - if ( startRow < ( ref_x + %TARGET_ROWS) ) // threads that fall into target region - { - loadedYConj = *((__local %TYPE%V*)( ySharedConj + colShift)); - loadedXConj = *((__local %TYPE%V*)( xSharedConj + colShift)); - } - - //#pragma unroll - for( int i= 1; i <= nLoops; i++) - { - startRow = ref_x + rowShift + ( i - 1 ) * TARGET_HEIGHT; - - if ( startRow < ( ref_x + %TARGET_ROWS) ) // threads that fall into target region - { - loadedA = %VLOAD( 0, (&A[ startRow * lda + startCol])); - - %TYPE loadedX = xShared[ startRow - ref_x]; - %TYPE loadedY = yShared[ startRow - ref_x]; - - %TYPE res; - // X * Y(H) - %MUL(res, loadedX, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedYConj, resVec); - - // X * Y(H) - %MUL(res, loadedY, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedXConj, resVec); - - %VSTORE( loadedA, 0, (&A[ startRow * lda + startCol])); - } - } - } -} -\n"; - -//Row Major Upper -static const char *syr2_RU_kernel = " - -#ifdef DOUBLE_PRECISION - #ifdef cl_khr_fp64 - #pragma OPENCL EXTENSION cl_khr_fp64 : enable - #else - #pragma OPENCL EXTENSION cl_amd_fp64 : enable - #endif -#endif - -#define TARGET_ROWS_BY_VEC (%TARGET_ROWS / %V) -#define TARGET_WIDTH (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -#define TARGET_HEIGHT (%BLOCKSIZE / TARGET_ROWS_BY_VEC ) -// -//nBlocks = 4 -// -//Blocks: 9 -// -// 3 2 1 0 -// 6 5 4 -// 7 8 -// 9 -// -// ----------- -// |<-threadID| -// | 3 2 1 0 | -// ----------(ref_x, ref_y) -// - -// Row-Major Upper -// nBlocks = (N - 1)/ TR + 1 -// totalBlocks = (nBlocks * ( nBlocks + 1)) / 2 -__kernel void %PREFIXher2_RU_kernel( __global const %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) -{ - - __global const %TYPE* X; - __global const %TYPE* Y; - __global %TYPE* A; - - __local %TYPE xShared[%TARGET_ROWS]; - __local %TYPE yShared[%TARGET_ROWS]; - __local %TYPE xSharedConj[%TARGET_ROWS]; - __local %TYPE ySharedConj[%TARGET_ROWS]; - - int nBlocks = ((N - 1) / %TARGET_ROWS) + 1; - - A = _A + offa; - - if ( incx < 0 ) // Goto end of vector - { - X = _X + offx - ( N - 1) * incx; - } - else - { - X = _X + offx; - } - - if ( incy < 0 ) // Goto end of vector - { - Y = _Y + offy - ( N - 1) * incy; - } - else - { - Y = _Y + offy; - } - - int blockID = get_group_id(0); - int threadID = get_local_id(0); - - __local int iShared; - __local int jShared; - - // Get (i,j) of Block - if ( threadID == 0) - { - int _i = 0, _j = 0; - for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) - { - _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; - if ( _i < nBlocks && ( _i >= 0) ) - { - break; - } - } - - iShared = _i; - jShared = _j; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int i = iShared; - int j = jShared; - - - int ref_x = (N - 1) - ((nBlocks - 1 - j) * %TARGET_ROWS); - int ref_y = (N - 1) - (i -j) * %TARGET_ROWS; - - // Load data into xShared and yShared - // Not a common task among blocks in the present implementation.. - - // Diagonal Blocks : Should handle not reading diagonal element complex value - // Diagonal blocks : Should handle the last block as well - // Scalar code in Present implementation - if ( ref_x == ref_y ) - { - // Need only xShared, not using yShared - // Can use ref_x or ref_y - for( int i = (ref_x - threadID); (i >= 0 && ((ref_x - i) < %TARGET_ROWS)); i -= get_local_size(0)) - { - xShared[ (ref_x - i)] = X[ i * incx]; - yShared[ (ref_x - i)] = Y[ i * incy]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - int rowShift = threadID / %TARGET_ROWS; - int colShift = (threadID & ( %TARGET_ROWS - 1)); - - int target_height = %BLOCKSIZE / %TARGET_ROWS; - - int nLoops = ((%TARGET_ROWS - 1)/ target_height) + 1; - int startRow = ref_x - rowShift; - - %TYPE yValue, xValue, yValueRead, xValueRead, xValueConj, yValueConj; - - int startCol = ref_y - colShift; // remains fixed on looping - - bool readXY = false; - for( int i= 1; i <= nLoops; i++) - { - startRow = ref_x - rowShift - ( i - 1 ) * target_height; - bool activeThread = ( startRow >= 0) && ( startCol >= 0) && (( ref_x - startRow) < %TARGET_ROWS); - - if ( activeThread) - { - // Avoid reading yValue again for threads that have it already while looping - if (( startCol >= startRow ) && (!readXY)) - { - xValueRead = xShared[ ref_y - startCol]; - yValueRead = yShared[ ref_y - startCol]; - readXY = true; - } - - if ( startCol > startRow ) - { - %TYPE res1, res2; - yValueConj = yValueRead; - xValueConj = xValueRead; - - xValue = xShared[ ref_x - startRow]; - yValue = yShared[ ref_x - startRow]; - - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL(res2, res1, xValue); - - // X * Y(H) - %MUL(res1, alpha, xValueConj); - %MAD( res2, res1, yValue); - - A[ startRow * lda + startCol] += res2; - } - else if (startRow == startCol) // Diagonal - { - // The y Values can be obtained from xValues - xValue = xValueRead; - yValue = yValueRead; - xValueConj = xValueRead; - yValueConj = yValueRead; - - %TYPE res1, res2; - // X * Y(H) - %MUL(res1, alpha, yValueConj); - %MUL(res2, res1, xValue); - // Y * X(H) - %MUL(res1, alpha, xValueConj); - %MAD(res2, res1, yValue); - - // Discard the imaginary component of A - %ADD(A[ startRow * lda + startCol], A[ startRow * lda + startCol], res2); - } - } - } - } - else if ( ref_x == (( N - 1) - ((nBlocks - 1) * %TARGET_ROWS))) // First Row Strip blocks ( May not fit into target region) - { - %TYPE%V loadedA; - - // Populating xShared: May not fit into target region - for( int i = (ref_x - threadID); i >= 0; i -= get_local_size(0)) - { - xShared[ref_x - i] = X[ i * incx]; - yShared[ref_x - i] = Y[ i * incy]; - } - - // Populating yShared: Always fits well.. - for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedX; - ySharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT) + 1; - - int rowShift = threadID / TARGET_ROWS_BY_VEC; - int colShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1); - - int startRow = ref_x - rowShift; - int startCol = ref_y - colShift; - - %TYPE%V loadedYConj, loadedXConj; - - if ( startRow >= 0 ) - { - loadedYConj = *((__local %TYPE%V*)( &ySharedConj[ (%TARGET_ROWS - 1) - colShift])); - loadedXConj = *((__local %TYPE%V*)( &xSharedConj[ (%TARGET_ROWS - 1) - colShift])); - } - - for( int i= 1; i <= nLoops; i++) - { - int startRow = ref_x - rowShift - ( i - 1 ) * TARGET_HEIGHT; - - if ( startRow >= 0 ) - { - loadedA = %VLOAD( 0, (&A[ startRow * lda + startCol])); - - %TYPE loadedX = xShared[ ref_x - startRow]; - %TYPE loadedY = yShared[ ref_x - startRow]; - %TYPE res; - // X * Y(H) - %MUL(res, loadedX, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedYConj, resVec); - - // Y * X(H) - %MUL(res, loadedY, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedXConj, resVec); - - %VSTORE( loadedA, 0, (&A[ startRow * lda + startCol])); - } - } - } - else // blocks that fit exactly. - { - %TYPE%V loadedA; - - // Populating xShared - for( int i = (ref_x - threadID); ((ref_x - i) < %TARGET_ROWS); i -= get_local_size(0)) - { - xShared[(ref_x - i)] = X[ i * incx]; - yShared[(ref_x - i)] = Y[ i * incy]; - } - - // Populating yShared - for( int i = (ref_y - threadID); (ref_y - i) < %TARGET_ROWS; i -= get_local_size(0)) - { - %TYPE loadedX = X[ i * incx]; - %TYPE loadedY = Y[ i * incy]; - xSharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedX; - ySharedConj[(%TARGET_ROWS - 1) - (ref_y - i)] = loadedY; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // Loop %TARGET_ROWS / TARGET_WIDTH times - int nLoops = ((%TARGET_ROWS - 1)/ TARGET_HEIGHT) + 1; - - int rowShift = threadID / TARGET_ROWS_BY_VEC; - int colShift = ((threadID & ( TARGET_ROWS_BY_VEC - 1)) * %V) + (%V - 1); - - - int startRow = ref_x - rowShift; - int startCol = ref_y - colShift; - - %TYPE%V loadedYConj, loadedXConj; - // Not all threads should do this.. - // Depends on whether blocksize width is > target_rows - if ( startRow > ( ref_x - %TARGET_ROWS) ) // threads that fall into target region - { - loadedYConj = *((__local %TYPE%V*)( &ySharedConj[ (%TARGET_ROWS - 1) - colShift])); - loadedXConj = *((__local %TYPE%V*)( &xSharedConj[ (%TARGET_ROWS - 1) - colShift])); - } - - for( int i= 1; i <= nLoops; i++) - { - startRow = ref_x - rowShift - ( i - 1 ) * TARGET_HEIGHT; - - if ( startRow > ( ref_x - %TARGET_ROWS) ) // threads that fall into target region - { - loadedA = %VLOAD( 0, (&A[ startRow * lda + startCol])); - - %TYPE loadedX = xShared[ ref_x - startRow]; - %TYPE loadedY = yShared[ ref_x - startRow]; - %TYPE res; - // X * Y(H) - %MUL(res, loadedX, alpha); - %TYPE%V resVec; - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedYConj, resVec); - - // Y * X(H) - %MUL(res, loadedY, alpha); - resVec = %VMAKEVEC(res); - %VMAD( loadedA, loadedXConj, resVec); - - %VSTORE( loadedA, 0, (&A[ startRow * lda + startCol])); - - } - } - - } -} -\n"; -*/ |