Diffstat (limited to 'external/clBLAS/src/library/blas/gens/clTemplates/gemm.cl')
-rw-r--r--  external/clBLAS/src/library/blas/gens/clTemplates/gemm.cl  1650
1 file changed, 0 insertions, 1650 deletions
diff --git a/external/clBLAS/src/library/blas/gens/clTemplates/gemm.cl b/external/clBLAS/src/library/blas/gens/clTemplates/gemm.cl
deleted file mode 100644
index 26f05267..00000000
--- a/external/clBLAS/src/library/blas/gens/clTemplates/gemm.cl
+++ /dev/null
@@ -1,1650 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-//
-// A few observations:
-// 1. A vector length of 4 increases DGEMM performance to 250 GFLOPS.
-//    Coupled with an 8x8 block without a "barrier", the maximum performance seen is around 267 GFLOPS on Cayman.
-//    Otherwise, it peaks at 225 GFLOPS (16x8 block, with barrier, vector size of 2).
-//    However, a change of tile size with a vector length of just 2 yields 330 GFLOPS consistently.
-//    So, 330 is our sweet spot for DGEMM now.
-// 2. When MxN is not completely divisible by [subdimy x subdimx], a workgroup size of
-//    8x8 yields the best performance.
-//    Even in this case, letting the extra threads exit early should improve performance.
-//    Thread exit should be done only if a barrier is NOT used; otherwise, it will result in a hang.
-// 3. When processing a non-tail run, workgroups processing full tiles can be grouped together and run.
-//    However, this did not yield any significant gain on tail processing,
-//    and sometimes a performance degradation was seen. So, this idea will not be pursued.
-//
-// Pending Enhancements for GEMM:
-// -4. TN kernel performance can be improved. The prototype code shows better performance than the
-//     templated code, which differs slightly from it. This can be fixed to gain more performance.
-// -2. PENDING BUG FIX on the unroll factor for the NN kernel - the configurable PANEL implementation
-//     introduced it. Currently only a panel width of %V is supported.
-// 0. When workgroup size == WAVEFRONT Size, GEMM_NEEDS_BARRIER need not be defined.
-// This saves a few milliseconds depending on the problem size.
-// 1. Support for VLOADA, VLOADB and VLOADC in KPRINTF is required. Currently, if any one of the matrices
-//    is vector-unfriendly, the kernel translates to a completely scalar kernel.
-// This is pretty easy to implement in KPRINTF.
-// 2. Panel Width == %V in the current implementation. It should be a separate config define
-//    that has to be a multiple of %V.
-//    Currently only the NxN kernel has %PANEL support implemented. TN and NT need to be enhanced.
-//    This will be required for tuning and also for high performance for D, C and Z GEMMs.
-// 3. "actualRow" based improvement can be used in KTail Processing as well for NN Kernel
-// 4. A.B^T can be optimized for cases where ITEMY > 4. Successive threads are now ITEMX apart.
-//    Instead, we can make them float4 apart to get the highest L1 cache bandwidth.
-// 5. A.B^T - actualCol, actualRow optimization
-//
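What follows are three kernel templates (NN, NT, TN). Independent of the vectorization and tiling, every generated variant computes C = alpha*op(A)*op(B) + beta*C over its tile of C, with all matrices stored column-major. As a point of reference, here is a minimal scalar restatement of the NN case (an illustrative sketch only, single precision, using the same indexing convention as the templates below; it is not part of the original file):

static void gemm_nn_reference(unsigned M, unsigned N, unsigned K, float alpha,
                              const float *A, unsigned lda,
                              const float *B, unsigned ldb,
                              float beta, float *C, unsigned ldc)
{
    for (unsigned col = 0; col < N; col++) {
        for (unsigned row = 0; row < M; row++) {
            float acc = 0.0f;
            for (unsigned k = 0; k < K; k++) {
                /* column-major indexing: A(row,k) * B(k,col) */
                acc += A[row + k * lda] * B[k + col * ldb];
            }
            C[row + col * ldc] = alpha * acc + beta * C[row + col * ldc];
        }
    }
}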
-static const char *GEMM_NN_KERNEL = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-__kernel void GEMM_NN__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C,
- uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc,
- %TYPE alpha, %TYPE beta
- #ifdef TAIL_RUN
- , uint tailStartM, uint tailStartN
- #endif
- )
-{
- const int V = %V;
- __global %TYPE const *restrict A;
- __global %TYPE const *restrict B;
- __global %TYPE *C = _C + offc;
- uint K = _K;
- uint lda, ldb;
- uint rowA, colA, rowB, colB, rowC, colC;
- uint numGroupsOnY;
- uint row, col;
- uint tid = get_local_id(0);
- int panel;
- int ACOLSTART, ACOLEND;
- uint MV;
-
- //
- // %WIDTH - Preferably 16
-    // %ITEMY, %ITEMX - 1 thread is responsible for an %ITEMY * %ITEMX sub-matrix of C
-    // %ITEMY must be divisible by %V for the NN kernel
-    // The entire workgroup loops together to complete the ITEMY x ITEMX sub-matrix
- //
- uint threadsY = %WIDTH;
- uint threadsX = get_local_size(0)/threadsY;
-
- //
- // Column-Major ordering of Workgroups
- //
-    // %ITEMY - Number of elements a work-item processes in the Y direction.
-    // %ITEMX - Number of elements a work-item processes in the X direction.
-    //
-    // %V - Vectoring Width
-    // %PANEL(*) - Panel Width to access Rows of A and Columns of B
-    //             Right now, %V is assumed to be the panel width.
-    //             We don't use %PANEL in the current implementation.
- //
- MV = M;
- #ifndef TAIL_RUN
- {
- uint bidX, bidY;
- uint blockDimY;
-
- #ifdef M_TAIL_PRESENT
- MV = M - (M % (%V));
- #endif
- if (MV == 0)
- {
- return;
- }
- blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1;
- bidY = ( get_group_id(0) % ( blockDimY));
- bidX = ( get_group_id(0) / ( blockDimY));
- //
- // Note:
-        // Using the new Map function does not yield any performance gain.
-        // In fact, it degraded the performance.
-        // Keep this commented.
- //
- //mapWorkGroupToTileNumber(M, N, &bidY, &bidX);
-
- //
- // <row,col> is the left-top of the TILE region
- // in the output C matrix that will be determined
- // by this workgroup
- //
- row = (bidY * (threadsY * %ITEMY));
- col = (bidX * (threadsX * %ITEMX));
- }
- #else
- {
- uint nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsA;
- uint bidY, bidX;
-
- if (M == tailStartM)
- {
- nWorkGroupsA = 0;
- } else {
- nWorkGroupsAY = ((M - tailStartM - 1)/threadsY + 1);
- nWorkGroupsAX = ((tailStartN - 1)/threadsX + 1);
- nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX;
- }
- if (get_group_id(0) < nWorkGroupsA)
- {
- bidY = get_group_id(0) % (nWorkGroupsAY);
- bidX = get_group_id(0) / nWorkGroupsAY;
- row = tailStartM + (bidY * threadsY * %ITEMY);
- col = (bidX * threadsX * %ITEMX);
- } else {
- uint nWorkGroupsBY, nWorkGroupsBX;
-
- nWorkGroupsBY = ((M-1)/threadsY) + 1;
- nWorkGroupsBX = ((N-tailStartN-1)/threadsX) + 1;
- bidY = (get_group_id(0) - nWorkGroupsA) % (nWorkGroupsBY);
- bidX = (get_group_id(0) - nWorkGroupsA) / nWorkGroupsBY;
- row = (bidY * threadsY * %ITEMY);
- col = tailStartN + (bidX * threadsX * %ITEMX);
- }
-
- }
- #endif
-
- //
- // ACOLSTART, ACOLEND
- // SYMM Matrix multiplication proceeds by multiplying panels on A's block-row
- // with panels on B's block-column.
-    // However, due to the symmetric nature of the A/B matrix, compounded by the fact that
-    // only the upper OR lower triangle of the symm matrix is available, vector loads
-    // are not possible while traversing certain regions of the matrix.
-    // ACOLSTART and ACOLEND signify what portion of SYMM can be achieved through
-    // this NN kernel. The SYMM handler has to compose the SYMM in terms of GEMM kernels.
- //
-#ifdef __SYMM_LEFT__
- // MxM * MxN
- A = _A + offa;
- lda = _lda;
- B = _B + offb;
- ldb = _ldb;
- K = M;
- #ifndef __SYMM_DIAGONAL__
- #ifdef __SYMM_LOWER__
- ACOLSTART = 0;
- ACOLEND = row;
- #elif defined(__SYMM_UPPER__)
- ACOLSTART = row + (threadsY*(%ITEMY));
- ACOLEND = K;
- #else
- #error GEMM_NN_KERNEL
- #endif
- #else
- ACOLSTART = row;
- ACOLEND = row + (threadsY*(%ITEMY));
- #endif
- if (ACOLEND > K)
- {
- ACOLEND = K;
- }
- /*
- if (get_local_id(0) == 0)
- {
- printf(\" GEMM_NN_KERNEL : SYMM_LEFT: Setting ACOLSTART to %d and ACOLEND to %d \\n \" , ACOLSTART, ACOLEND);
- }
- */
-#elif defined(__SYMM_RIGHT__)
- // MxN * NxN
- A = _B + offb;
- lda = _ldb;
- B = _A + offa;
- ldb = _lda;
- K = N;
- #ifndef __SYMM_DIAGONAL__
- #ifdef __SYMM_UPPER__
- ACOLSTART = 0;
- ACOLEND = col;
- #elif defined(__SYMM_LOWER__)
- ACOLSTART = col + (threadsX*(%ITEMX));
- ACOLEND = K;
- #else
-        #error GEMM_NN_KERNEL
-    #endif
-    #else
-    ACOLSTART = col;
-    ACOLEND = col + (threadsX*(%ITEMX));
-    #endif
- if (ACOLEND > K)
- {
- ACOLEND = K;
- }
-#else
- A = _A + offa;
- B = _B + offb;
- K = _K;
- lda = _lda;
- ldb = _ldb;
- ACOLSTART = 0;
- ACOLEND = K;
-#endif
-
- uint offsetY = (tid % threadsY) * %V;
- uint offsetX = (tid / threadsY) * %ITEMX;
- rowA = row + offsetY;
- colB = (col+offsetX);
- #ifndef TAIL_RUN
- bool tailBlock = ((row >= M) || (col >= N));
- #else
- bool tailBlock = (row >= tailStartM);
- #endif
-
-
- /*
- #ifdef TAIL_RUN
- if ((rowA >= M) || (colB >= N))
- {
- return;
- }
- #endif
- */
-
- #ifndef TAIL_RUN
- // Non-tail RUN
- if (tailBlock == true)
- {
- return;
- }
- #elif defined(TAIL_RUN)
- // TAIL RUN
- if (tailBlock == false)
- {
- return;
- }
- #else
- #error GEMM_NN_KERNEL
- #endif
-
- %TYPE%V AVAL[%V][(%ITEMY_BY_V)]; // 8
- #ifdef COMPLEX
- %TYPE%HV AVALEVEN[%V][(%ITEMY_BY_V)]; // 8
- %TYPE%HV AVALODD[%V][(%ITEMY_BY_V)]; // 8
- #endif
-
- %TYPE%V BVAL[%ITEMX];
- #ifdef COMPLEX
- %TYPE%HV BVALEVEN[%ITEMX];
- %TYPE%HV BVALODD[%ITEMX];
- #endif
-
- %TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX];
- #ifdef COMPLEX
- %TYPE%HV CVALEVEN[(%ITEMY_BY_V)][%ITEMX];
- %TYPE%HV CVALODD[(%ITEMY_BY_V)][%ITEMX];
- #endif
-
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- CVAL[i][j] = (%TYPE%V) 0;
- #ifdef COMPLEX
- CVALEVEN[i][j] = (%TYPE%HV) 0;
- CVALODD[i][j] = (%TYPE%HV) 0;
- #endif
- }
- }
-
- uint ACOL;
- for(ACOL=ACOLSTART; ((ACOL+ %V -1) < ACOLEND); ACOL += %V)
- {
- {
- //
- // Load B values
- //
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint bcol = 0; bcol < %ITEMX; bcol++)
- {
- #ifdef N_TAIL_PRESENT
- uint actualCol;
- actualCol = ((colB + bcol) >= N) ? (N-1) : (colB + bcol);
- #endif
-
- #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_LEFT__)
- #ifndef N_TAIL_PRESENT
- BVAL[bcol] = %VLOAD(0, (&B[ACOL + (colB + bcol)*ldb]));
- #else
- BVAL[bcol] = %VLOAD(0, (&B[ACOL + (actualCol)*ldb]));
- #endif
- #else
- // defined(__SYMM_DIAGONAL__) && defined(__SYMM_RIGHT__)
- #ifndef N_TAIL_PRESENT
- BVAL[bcol] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ACOL, (colB + bcol));
- #else
- BVAL[bcol] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ACOL, actualCol);
- #endif
- #endif
- //
- // If Complex data, load the real and imaginary parts into separate register banks
- //
- #ifdef COMPLEX
- BVALEVEN[bcol] = BVAL[bcol].even;
- BVALODD[bcol] = BVAL[bcol].odd;
- #endif
- }
- }
-
- {
- //
- // Load A values
- //
- //
-            // PENDING BUG FIX: Unroll factor should be set according to the PANEL size.
-            //                  Previously, PANEL was the size of V, so ITEMY worked.
-            //                  Current workaround - Panel same as %V - see gemm_cached.cpp
- //
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint j=0; j< (%ITEMY_BY_V); j++)
- {
- #pragma unroll %V
- for(uint i = 0; i < %V; i++)
- {
- uint actualRow;
-
- #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_RIGHT__)
- #ifndef M_TAIL_PRESENT
- AVAL[i][j] = %VLOAD(0, (&A[(rowA + j*threadsY*(V)) + (ACOL + i)*lda]) );
- #else
- actualRow = ((rowA + j*threadsY*(V)) >= MV) ? (MV-%V) : (rowA + j*threadsY*(V));
- AVAL[i][j] = %VLOAD(0, (&A[actualRow + (ACOL + i)*lda]) );
- #endif
- #else
- // CASE: SYMM_DIAGONAL && SYMM_LEFT
- #ifndef M_TAIL_PRESENT
- //AVAL[c][r] = %VLOAD(0, (&A[(rowA + r*threadsY*(V)) + (ACOL + c)*lda]) );
- AVAL[i][j] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, (rowA + j*threadsY*(V)) , (ACOL + i));
- #else
- actualRow = ((rowA + j*threadsY*(V)) >= MV) ? (MV-%V) : (rowA + j*threadsY*(V));
- AVAL[i][j] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, actualRow, (ACOL + i));
- #endif
- #endif
- //
- // If Complex data, load the real and imaginary parts into separate register banks
- //
- #ifdef COMPLEX
- AVALEVEN[i][j] = AVAL[i][j].even;
- AVALODD[i][j] = AVAL[i][j].odd;
- #endif
- }
- }
- }
-
- {
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i<(%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- #ifndef COMPLEX
- %VFOR_REAL
- {
- CVAL[i][j] = mad(AVAL[%VFORINDEX][i], BVAL[j]%VFORSUFFIX, CVAL[i][j]);
- }
- #else
- //
- // Pending - Replace by %COMPLEX_VMAD()
- //
- %VFOR_REAL
- {
- //
- // PENDING Needs a FIX
- //
- CVALEVEN[i][j] = mad(AVALEVEN[%VFORINDEX][i], BVALEVEN[j]%VFORSUFFIX, CVALEVEN[i][j]);
- CVALODD[i][j] = mad(AVALEVEN[%VFORINDEX][i], BVALODD[j]%VFORSUFFIX, CVALODD[i][j]);
- CVALEVEN[i][j] = mad(AVALODD[%VFORINDEX][i], -BVALODD[j]%VFORSUFFIX, CVALEVEN[i][j]);
- CVALODD[i][j] = mad(AVALODD[%VFORINDEX][i], BVALEVEN[j]%VFORSUFFIX, CVALODD[i][j]);
- }
- #endif
- }
- }
- }
-
- #ifdef GEMM_NEEDS_BARRIER
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
- }
-
- #ifdef COMPLEX
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- %COMPLEX_JOIN(CVAL[i][j], CVALEVEN[i][j], CVALODD[i][j]);
- }
- }
- #endif
-
- //
- // Tail blocks never execute this FOR loop as they execute with Vector Width of 1
- //
-
-
- for(; ACOL < ACOLEND; ACOL ++)
- {
- //
- // Load B values
- //
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint bcol = 0; bcol < %ITEMX; bcol++)
- {
- //
- // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
- //
- #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_LEFT__)
- {
- %TYPE SCAL;
- #ifndef N_TAIL_PRESENT
- SCAL = B[ACOL + (colB + bcol)*ldb];
- BVAL[bcol] = %VMAKEVEC(SCAL);
- #else
- SCAL = B[ACOL + ((colB + bcol)%(N))*ldb];
- BVAL[bcol] = %VMAKEVEC(SCAL);
- #endif
- }
- #else
- // SYMM_DIAGONAL && SYMM_RIGHT
- {
- %TYPE SCAL;
-
- #ifndef N_TAIL_PRESENT
- SCAL = SYMM_SCALAR_LOAD(B, N, ldb, ACOL, (colB + bcol));
- BVAL[bcol] = %VMAKEVEC(SCAL);
- #else
- SCAL = SYMM_SCALAR_LOAD(B, N, ldb, ACOL, ((colB + bcol)%(N)));
- BVAL[bcol] = %VMAKEVEC(SCAL);
- #endif
- }
- #endif
- }
-
- //
- // Load A values
- //
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i = 0; i < (%ITEMY_BY_V); i++) // 1 * ITEMY/V
- {
- #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_RIGHT__)
- #ifndef M_TAIL_PRESENT
- AVAL[0][i] = %VLOAD(0, (&A[(rowA + i*threadsY*(V)) + (ACOL)*lda]) );
- #else
- AVAL[0][i] = %VLOAD(0, (&A[(((rowA + i*threadsY*(V))) % (MV)) + (ACOL)*lda]) );
- #endif
- #else
- // defined(DIAGONAL) && (LEFT)
- #ifndef M_TAIL_PRESENT
- AVAL[0][i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, (rowA + i*threadsY*(V)) , (ACOL));
- #else
- AVAL[0][i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ((rowA + i*threadsY*(V)) % (MV)), (ACOL));
- #endif
- #endif
- }
-
- {
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i<(%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- %VMAD(CVAL[i][j] , AVAL[0][i] , BVAL[j]);
- }
- }
- }
- }
-
-
- /*
- if ((get_group_id(0) == 0) && (get_local_id(0) == 0))
- {
- printf(\"Updating C Matrix: Alpha = %f, Beta = %f\\n\", alpha, beta);
- }
- */
- //
- // STORE Result in C
- //
- %TYPE%V reg , betareg, alphareg;
- %TYPE%V alphav, betav;
- alphav = %VMAKEVEC(alpha);
- betav = %VMAKEVEC(beta);
-
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
- reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc]));
- %VMUL(betareg, betav, reg);
- %VMUL(alphareg, alphav, CVAL[i][j]);
- %ADD( reg, betareg, alphareg);
- %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
- #else
- if (((rowA + i*threadsY*V) < MV) && ((colB + j) < N))
- {
- reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc]));
- %VMUL(betareg, betav, reg);
- %VMUL(alphareg, alphav, CVAL[i][j]);
- %ADD( reg, betareg, alphareg);
- %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
- }
- #endif
- }
- }
- return;
-}
-";
-
-static const char *GEMM_NT_KERNEL = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-//#undef COMPLEX
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
-__kernel void GEMM_NT__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C,
- uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc,
- %TYPE alpha, %TYPE beta
- #ifdef TAIL_RUN
- , uint tailStartM, uint tailStartN
- #endif
- )
-{
- const int V = %V;
- __global %TYPE const *restrict A;
- __global %TYPE const *restrict B;
- __global %TYPE *C = _C + offc;
- uint K = _K;
- uint lda, ldb;
- uint rowA, colA, rowB, colB, rowC, colC;
- uint numGroupsOnY;
- uint row, col;
- uint tid = get_local_id(0);
- int panel;
- int ACOLSTART, ACOLEND;
- uint MV, NV;
-
- //
- // %WIDTH - Preferably 16
-    // %ITEMY, %ITEMX - 1 thread is responsible for an %ITEMY * %ITEMX sub-matrix of C
-    // %ITEMY and %ITEMX must be divisible by %V for the NT kernel
-    // The entire workgroup loops together to complete the ITEMY x ITEMX sub-matrix
- //
- uint threadsY = %WIDTH;
- uint threadsX = get_local_size(0)/threadsY;
-
- //
- // Column-Major ordering of Workgroups
- //
-    // %ITEMY - Number of elements a work-item processes in the Y direction.
-    // %ITEMX - Number of elements a work-item processes in the X direction.
-    //
-    // %V - Vectoring Width
-    // %PANEL(*) - Panel Width to access Rows of A and Columns of B
-    //             Right now, %V is assumed to be the panel width.
-    //             We don't use %PANEL in the current implementation.
- //
- MV = M;
- NV = N;
- #ifndef TAIL_RUN
- {
- uint bidX, bidY;
- uint blockDimY;
-
- #ifdef M_TAIL_PRESENT
- MV = M - (M % (%V));
- if (MV == 0)
- {
- return;
- }
- #endif
- #ifdef N_TAIL_PRESENT
- NV = N - (N% (%V));
- if (NV == 0)
- {
- return;
- }
- #endif
- blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1;
- uint blockID = get_group_id(0);
- getBlockNumber(blockDimY, blockID, &bidY, &bidX, 1);
-
- //
- // <row,col> is the left-top of the TILE region
- // in the output C matrix that will be determined
- // by this workgroup
- //
- row = (bidY * (threadsY * %ITEMY));
- col = (bidX * (threadsX * %ITEMX));
- }
- #else
- {
- uint nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsA;
- uint bidY, bidX;
-
- MV = M;
- if (M == tailStartM)
- {
- nWorkGroupsA = 0;
- } else {
- nWorkGroupsAY = ((M - tailStartM - 1)/threadsY + 1);
- nWorkGroupsAX = ((tailStartN - 1)/threadsX + 1);
- nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX;
- }
- if (get_group_id(0) < nWorkGroupsA)
- {
- bidY = get_group_id(0) % (nWorkGroupsAY);
- bidX = get_group_id(0) / nWorkGroupsAY;
- row = tailStartM + (bidY * threadsY * %ITEMY);
- col = (bidX * threadsX * %ITEMX);
- NV = tailStartN;
- } else {
- uint nWorkGroupsBY, nWorkGroupsBX;
-
- nWorkGroupsBY = ((M-1)/threadsY) + 1;
- nWorkGroupsBX = ((N-tailStartN-1)/threadsX) + 1;
- bidY = (get_group_id(0) - nWorkGroupsA) % (nWorkGroupsBY);
- bidX = (get_group_id(0) - nWorkGroupsA) / nWorkGroupsBY;
- row = (bidY * threadsY * %ITEMY);
- col = tailStartN + (bidX * threadsX * %ITEMX);
- NV = N;
- }
-
- }
- #endif
-
- //
- // ACOLSTART, ACOLEND
- // SYMM Matrix multiplication proceeds by multiplying panels on A's block-row
- // with panels on B's block-column.
-    // However, due to the symmetric nature of the A matrix, compounded by the fact that
-    // only the upper OR lower triangle of the symm matrix is available, vector loads
-    // are not possible while traversing certain regions of the matrix.
-    // ACOLSTART and ACOLEND signify what portion of SYMM can be achieved through
-    // this NT kernel. The SYMM handler has to compose the SYMM in terms of GEMM kernels.
- //
-#ifdef __SYMM_LEFT__
- #error GEMM_NT_KERNEL Should not be called in __SYMM_LEFT__ case!
-#elif defined(__SYMM_RIGHT__)
- // MxN * NxN
- A = _B + offb;
- lda = _ldb;
- B = _A + offa;
- ldb = _lda;
- K = N;
- #ifndef __SYMM_DIAGONAL__
- #ifdef __SYMM_UPPER__
- ACOLSTART = col + (threadsX*(%ITEMX));
- ACOLEND = K;
- #elif defined(__SYMM_LOWER__)
- ACOLSTART = 0;
- ACOLEND = col;
- #else
- #error GEMM_NT_KERNEL : Neither SYMM_UPPER nor SYMM_LOWER is defined!
- #endif
- #else
- ACOLSTART = col;
- ACOLEND = col + (threadsX*(%ITEMX));
- #endif
- if (ACOLEND > K)
- {
- ACOLEND = K;
- }
-#else // GEMM
- A = _A + offa;
- B = _B + offb;
- K = _K;
- lda = _lda;
- ldb = _ldb;
- ACOLSTART = 0;
- ACOLEND = K;
-#endif
-
- uint offsetY = (tid % threadsY) * %V;
- uint offsetX = (tid / threadsY) * %ITEMX;
- rowA = row + offsetY;
- colB = col + offsetX;
- #ifndef TAIL_RUN
- bool tailBlock = ((row >= M) || (col >= N));
- #else
- bool tailBlock = ((row >= tailStartM) || (col >= tailStartN));
- #endif
-
- /* Should be handled with TAIL_PRESENT Macros.
- if ((rowA >= M) || (colB >= N))
- {
- return;
- }
- */
-
- #ifndef TAIL_RUN
- // Non-tail RUN
- if (tailBlock == true)
- {
- return;
- }
- #else
- // TAIL RUN - This case never happens.
- if (tailBlock == false)
- {
- return;
- }
- #endif
-
- %TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX];
- #ifdef COMPLEX
- %TYPE%HV CVALEVEN[(%ITEMY_BY_V)][%ITEMX];
- %TYPE%HV CVALODD[(%ITEMY_BY_V)][%ITEMX];
- #endif
-
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- CVAL[i][j] = (%TYPE%V) 0;
- #ifdef COMPLEX
- CVALEVEN[i][j] = (%TYPE%HV) 0;
- CVALODD[i][j] = (%TYPE%HV) 0;
- #endif
- }
- }
-
- uint ACOL;
- for(ACOL=ACOLSTART; ((ACOL+%V-1) < ACOLEND); ACOL += %V /* %PANEL */)
- {
- %TYPE%V AVAL[%V][(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V]
- %TYPE%V BVAL[%ITEMX_BY_V][%V]; // [%PANEL][%ITEMX]
- #ifdef COMPLEX
- %TYPE%HV AVALEVEN[%V][(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V]
- %TYPE%HV AVALODD[%V][(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V]
- %TYPE%HV BVALEVEN[%ITEMX_BY_V][%V]; // [%PANEL][%ITEMX]
- %TYPE%HV BVALODD[%ITEMX_BY_V][%V]; // [%PANEL][%ITEMX]
- #endif
-
- {
- //
- // Load B values
- //
- %IF(%V) #pragma unroll %V
- for(uint panel=0; panel < %V; panel++)
- {
- %IF(%ITEMX_BY_V) #pragma unroll %ITEMX_BY_V
- for(uint bcol = 0; bcol < %ITEMX_BY_V; bcol++)
- {
- //
- // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
- //
- #ifndef __SYMM_DIAGONAL__
- #ifndef N_TAIL_PRESENT
- BVAL[bcol][panel] = %VLOAD(0, (&B[(ACOL + panel)*ldb + (colB + bcol*(V))]));
- #else
- BVAL[bcol][panel] = %VLOAD(0, (&B[(ACOL + panel)*ldb + ((colB + bcol*V) % NV)]));
- #endif
- #else
- #ifndef N_TAIL_PRESENT
- BVAL[bcol][panel] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, (colB + bcol*(V)), (ACOL + panel));
- #else
- BVAL[bcol][panel] =
- SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ((colB + bcol*V) % NV), (ACOL + panel));
- #endif
- #endif
-
- #ifdef CONJUGATE_B
- %TYPE%V conjTemp = BVAL[bcol][panel];
- %CONJUGATE(1, conjTemp);
- BVAL[bcol][panel] = conjTemp;
- #endif
- #ifdef COMPLEX
- {
- BVALEVEN[bcol][panel] = BVAL[bcol][panel].even;
- BVALODD[bcol][panel] = BVAL[bcol][panel].odd;
- }
- #endif
- }
- }
-
- //
- // Load A values
- //
- %IF(%ITEMY) #pragma unroll %ITEMY
- for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++)
- {
- const uint yiterations = %ITEMY_BY_V;
- uint c = (i / yiterations);
- uint r = (i % yiterations);
-
- #ifndef M_TAIL_PRESENT
- AVAL[c][r] = %VLOAD(0, (&A[(rowA + r*threadsY*(V)) + (ACOL + c)*lda]) );
- #else
- AVAL[c][r] = %VLOAD(0, (&A[((rowA + r*threadsY*(V)) % MV) + (ACOL + c)*lda]) );
- #endif
-
- #ifdef COMPLEX
- AVALEVEN[c][r] = AVAL[c][r].even;
- AVALODD[c][r] = AVAL[c][r].odd;
- #endif
- }
- }
-
- %IF(%V) #pragma unroll %V
- for(uint panel=0; panel<(%V); panel++)
- {
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i<(%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX_BY_V) #pragma unroll %ITEMX_BY_V
- for(uint j=0; j<(%ITEMX_BY_V); j++)
- {
- const int CX = j * (%V);
-
- #ifndef COMPLEX
- %VFOR_REAL
- {
- CVAL[i][CX + %VFORINDEX] = mad(AVAL[panel][i], BVAL[j][panel]%VFORSUFFIX,
- CVAL[i][CX + %VFORINDEX]);
- }
- #else
- //
- // PENDING: Replace with %COMPLEX_MAD op
- //
- %VFOR_REAL
- {
- CVALEVEN[i][CX + %VFORINDEX] =
- mad(AVALEVEN[panel][i], BVALEVEN[j][panel]%VFORSUFFIX, CVALEVEN[i][CX + %VFORINDEX]);
- CVALODD[i][CX + %VFORINDEX] =
- mad(AVALEVEN[panel][i], BVALODD[j][panel]%VFORSUFFIX, CVALODD[i][CX + %VFORINDEX]);
- CVALEVEN[i][CX + %VFORINDEX] =
- mad(AVALODD[panel][i], -BVALODD[j][panel]%VFORSUFFIX, CVALEVEN[i][CX + %VFORINDEX]);
- CVALODD[i][CX + %VFORINDEX] =
- mad(AVALODD[panel][i], BVALEVEN[j][panel]%VFORSUFFIX, CVALODD[i][CX + %VFORINDEX]);
- }
- #endif
- }
- }
- }
-
- #ifdef GEMM_NEEDS_BARRIER
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
- }
-
- #ifdef COMPLEX
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- %COMPLEX_JOIN(CVAL[i][j], CVALEVEN[i][j], CVALODD[i][j]);
- }
- }
- #endif
-
- //
- // Tail blocks never execute this FOR loop as they execute with Vector Width of 1
- //
-
- for(; ACOL < ACOLEND; ACOL ++)
- {
- %TYPE%V AVAL[(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V]
- %TYPE BVAL[%ITEMX]; // [%PANEL][%ITEMX]
-
- //
- // Load B values
- //
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint bcol = 0; bcol < %ITEMX; bcol++)
- {
- %TYPE SCALAR;
- //
- // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
- //
- {
- #ifndef __SYMM_DIAGONAL__
- #ifndef N_TAIL_PRESENT
- SCALAR = B[ACOL*ldb + (colB + bcol)];
- #else
- SCALAR = B[ACOL*ldb + ((colB + bcol) % NV)];
- #endif
- #else
- #ifndef N_TAIL_PRESENT
- SCALAR = SYMM_SCALAR_LOAD(B, N, ldb, (colB + bcol), ACOL );
- #else
- SCALAR = SYMM_SCALAR_LOAD(B, N, ldb, ((colB + bcol) % NV), ACOL);
- #endif
- #endif
-
- #ifdef CONJUGATE_B
- %CONJUGATE(1, SCALAR);
- #endif
- BVAL[bcol] = (SCALAR);
- }
- }
-
- //
- // Load A values
- //
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i = 0; i < (%ITEMY_BY_V); i++) // 1 * ITEMY/V
- {
- #ifndef M_TAIL_PRESENT
- AVAL[i] = %VLOAD(0, (&A[(rowA + i*threadsY*(V)) + (ACOL)*lda]) );
- #else
- AVAL[i] = %VLOAD(0, (&A[((rowA + i*threadsY*(V)) % MV) + (ACOL)*lda]) );
- #endif
- }
-
- {
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i<(%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- %VMAD(CVAL[i][j] , AVAL[i] , BVAL[j]);
- }
- }
- }
- }
-
- //
- // STORE Result in C
- //
- %TYPE%V reg , betareg, alphareg;
- %TYPE%V alphav, betav;
- alphav = %VMAKEVEC(alpha);
- betav = %VMAKEVEC(beta);
-
- #ifndef HERK
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
- reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc]));
- %VMUL(betareg, betav, reg);
- %VMUL(alphareg, alphav, CVAL[i][j]);
- %ADD( reg, betareg, alphareg);
- %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
- #else
- if (((rowA + i*threadsY*V) < MV) && ((colB+j) < NV))
- {
- reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc]));
- %VMUL(betareg, betav, reg);
- %VMUL(alphareg, alphav, CVAL[i][j]);
- %ADD( reg, betareg, alphareg);
- %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc]));
- }
- #endif
- }
- }
- #else
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i<(%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- int actualRow = rowA + i*threadsY*V;
- int actualCol = colB + j;
- #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
- {
- %VMUL(alphareg, alphav, CVAL[i][j]);
- //%TYPE temp[%V];
- //*(__private %TYPE%V *)(&temp) = alphareg;
- //#pragma unroll %V
- //for(uint r = 0; r < %V; r++)
- %VFOR
- {
- #ifdef HERK_LOWER_TRIANGLE
- if((actualRow + %VFORINDEX) >= (actualCol))
- #else
- if((actualRow + %VFORINDEX) <= (actualCol))
- #endif
- {
- %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc];
- %TYPE beta_s;
- %MUL(beta_s, beta, C_s);
- C_s = alphareg%VFORSUFFIX + beta_s;
- if((%VFORINDEX + actualRow) == actualCol)
- {
- C_s.odd = 0.0f;
- }
- C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
- }
- }
- }
- #else
- {
- if (((rowA + i*threadsY*V) < MV) && ((colB+j) < NV))
- {
- %VMUL(alphareg, alphav, CVAL[i][j]);
- //%TYPE temp[%V];
- //*(__private %TYPE%V *)(&temp) = alphareg;
- //#pragma unroll %V
- //for(uint r = 0; r < %V; r++)
- %VFOR
- {
- #ifdef HERK_LOWER_TRIANGLE
- if((%VFORINDEX + actualRow) >= (actualCol))
- #else
- if((%VFORINDEX + actualRow) <= (actualCol))
- #endif
- {
- %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc];
- %TYPE beta_s;
- %MUL(beta_s, beta, C_s);
- C_s = alphareg%VFORSUFFIX + beta_s;
- if((%VFORINDEX + actualRow) == actualCol)
- {
- C_s.odd = 0.0f;
- }
- C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
- }
- }
- }
- }
- #endif
- }
- }
- #endif
- return;
-}
-";
-
-
-static const char *GEMM_TN_KERNEL = "
-#ifdef DOUBLE_PRECISION
- #ifdef cl_khr_fp64
- #pragma OPENCL EXTENSION cl_khr_fp64 : enable
- #else
- #pragma OPENCL EXTENSION cl_amd_fp64 : enable
- #endif
-#endif
-
-//#pragma OPENCL EXTENSION cl_amd_printf : enable
-__kernel void GEMM_TN__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C,
- uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc,
- %TYPE alpha, %TYPE beta
- #ifdef TAIL_RUN
- , uint tailStartM, uint tailStartN
- #endif
- )
-{
- const int V = %V;
- const int ITEMY = %ITEMY;
- __global %TYPE const *restrict A;
- __global %TYPE const *restrict B;
- __global %TYPE *C = _C + offc;
- uint K = _K;
- uint lda, ldb;
- uint rowA, colA, rowB, colB, rowC, colC;
- uint numGroupsOnY;
- uint row, col;
- uint tid = get_local_id(0);
- int panel;
- int ACOLSTART, ACOLEND;
- uint MV, bidX;
- uint bidY;
- uint blockDimX;
-
- //
- // %WIDTH - Preferably 16
-    // %ITEMY, %ITEMX - 1 thread is responsible for an %ITEMY * %ITEMX sub-matrix of C
-    // %ITEMY must be divisible by %V for the TN kernel
-    // The entire workgroup loops together to complete the ITEMY x ITEMX sub-matrix
- //
- uint threadsY = %WIDTH;
- uint threadsX = get_local_size(0)/threadsY;
-
- //
- // Row-Major ordering of Workgroups
- //
-    // %ITEMY - Number of elements a work-item processes in the Y direction.
-    // %ITEMX - Number of elements a work-item processes in the X direction.
-    //
-    // %V - Vectoring Width
-    // %PANEL(*) - Panel Width to access Rows of A and Columns of B
-    //             Right now, %V is assumed to be the panel width.
-    //             We don't use %PANEL in the current implementation.
- //
- MV = M;
- #ifndef TAIL_RUN
- {
-
- blockDimX = ((N-1) / (threadsX * %ITEMX)) + 1;
- uint blockID = get_group_id(0);
- getBlockNumber(blockDimX, blockID, &bidY, &bidX, 0);
-
- //
- // <row,col> is the left-top of the TILE region
- // in the output C matrix that will be determined
- // by this workgroup
- //
- row = (bidY * (threadsY * %ITEMY));
- col = (bidX * (threadsX * %ITEMX));
- }
- #else
- #error GEMM_TN_KERNEL: TAIL_RUN is NOT needed for TN Kernel!
- #endif
-
- //
- // ACOLSTART, ACOLEND
- // SYMM Matrix multiplication proceeds by multiplying panels on A's block-row
- // with panels on B's block-column.
-    // However, due to the symmetric nature of the A/B matrix, compounded by the fact that
-    // only the upper OR lower triangle of the symm matrix is available, vector loads
-    // are not possible while traversing certain regions of the matrix.
-    // ACOLSTART and ACOLEND signify what portion of SYMM can be achieved through
-    // this TN kernel. The SYMM handler has to compose the SYMM in terms of GEMM kernels.
-    // SYMMETRIC LOAD routines are used when traversing the diagonal region, whereas normal rules
-    // hold otherwise.
- //
-#ifdef __SYMM_LEFT__
- // MxM * MxN
- A = _A + offa;
- lda = _lda;
- B = _B + offb;
- ldb = _ldb;
- K = M;
- #ifndef __SYMM_DIAGONAL__
- #ifdef __SYMM_LOWER__
- ACOLSTART = row + (threadsY * %ITEMY);
- ACOLEND = K;
- /*
- if (get_local_id(0) == 0)
- {
- printf(\"GEMM_TN_KERNEL: SYMM_LOWER: Setting ACOLSTART to %d, ACOLEND = %d\\n\", ACOLSTART, ACOLEND);
- }
- */
- #elif defined(__SYMM_UPPER__)
- ACOLSTART = 0;
- ACOLEND = row;
- #else
- #error GEMM_TN_KERNEL
- #endif
- #else
- ACOLSTART = row;
- ACOLEND = row + (threadsY * %ITEMY);
- #endif
- if (ACOLEND > K)
- {
- ACOLEND = K;
- }
-#elif defined(__SYMM_RIGHT__)
- // MxN * NxN
- #error GEMM_TN_KERNEL: Internal Error: Should not be called in SYMM_RIGHT case! Right is Wrong!
-#else
- // GEMM Case
- A = _A + offa;
- B = _B + offb;
- K = _K;
- lda = _lda;
- ldb = _ldb;
- ACOLSTART = 0;
- ACOLEND = K;
-#endif
-
- uint offsetX = (tid % threadsX) * %ITEMX;
- uint offsetY = (tid / threadsX) * %ITEMY;
- rowA = (row + offsetY);
- colB = (col + offsetX);
- #ifndef TAIL_RUN
- bool tailBlock = ((row >= M) || (col >= N));
- #else
- #error GEMM_TN_KERNEL: No TAIL_RUN for TN case
- #endif
-
- %TYPE%V AVAL[%ITEMY]; // %ITEMY * %PANEL
- #ifdef COMPLEX
- %TYPE%HV AVALEVEN[%ITEMY]; // %ITEMY * %PANEL
- %TYPE%HV AVALODD[%ITEMY]; // %ITEMY * %PANEL
- #endif
-
- %TYPE%V BVAL[%ITEMX];
- #ifdef COMPLEX
- %TYPE%HV BVALEVEN[%ITEMX]; // %ITEMY * %PANEL
- %TYPE%HV BVALODD[%ITEMX]; // %ITEMY * %PANEL
- #endif
-
- %TYPE CVAL[%ITEMY][%ITEMX];
- #ifdef COMPLEX
- %TYPE%HV CVALEVEN[%ITEMY][%ITEMX]; // %ITEMY * %PANEL
- %TYPE%HV CVALODD[%ITEMY][%ITEMX]; // %ITEMY * %PANEL
- #endif
-
- %IF(%ITEMY) #pragma unroll %ITEMY
- for(uint i=0; i< (%ITEMY); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- #ifdef COMPLEX
- CVAL[i][j] = (%TYPE) 0;
- CVALEVEN[i][j] = (%TYPE%HV) 0;
- CVALODD[i][j] = (%TYPE%HV) 0;
- #else
- CVAL[i][j] = (%TYPE) 0;
- #endif
- }
- }
-
- int ACOL;
- uint actualCol;
- uint actualRow;
- int ACOLENDV;
- int numIterations = (ACOLEND - ACOLSTART) / (%V) ;
-
- if (numIterations >= 0)
- {
- ACOLENDV = ACOLSTART + (numIterations * (%V));
- } else {
- ACOLENDV = ACOLEND;
- }
-
-
-    if (ldb % (512) == 0) // PENDING: 512 needs to be configurable
- {
- //
-        // ASSUMPTION(SYMM Variants): \"ACOLSTART\" is perfectly divisible by \"%V\".
-        //                            ACOLSTART depends on the tile size in the Y direction.
-        //                            Since vector sizes are only ever 1, 2, 4, 8 or 16, we can assume that
-        //                            this is indeed the case.
- //
-
- //
-        // Assumption is that 32/16/8 is divisible by any value of %V
- //
- int num32Iterations = (ACOLENDV - ACOLSTART) / (32/(sizeof(%TYPE)/sizeof(float)));
- if (num32Iterations <= 0)
- {
- ACOL = ACOLSTART;
- } else {
- int startIteration = bidX % num32Iterations;
- ACOL = ACOLSTART + ( startIteration * (32/(sizeof(%TYPE)/sizeof(float))));
- }
- } else {
- ACOL = ACOLSTART;
- }
-
- for(int itr=0; itr<numIterations; itr++)
- {
- {
- //
- // Load A values
- //
- %IF(%ITEMY) #pragma unroll %ITEMY
- for(int i = 0; i < %ITEMY; i++)
- {
- #ifndef __SYMM_DIAGONAL__
- #ifndef M_TAIL_PRESENT
- AVAL[i] = %VLOAD(0, (&A[(rowA + i)*lda + ACOL]) );
- #else
- actualRow = ((rowA + i) >= MV) ? (MV-1) : (rowA + i);
- AVAL[i] = %VLOAD(0, (&A[actualRow*lda + ACOL]) );
- #endif
- #else
- #ifndef M_TAIL_PRESENT
- AVAL[i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ACOL, (rowA+i));
- #else
- actualRow = ((rowA + i) >= MV) ? (MV-1) : (rowA + i);
- AVAL[i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ACOL, actualRow);
- #endif
- #endif
-
- #ifdef CONJUGATE_A
- %TYPE%V conjTemp = AVAL[i];
- %CONJUGATE(1, conjTemp);
- AVAL[i] = conjTemp;
- #endif
-
- #ifdef COMPLEX
- AVALEVEN[i] = AVAL[i].even;
- AVALODD[i] = AVAL[i].odd;
- #endif
- }
-
- //
- // Load B values
- //
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(int j=0; j<(%ITEMX); j++)
- {
- #ifndef N_TAIL_PRESENT
- BVAL[j] = %VLOAD(0, (&B[ACOL + (colB + j)*ldb]));
- #else
- actualCol = ((colB + j) >= N) ? (N-1) : (colB + j);
- BVAL[j] = %VLOAD(0, (&B[ACOL + (actualCol)*ldb]));
- #endif
-
- #ifdef COMPLEX
- BVALEVEN[j] = BVAL[j].even;
- BVALODD[j] = BVAL[j].odd;
- #endif
- }
- } // LOAD A and B Over
-
-
- // MATH Begin
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(int j=0; j<(%ITEMX); j++)
- {
- %IF(%ITEMY) #pragma unroll %ITEMY
- for(int i=0; i<(%ITEMY); i++)
- {
- #ifndef COMPLEX
- %VMAD_AND_REDUCE(CVAL[i][j] , AVAL[i], BVAL[j]);
- #else
- CVALEVEN[i][j] = mad(AVALEVEN[i], BVALEVEN[j], CVALEVEN[i][j]);
- CVALEVEN[i][j] = mad(AVALODD[i], -BVALODD[j], CVALEVEN[i][j]);
- CVALODD[i][j] = mad(AVALEVEN[i], BVALODD[j], CVALODD[i][j]);
- CVALODD[i][j] = mad(AVALODD[i], BVALEVEN[j], CVALODD[i][j]);
- /*
- EVENSUM = AVALEVEN[i] * BVALEVEN[j];
- EVENSUM = mad(AVALODD[i], -BVALODD[j], EVENSUM);
- ODDSUM = AVALEVEN[i]*BVALODD[j];
- ODDSUM = mad(AVALODD[i], BVALEVEN[j], ODDSUM);
- CVAL[i][j].S0 += EVENSUM.S0 + EVENSUM.S1;
- CVAL[i][j].S1 += ODDSUM.S0 + ODDSUM.S1;
- */
- #endif
- }
- }
-
- ACOL = ((ACOL + %V) == ACOLENDV) ? ACOLSTART : (ACOL + %V); //%PANEL
- }
-
- #ifdef COMPLEX
- {
- %IF(%ITEMY) #pragma unroll %ITEMY
- for(uint i=0; i< (%ITEMY); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- CVAL[i][j].even = %REDUCE_SUM_REAL_HV(CVALEVEN[i][j]);
- CVAL[i][j].odd = %REDUCE_SUM_REAL_HV(CVALODD[i][j]);
- }
- }
- }
- #endif
-
- ACOL = ACOLENDV;
-
- for(; ACOL < ACOLEND; ACOL ++)
- {
- //
- // Load B values
- //
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint bcol = 0; bcol < %ITEMX; bcol++)
- {
- //
- // PENDING: PANEL iteration to Load the Panel Depth iterating by %V
- //
- #ifndef N_TAIL_PRESENT
- BVAL[bcol] = %VMAKEVEC(B[ACOL + (colB + bcol)*ldb]);
- #else
- BVAL[bcol] = %VMAKEVEC(B[ACOL + ((colB + bcol)%(N))*ldb]);
- #endif
- }
-
- //
- // Load A values
- //
- %IF(%ITEMY) #pragma unroll %ITEMY
- for(uint i = 0; i < (%ITEMY); i++) // 1 * ITEMY/V
- {
- #ifndef __SYMM_DIAGONAL__
- {
- #ifndef M_TAIL_PRESENT
- AVAL[i] = %VMAKEVEC(A[(rowA + i)*lda + ACOL]);
- #else
- AVAL[i] = %VMAKEVEC(A[((rowA + i) % MV)*lda + ACOL]);
- #endif
- }
- #else
- {
- %TYPE t;
- #ifndef M_TAIL_PRESENT
- t = SYMM_SCALAR_LOAD(A, M, lda, ACOL, (rowA+i) );
- #else
- t = SYMM_SCALAR_LOAD(A, M, lda, ACOL, ((rowA + i) % MV));
- #endif
- AVAL[i] = %VMAKEVEC(t);
- }
- #endif
- #ifdef CONJUGATE_A
- %CONJUGATE(1, AVAL[i]);
- #endif
- }
-
- {
- %IF(%ITEMY) #pragma unroll %ITEMY
- for(uint i=0; i<(%ITEMY); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- %MAD_AND_REDUCE(CVAL[i][j] , AVAL[i] , BVAL[j]);
- }
- }
- }
- }
-
-
- //
- // STORE Result in C
- //
- %TYPE%V reg , betareg, alphareg;
- %TYPE reg_s , betareg_s, alphareg_s;
- %TYPE%V alphav, betav;
- alphav = %VMAKEVEC(alpha);
- betav = %VMAKEVEC(beta);
- //%TYPE CVALV_TEMP[%V];
- %TYPE%V CVALV;
-
- #ifndef HERK
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- //#pragma unroll %V
- //for(uint k=0; k< (%V); k++)
- %VFOR
- {
- CVALV%VFORSUFFIX = CVAL[i*V + %VFORINDEX][j];
- }
- //CVALV = *(__private %TYPE%V *)CVALV_TEMP;
-
- #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
- reg = %VLOAD(0, (&C[(rowA + i*V) + (colB+j)*ldc]));
- %VMUL(betareg, betav, reg);
- %VMUL(alphareg, alphav, CVALV);
- %ADD( reg, betareg, alphareg);
- %VSTORE(reg, 0, (&C[(rowA + i*V) + (colB+j)*ldc]));
- #else
- if (((rowA + i*V + V - 1) < M) && ((colB + j) < N))
- {
- reg = %VLOAD(0, (&C[rowA + i*V + (colB+j)*ldc]));
- %VMUL(betareg, betav, reg);
- %VMUL(alphareg, alphav, CVALV);
- %ADD( reg, betareg, alphareg);
- %VSTORE(reg, 0, (&C[(rowA + i*V) + (colB+j)*ldc]));
- } else {
- if ((colB + j) < N)
- {
- //%TYPE TEMP[%V];
- //*(__private %TYPE%V *) TEMP = CVALV;
- //#pragma unroll %V
- //for(uint v=0; ((v< %V) && ((rowA + (i * %V) + v) < M) ); v++)
- %VFOR
- {
- if (((rowA + (i * %V) + %VFORINDEX) < M) )
- {
- %TYPE c;
-
- c = C[rowA + i*V + %VFORINDEX + (colB+j)*ldc];
- %MUL(betareg_s, c, beta);
- c = CVALV%VFORSUFFIX;
- %MUL(alphareg_s, c, alpha);
- %ADD(c, betareg_s, alphareg_s);
- C[rowA + i*V + %VFORINDEX + (colB+j)*ldc] = c;
- }
- }
- }
- }
- #endif
- }
- }
- #else
- %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V
- for(uint i=0; i< (%ITEMY_BY_V); i++)
- {
- %IF(%ITEMX) #pragma unroll %ITEMX
- for(uint j=0; j<(%ITEMX); j++)
- {
- int actualRow = rowA + i*V;
- int actualCol = colB + j;
-
- //#pragma unroll %V
- //for(uint k=0; k< (%V); k++)
- %VFOR
- {
- CVALV%VFORSUFFIX = CVAL[i*V + %VFORINDEX][j];
- }
- //CVALV = *(__private %TYPE%V *)CVALV_TEMP;
-
- #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT)
- %VMUL(alphareg, alphav, CVALV);
- //%TYPE temp[%V];
- //*(__private %TYPE%V *)(&temp) = alphareg;
- //#pragma unroll %V
- //for(uint r = 0; r < %V; r++)
- %VFOR
- {
- #ifdef HERK_LOWER_TRIANGLE
- if((%VFORINDEX + actualRow) >= (actualCol))
- #else
- if((%VFORINDEX + actualRow) <= (actualCol))
- #endif
- {
- %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc];
- %TYPE beta_s;
- %MUL(beta_s, beta, C_s);
- C_s = alphareg%VFORSUFFIX + beta_s;
- if((%VFORINDEX + actualRow) == actualCol)
- {
- C_s.odd = 0.0f;
- }
- C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
- }
- }
- #else
- if (((rowA + i*V + V - 1) < M) && ((colB + j) < N))
- {
- %VMUL(alphareg, alphav, CVALV);
- //%TYPE temp[%V];
- //*(__private %TYPE%V *)(&temp) = alphareg;
- //#pragma unroll %V
- //for(uint r = 0; r < %V; r++)
- %VFOR
- {
- #ifdef HERK_LOWER_TRIANGLE
- if((%VFORINDEX + actualRow) >= (actualCol))
- #else
- if((%VFORINDEX + actualRow) <= (actualCol))
- #endif
- {
- %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc];
- %TYPE beta_s;
- %MUL(beta_s, beta, C_s);
- C_s = alphareg%VFORSUFFIX + beta_s;
- if((%VFORINDEX + actualRow) == actualCol)
- {
- C_s.odd = 0.0f;
- }
- C[%VFORINDEX + actualRow + actualCol * ldc] = C_s;
- }
- }
- }
- else
- {
- if ((colB + j) < N)
- {
- //%TYPE TEMP[%V];
-
- //*(__private %TYPE%V *)(&TEMP) = CVALV;
- //#pragma unroll %V
- //for(uint r=0; ((r< %V) && ((rowA + (i * %V) + r) < M) ); r++)
- %VFOR
- {
- if (((rowA + (i * %V) + %VFORINDEX) < M))
- {
- #ifdef HERK_LOWER_TRIANGLE
- if((%VFORINDEX + actualRow) >= (actualCol))
- #else
- if((%VFORINDEX + actualRow) <= (actualCol))
- #endif
- {
- %TYPE c;
- c = C[%VFORINDEX + actualRow + (actualCol)*ldc];
- %MUL(betareg_s, c, beta);
- c = CVALV%VFORSUFFIX;
- %MUL(alphareg_s, c, alpha);
- %ADD(c, betareg_s, alphareg_s);
- if((%VFORINDEX + actualRow) == (actualCol))
- {
- c.odd = 0.0f;
- }
- C[%VFORINDEX + actualRow + actualCol * ldc] = c;
- }
- }
- }
- }
- }
- #endif
- }
- }
- #endif
- return;
-}
-";
-
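One detail of GEMM_TN_KERNEL worth calling out is the staggered start of the K loop when ldb is a multiple of 512: each workgroup begins reading at a different offset inside [ACOLSTART, ACOLENDV), and the loop later wraps from ACOLENDV back to ACOLSTART, so that all workgroups do not hit the same memory channels at the same time. A sketch of how that starting offset appears to be chosen (illustrative only; strideElems corresponds to 32/(sizeof(%TYPE)/sizeof(float)) in the template, i.e. 128 bytes worth of elements):

static unsigned staggered_k_start(unsigned ldb, unsigned bidX,
                                  unsigned acolStart, unsigned acolEndV,
                                  unsigned strideElems)
{
    /* leading dimensions that are not multiples of 512 pose no conflict risk */
    if (ldb % 512 != 0)
        return acolStart;

    unsigned numStrides = (acolEndV - acolStart) / strideElems;
    if (numStrides == 0)
        return acolStart;

    /* spread workgroups (keyed by their X block id) across the K range */
    return acolStart + (bidX % numStrides) * strideElems;
}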