// ================================================================================================= // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- // width of 100 characters per line. // // Author(s): // Cedric Nugteren // // This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto // et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable // (and tunable!) using more or less the same parameters/naming conventions as in the paper. It // supports single and double precision (SGEMM/DGEMM) through a pre-processor define. // // Matrices are accessed as follows: // A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) // B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) // C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) // // Or as an image (assuming column-major) // K // o-------o // | | // N | [B^T] | // | | // o-------o // K N // o-------o o-----o // M | [A] | M | [C] | // | | | | // o-------o o-----o // // // ================================================================================================= // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string // literal). Comment-out this line for syntax-highlighting when developing. R"( // ================================================================================================= // Parameters set by the tuner or by the database. Here they are given a basic default value in case // this kernel file is used outside of the CLBlast library. #ifndef MWG #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) #endif #ifndef NWG #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) #endif #ifndef KWG #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) #endif #ifndef MDIMC #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) #endif #ifndef NDIMC #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) #endif #ifndef MDIMA #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA #endif #ifndef NDIMB #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB #endif #ifndef KWI #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) #endif #ifndef VWM #define VWM 1 // Vector width of matrices A and C #endif #ifndef VWN #define VWN 1 // Vector width of matrix B #endif #ifndef STRM #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) #endif #ifndef STRN #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) #endif #ifndef SA #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) #endif #ifndef SB #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) #endif // Helper parameters based on the above tuning parameters #define MWI (MWG/MDIMC) // Work per work-item (M-dimension) #define NWI (NWG/NDIMC) // Work per work-item (N-dimension) #define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA #define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB #define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) #define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) #define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) #define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) // Settings #define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually // ================================================================================================= // Data-widths in dimension M #if VWM == 1 typedef real realM; #elif VWM == 2 typedef real2 realM; #elif VWM == 4 typedef real4 realM; #elif VWM == 8 typedef real8 realM; #elif VWM == 16 typedef real16 realM; #endif // Data-widths in dimension N #if VWN == 1 typedef real realN; #elif VWN == 2 typedef real2 realN; #elif VWN == 4 typedef real4 realN; #elif VWN == 8 typedef real8 realN; #elif VWN == 16 typedef real16 realN; #endif // ================================================================================================= // Initializes the accumulation registers to zero inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { #pragma unroll for (int mi=0; mi local (matrix A) #if SA == 1 GlobalToLocalA(agm, alm, kSizeM, tid, kwg); #endif // Loads data: off-chip --> local (matrix B) #if SB == 1 GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); #endif #if SA == 1 || SB == 1 barrier(CLK_LOCAL_MEM_FENCE); #endif // Loops over all workitem tiles, unrolled by a factor KWI for (int pwi=0; pwi private (matrix A) #if SA == 1 LocalToPrivateA(alm, apm, kg); // Loads data: off-chip --> private (matrix A) #else GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); #endif // Loads data: local --> private (matrix B) #if SB == 1 LocalToPrivateB(blm, bpm, kg); // Loads data: off-chip --> private (matrix B) #else GlobalToPrivateB(bgm, bpm, kSizeN, idk); #endif // Performs the accumulation (Cpm += Apm * Bpm) MultiplyAccumulate(cpm, apm, bpm); } } #if SA == 1 || SB == 1 barrier(CLK_LOCAL_MEM_FENCE); #endif } } // ================================================================================================= // The upper-triangular and lower-triangular kernels are only used in special cases #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) // Main entry point of the kernel. This is the upper-triangular version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void XgemmUpper(const int kSizeN, const int kSizeK, const real alpha, const real beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { // Skip these threads if they do not contain threads contributing to the upper-triangle if (get_group_id(1)*NWG < get_group_id(0)*MWG) { return; } // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in register memory realM cpm[NWI][MWI/VWM]; #if SA == 1 && SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); #elif SA == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); #elif SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); #else XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); #endif // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta StoreResults(cgm, cpm, kSizeN, alpha, beta); } // Main entry point of the kernel. This is the lower-triangular version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void XgemmLower(const int kSizeN, const int kSizeK, const real alpha, const real beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { // Skip these threads if they do not contain threads contributing to the lower-triangle if (get_group_id(1)*NWG > get_group_id(0)*MWG) { return; } // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in register memory realM cpm[NWI][MWI/VWM]; #if SA == 1 && SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); #elif SA == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); #elif SB == 1 XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); #else XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); #endif // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta StoreResults(cgm, cpm, kSizeN, alpha, beta); } // ================================================================================================= // If not using a triangular version, include the regular kernel #else // Main entry point of the kernel. This is the regular full version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const real alpha, const real beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; #endif #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif // Computes the matrix-multiplication and stores the result in register memory realM cpm[NWI][MWI/VWM]; #if SA == 1 && SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); #elif SA == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); #elif SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); #else XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); #endif // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta StoreResults(cgm, cpm, kSizeM, alpha, beta); } #endif // ================================================================================================= // End of the C++11 raw string literal )" // =================================================================================================