// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
// pre- and post-processing kernels.
//
// This kernel is separated into three files. This is part 1 out of 3.
//
// =================================================================================================

// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
// literal). Comment-out this line for syntax-highlighting when developing.
R"(

// Parameters set by the tuner or by the database. Here they are given a basic default value in case
// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
// Each default below is guarded by #ifndef, so a value defined before this point takes precedence.
#ifndef WGD
  #define WGD 8      // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
#endif
#ifndef MDIMCD
  #define MDIMCD 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
#endif
#ifndef NDIMCD
  #define NDIMCD 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
#endif
#ifndef MDIMAD
  #define MDIMAD 8    // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#endif
#ifndef NDIMBD
  #define NDIMBD 8    // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#endif
#ifndef KWID
  #define KWID 1      // Unroll factor of the WGD loop (smaller than or equal to WGD)
#endif
#ifndef VWMD
  #define VWMD 1      // Vector width of matrices A and C
#endif
#ifndef VWND
  #define VWND 1      // Vector width of matrix B
#endif
#ifndef PADA
  #define PADA 1      // Local memory padding for matrix A
#endif
#ifndef PADB
  #define PADB 1      // Local memory padding for matrix B
#endif

// Helper parameters based on the above tuning parameters
// NOTE(review): these are integer quotients of the tuning parameters, so any remainder is
// discarded; presumably only evenly dividing combinations are ever selected -- confirm.
#define MWID (WGD/MDIMCD)                // Work per work-item (M-dimension)
#define NWID (WGD/NDIMCD)                // Work per work-item (N-dimension)
#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
#define MWAD (WGD/MDIMAD)                // Amount of loads-per-thread for matrix A (M-dimension)
#define KWAD (WGD/KDIMAD)                // Amount of loads-per-thread for matrix A (K-dimension)
#define KWBD (WGD/KDIMBD)                // Amount of loads-per-thread for matrix B (K-dimension)
#define NWBD (WGD/NDIMBD)                // Amount of loads-per-thread for matrix B (N-dimension)

// =================================================================================================

// Data-widths in dimension M: realMD is the base type 'real' widened to VWMD elements.
// NOTE(review): there is no #else branch, so for a VWMD other than 1, 2, 4, 8, or 16 no typedef is
// emitted and realMD stays undeclared. The types real, real2, real4, real8, and real16 are not
// defined in this part of the file; presumably they come from a shared header -- confirm.
#if VWMD == 1
    typedef real realMD;
#elif VWMD == 2
    typedef real2 realMD;
#elif VWMD == 4
    typedef real4 realMD;
#elif VWMD == 8
    typedef real8 realMD;
#elif VWMD == 16
    typedef real16 realMD;
#endif

// Data-widths in dimension N: realND is the base type 'real' widened to VWND elements.
// NOTE(review): as for realMD, a VWND other than 1, 2, 4, 8, or 16 leaves realND undeclared.
#if VWND == 1
    typedef real realND;
#elif VWND == 2
    typedef real2 realND;
#elif VWND == 4
    typedef real4 realND;
#elif VWND == 8
    typedef real8 realND;
#elif VWND == 16
    typedef real16 realND;
#endif

// =================================================================================================

// Initializes the accumulation registers to zero
inline void InitAccRegistersDirect(real cpm[NWID][MWID]) { #pragma unroll for (int mi=0; mi