From bf84463ab20f2f39071719fad9bd28a6bb13fc24 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 8 Feb 2016 20:06:02 +0100 Subject: Separated the GEMM kernel in two parts to reduce string length for MSVC --- src/kernels/level3/xgemm.opencl | 683 ---------------------------------- src/kernels/level3/xgemm_part1.opencl | 329 ++++++++++++++++ src/kernels/level3/xgemm_part2.opencl | 379 +++++++++++++++++++ 3 files changed, 708 insertions(+), 683 deletions(-) delete mode 100644 src/kernels/level3/xgemm.opencl create mode 100644 src/kernels/level3/xgemm_part1.opencl create mode 100644 src/kernels/level3/xgemm_part2.opencl (limited to 'src/kernels') diff --git a/src/kernels/level3/xgemm.opencl b/src/kernels/level3/xgemm.opencl deleted file mode 100644 index 8db0f557..00000000 --- a/src/kernels/level3/xgemm.opencl +++ /dev/null @@ -1,683 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto -// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable -// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It -// supports single and double precision (SGEMM/DGEMM) through a pre-processor define. -// -// Matrices are accessed as follows: -// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) -// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) -// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) -// -// Or as an image (assuming column-major) -// K -// o-------o -// | | -// N | [B^T] | -// | | -// o-------o -// K N -// o-------o o-----o -// M | [A] | M | [C] | -// | | | | -// o-------o o-----o -// -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef MWG - #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) -#endif -#ifndef NWG - #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) -#endif -#ifndef KWG - #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) -#endif -#ifndef MDIMC - #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) -#endif -#ifndef NDIMC - #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) -#endif -#ifndef MDIMA - #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA -#endif -#ifndef NDIMB - #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB -#endif -#ifndef KWI - #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) -#endif -#ifndef VWM - #define VWM 1 // Vector width of matrices A and C -#endif -#ifndef VWN - #define VWN 1 // Vector width of matrix B -#endif -#ifndef STRM - #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) -#endif -#ifndef STRN - #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) -#endif -#ifndef SA - #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) -#endif -#ifndef SB - #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) -#endif - -// Helper parameters based on the above tuning parameters -#define MWI (MWG/MDIMC) // Work per work-item (M-dimension) -#define NWI (NWG/NDIMC) // Work per work-item (N-dimension) -#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA -#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB -#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) -#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) -#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) -#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) - -// Settings -#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually - -// ================================================================================================= - -// Data-widths in dimension M -#if VWM == 1 - typedef real realM; -#elif VWM == 2 - typedef real2 realM; -#elif VWM == 4 - typedef real4 realM; -#elif VWM == 8 - typedef real8 realM; -#elif VWM == 16 - typedef real16 realM; -#endif - -// Data-widths in dimension N -#if VWN == 1 - typedef real realN; -#elif VWN == 2 - typedef real2 realN; -#elif VWN == 4 - typedef real4 realN; -#elif VWN == 8 - typedef real8 realN; -#elif VWN == 16 - typedef real16 realN; -#endif - -// ================================================================================================= - -// Initializes the accumulation registers to zero -inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { - #pragma unroll - for (int mi=0; mi local (matrix A) - #if SA == 1 - GlobalToLocalA(agm, alm, kSizeM, tid, kwg); - #endif - // Loads data: off-chip --> local (matrix B) - #if SB == 1 - GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); - #endif - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - - // Loops over all workitem tiles, unrolled by a factor KWI - for (int pwi=0; pwi private (matrix A) - #if SA == 1 - LocalToPrivateA(alm, apm, kg); - // Loads data: off-chip --> private (matrix A) - #else - GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); - #endif - - // Loads data: local --> private (matrix B) - #if SB == 1 - LocalToPrivateB(blm, bpm, kg); - // Loads data: off-chip --> private (matrix B) - #else - GlobalToPrivateB(bgm, bpm, kSizeN, idk); - #endif - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulate(cpm, apm, bpm); - } - } - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - } -} - -// ================================================================================================= -// The upper-triangular and lower-triangular kernels are only used in special cases -#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) - -// Main entry point of the kernel. This is the upper-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmUpper(const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - - // Skip these threads if they do not contain threads contributing to the upper-triangle - if (get_group_id(1)*NWG < get_group_id(0)*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// Main entry point of the kernel. This is the lower-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmLower(const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - - // Skip these threads if they do not contain threads contributing to the lower-triangle - if (get_group_id(1)*NWG > get_group_id(0)*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// ================================================================================================= -// If not using a triangular version, include the regular kernel -#else - -// Main entry point of the kernel. This is the regular full version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeM, alpha, beta); -} - -#endif -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl new file mode 100644 index 00000000..4cb0585b --- /dev/null +++ b/src/kernels/level3/xgemm_part1.opencl @@ -0,0 +1,329 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto +// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable +// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It +// supports single and double precision (SGEMM/DGEMM) through a pre-processor define. +// +// Matrices are accessed as follows: +// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) +// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) +// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) +// +// Or as an image (assuming column-major) +// K +// o-------o +// | | +// N | [B^T] | +// | | +// o-------o +// K N +// o-------o o-----o +// M | [A] | M | [C] | +// | | | | +// o-------o o-----o +// +// +// This kernel is seperated into two files. This is part 1 out of 2, +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef MWG + #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) +#endif +#ifndef NWG + #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) +#endif +#ifndef KWG + #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) +#endif +#ifndef MDIMC + #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) +#endif +#ifndef NDIMC + #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) +#endif +#ifndef MDIMA + #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA +#endif +#ifndef NDIMB + #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB +#endif +#ifndef KWI + #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) +#endif +#ifndef VWM + #define VWM 1 // Vector width of matrices A and C +#endif +#ifndef VWN + #define VWN 1 // Vector width of matrix B +#endif +#ifndef STRM + #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) +#endif +#ifndef STRN + #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) +#endif +#ifndef SA + #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) +#endif +#ifndef SB + #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) +#endif + +// Helper parameters based on the above tuning parameters +#define MWI (MWG/MDIMC) // Work per work-item (M-dimension) +#define NWI (NWG/NDIMC) // Work per work-item (N-dimension) +#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA +#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB +#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) +#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) +#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) +#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) + +// Settings +#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually + +// ================================================================================================= + +// Data-widths in dimension M +#if VWM == 1 + typedef real realM; +#elif VWM == 2 + typedef real2 realM; +#elif VWM == 4 + typedef real4 realM; +#elif VWM == 8 + typedef real8 realM; +#elif VWM == 16 + typedef real16 realM; +#endif + +// Data-widths in dimension N +#if VWN == 1 + typedef real realN; +#elif VWN == 2 + typedef real2 realN; +#elif VWN == 4 + typedef real4 realN; +#elif VWN == 8 + typedef real8 realN; +#elif VWN == 16 + typedef real16 realN; +#endif + +// ================================================================================================= + +// Initializes the accumulation registers to zero +inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { + #pragma unroll + for (int mi=0; mi +// +// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// The vectorised multiply-add function +inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { + #if USE_VECTOR_MAD == 1 + cvec += avec * bval; + #else + #if VWM == 1 + MultiplyAdd(cvec, avec, bval); + #elif VWM == 2 + MultiplyAdd(cvec.x , avec.x, bval); + MultiplyAdd(cvec.y , avec.y, bval); + #elif VWM == 4 + MultiplyAdd(cvec.x , avec.x, bval); + MultiplyAdd(cvec.y , avec.y, bval); + MultiplyAdd(cvec.z , avec.z, bval); + MultiplyAdd(cvec.w , avec.w, bval); + #elif VWM == 8 + MultiplyAdd(cvec.s0, avec.s0, bval); + MultiplyAdd(cvec.s1, avec.s1, bval); + MultiplyAdd(cvec.s2, avec.s2, bval); + MultiplyAdd(cvec.s3, avec.s3, bval); + MultiplyAdd(cvec.s4, avec.s4, bval); + MultiplyAdd(cvec.s5, avec.s5, bval); + MultiplyAdd(cvec.s6, avec.s6, bval); + MultiplyAdd(cvec.s7, avec.s7, bval); + #elif VWM == 16 + MultiplyAdd(cvec.s0, avec.s0, bval); + MultiplyAdd(cvec.s1, avec.s1, bval); + MultiplyAdd(cvec.s2, avec.s2, bval); + MultiplyAdd(cvec.s3, avec.s3, bval); + MultiplyAdd(cvec.s4, avec.s4, bval); + MultiplyAdd(cvec.s5, avec.s5, bval); + MultiplyAdd(cvec.s6, avec.s6, bval); + MultiplyAdd(cvec.s7, avec.s7, bval); + MultiplyAdd(cvec.s8, avec.s8, bval); + MultiplyAdd(cvec.s9, avec.s9, bval); + MultiplyAdd(cvec.sA, avec.sA, bval); + MultiplyAdd(cvec.sB, avec.sB, bval); + MultiplyAdd(cvec.sC, avec.sC, bval); + MultiplyAdd(cvec.sD, avec.sD, bval); + MultiplyAdd(cvec.sE, avec.sE, bval); + MultiplyAdd(cvec.sF, avec.sF, bval); + #endif + #endif + return cvec; +} + +// Performs the actual computation: Cpm += Apm * Bpm +inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) { + #pragma unroll + for (int ni=0; ni local (matrix A) + #if SA == 1 + GlobalToLocalA(agm, alm, kSizeM, tid, kwg); + #endif + // Loads data: off-chip --> local (matrix B) + #if SB == 1 + GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); + #endif + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi private (matrix A) + #if SA == 1 + LocalToPrivateA(alm, apm, kg); + // Loads data: off-chip --> private (matrix A) + #else + GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); + #endif + + // Loads data: local --> private (matrix B) + #if SB == 1 + LocalToPrivateB(blm, bpm, kg); + // Loads data: off-chip --> private (matrix B) + #else + GlobalToPrivateB(bgm, bpm, kSizeN, idk); + #endif + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulate(cpm, apm, bpm); + } + } + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + } +} + +// ================================================================================================= +// The upper-triangular and lower-triangular kernels are only used in special cases +#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + +// Main entry point of the kernel. This is the upper-triangular version. +__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +__kernel void XgemmUpper(const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + + // Skip these threads if they do not contain threads contributing to the upper-triangle + if (get_group_id(1)*NWG < get_group_id(0)*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// Main entry point of the kernel. This is the lower-triangular version. +__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +__kernel void XgemmLower(const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + + // Skip these threads if they do not contain threads contributing to the lower-triangle + if (get_group_id(1)*NWG > get_group_id(0)*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// ================================================================================================= +// If not using a triangular version, include the regular kernel +#else + +// Main entry point of the kernel. This is the regular full version. +__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeM, alpha, beta); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= -- cgit v1.2.3