From 61f489e370c56075e166caff6d1ad671ca6787b9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 2 Oct 2016 15:06:59 +0200 Subject: Split the GEMM direct kernel into two files; set the default tuning target to 256-256-256 --- src/kernels/level3/xgemm_direct.opencl | 470 --------------------------- src/kernels/level3/xgemm_direct_part1.opencl | 294 +++++++++++++++++ src/kernels/level3/xgemm_direct_part2.opencl | 207 ++++++++++++ src/routines/level3/xgemm.cpp | 3 +- src/tuning/kernels/xgemm_direct.cpp | 9 +- 5 files changed, 508 insertions(+), 475 deletions(-) delete mode 100644 src/kernels/level3/xgemm_direct.opencl create mode 100644 src/kernels/level3/xgemm_direct_part1.opencl create mode 100644 src/kernels/level3/xgemm_direct_part2.opencl diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl deleted file mode 100644 index 75618e8c..00000000 --- a/src/kernels/level3/xgemm_direct.opencl +++ /dev/null @@ -1,470 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any -// pre and and post-processing kernels. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. Note that all parameters here have a -// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel. -#ifndef WGD - #define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64) -#endif -#ifndef MDIMCD - #define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) -#endif -#ifndef NDIMCD - #define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) -#endif -#ifndef MDIMAD - #define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD -#endif -#ifndef NDIMBD - #define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD -#endif -#ifndef KWID - #define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD) -#endif -#ifndef VWMD - #define VWMD 1 // Vector width of matrices A and C -#endif -#ifndef VWND - #define VWND 1 // Vector width of matrix B -#endif -#ifndef PADA - #define PADA 1 // Local memory padding for matrix A -#endif -#ifndef PADB - #define PADB 1 // Local memory padding for matrix B -#endif - -// Helper parameters based on the above tuning parameters -#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension) -#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension) -#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD -#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD -#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension) -#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension) -#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension) -#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension) - -// ================================================================================================= - -// Data-widths in dimension M -#if VWMD == 1 - typedef real realMD; -#elif VWMD == 2 - typedef real2 realMD; -#elif VWMD == 4 - typedef real4 realMD; -#elif VWMD == 8 - typedef real8 realMD; -#elif VWMD == 16 - typedef real16 realMD; -#endif - -// Data-widths in dimension N -#if VWND == 1 - typedef real realND; -#elif VWND == 2 - typedef real2 realND; -#elif VWND == 4 - typedef real4 realND; -#elif VWND == 8 - typedef real8 realND; -#elif VWND == 16 - typedef real16 realND; -#endif - -// ================================================================================================= - -// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for -// caching the A input matrix. -inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, - const int a_ld, const int a_offset, const int kwg, - const int a_transpose, const int a_conjugate) { - #if MDIMCD == MDIMAD - const int la0 = get_local_id(0); - const int la1 = get_local_id(1); - #else - const int tid = get_local_id(0) + MDIMCD*get_local_id(1); - const int la0 = tid % MDIMAD; - const int la1 = tid / MDIMAD; - #endif - #pragma unroll - for (int mia=0; mia local (matrix A and B) - GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); - GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); - barrier(CLK_LOCAL_MEM_FENCE); - - // Loops over all workitem tiles, unrolled by a factor KWID - for (int pwi=0; pwi private (matrix A) - LocalToPrivateDirectA(alm, apm, kg, a_transpose); - - // Loads data: local --> private (matrix B) - LocalToPrivateDirectB(blm, bpm, kg, b_transpose); - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulateDirect(cpm, apm, bpm); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - // Loop over the remaining part (incomplete tile in K-dimension) - for (; kwg < kSizeK; ++kwg) { - const int idk = kwg; - - // Loads A into register memory - #pragma unroll - for (int mi=0; mi +// +// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any +// pre and and post-processing kernels. +// +// This kernel is seperated into three files. This is part 1 out of 2. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. Note that all parameters here have a +// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel. +#ifndef WGD + #define WGD 8 // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64) +#endif +#ifndef MDIMCD + #define MDIMCD 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) +#endif +#ifndef NDIMCD + #define NDIMCD 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) +#endif +#ifndef MDIMAD + #define MDIMAD 8 // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD +#endif +#ifndef NDIMBD + #define NDIMBD 8 // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD +#endif +#ifndef KWID + #define KWID 1 // Unroll factor of the WGD loop (smaller or equal than WGD) +#endif +#ifndef VWMD + #define VWMD 1 // Vector width of matrices A and C +#endif +#ifndef VWND + #define VWND 1 // Vector width of matrix B +#endif +#ifndef PADA + #define PADA 1 // Local memory padding for matrix A +#endif +#ifndef PADB + #define PADB 1 // Local memory padding for matrix B +#endif + +// Helper parameters based on the above tuning parameters +#define MWID (WGD/MDIMCD) // Work per work-item (M-dimension) +#define NWID (WGD/NDIMCD) // Work per work-item (N-dimension) +#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD +#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD +#define MWAD (WGD/MDIMAD) // Amount of loads-per-thread for matrix A (M-dimension) +#define KWAD (WGD/KDIMAD) // Amount of loads-per-thread for matrix A (K-dimension) +#define KWBD (WGD/KDIMBD) // Amount of loads-per-thread for matrix B (K-dimension) +#define NWBD (WGD/NDIMBD) // Amount of loads-per-thread for matrix B (N-dimension) + +// ================================================================================================= + +// Data-widths in dimension M +#if VWMD == 1 + typedef real realMD; +#elif VWMD == 2 + typedef real2 realMD; +#elif VWMD == 4 + typedef real4 realMD; +#elif VWMD == 8 + typedef real8 realMD; +#elif VWMD == 16 + typedef real16 realMD; +#endif + +// Data-widths in dimension N +#if VWND == 1 + typedef real realND; +#elif VWND == 2 + typedef real2 realND; +#elif VWND == 4 + typedef real4 realND; +#elif VWND == 8 + typedef real8 realND; +#elif VWND == 16 + typedef real16 realND; +#endif + +// ================================================================================================= + +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. +inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm, + const int a_ld, const int a_offset, const int kwg, + const int a_transpose, const int a_conjugate) { + #if MDIMCD == MDIMAD + const int la0 = get_local_id(0); + const int la1 = get_local_id(1); + #else + const int tid = get_local_id(0) + MDIMCD*get_local_id(1); + const int la0 = tid % MDIMAD; + const int la1 = tid / MDIMAD; + #endif + #pragma unroll + for (int mia=0; mia +// +// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication +// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm +inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID], + const int kSizeM, const int kSizeN, + const real alpha, const real beta, + const int c_ld, const int c_offset, const int c_transpose) { + #pragma unroll + for (int ni=0; ni local (matrix A and B) + GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate); + GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate); + barrier(CLK_LOCAL_MEM_FENCE); + + // Loops over all workitem tiles, unrolled by a factor KWID + for (int pwi=0; pwi private (matrix A) + LocalToPrivateDirectA(alm, apm, kg, a_transpose); + + // Loads data: local --> private (matrix B) + LocalToPrivateDirectB(blm, bpm, kg, b_transpose); + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulateDirect(cpm, apm, bpm); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + // Loop over the remaining part (incomplete tile in K-dimension) + for (; kwg < kSizeK; ++kwg) { + const int idk = kwg; + + // Loads A into register memory + #pragma unroll + for (int mi=0; mi::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" - #include "../../kernels/level3/xgemm_direct.opencl" + #include "../../kernels/level3/xgemm_direct_part1.opencl" + #include "../../kernels/level3/xgemm_direct_part2.opencl" ; } diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 6ab6d1f0..c3864348 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -33,7 +33,8 @@ class TuneXgemmDirect { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_direct.opencl" + #include "../src/kernels/level3/xgemm_direct_part1.opencl" + #include "../src/kernels/level3/xgemm_direct_part2.opencl" ; } @@ -46,9 +47,9 @@ class TuneXgemmDirect { static void TestValidArguments(const Arguments &) { } // Sets the default values for the arguments - static size_t DefaultM() { return 128; } - static size_t DefaultN() { return 128; } - static size_t DefaultK() { return 128; } + static size_t DefaultM() { return 256; } + static size_t DefaultN() { return 256; } + static size_t DefaultK() { return 256; } static double DefaultFraction() { return (V==1) ? 1.0 : 16.0; } // test all or sample randomly static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging -- cgit v1.2.3