From b4d3a50f1952f532383f429b706269b49559407c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 10 Dec 2017 16:09:09 +0100 Subject: Split GEMM kernel in 4 files instead of 3 due to MSVC 2013 string length limit --- src/kernels/level3/xgemm_part1.opencl | 2 +- src/kernels/level3/xgemm_part2.opencl | 2 +- src/kernels/level3/xgemm_part3.opencl | 119 +---------------------------- src/kernels/level3/xgemm_part4.opencl | 140 ++++++++++++++++++++++++++++++++++ src/routines/level3/xgemm.cpp | 1 + src/routines/level3/xher2k.cpp | 1 + src/routines/level3/xherk.cpp | 1 + src/routines/level3/xsyr2k.cpp | 1 + src/routines/level3/xsyrk.cpp | 1 + src/routines/levelx/xgemmbatched.cpp | 1 + src/tuning/kernels/xgemm.cpp | 1 + 11 files changed, 150 insertions(+), 120 deletions(-) create mode 100644 src/kernels/level3/xgemm_part4.opencl (limited to 'src') diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 053eb721..4e1c3e61 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -31,7 +31,7 @@ // o-------o o-----o // // -// This kernel is seperated into three files. This is part 1 out of 3. +// This kernel is separated into three files. This is part 1 out of 4. // // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 14a0493a..1c7a940b 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This is part 2 of 3 of the GEMM kernel. See part 1 for more information. +// This is part 2 of 4 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index 157c1326..1483c26e 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This is part 3 of 3 of the GEMM kernel. See part 1 for more information. +// This is part 3 of 4 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= @@ -161,123 +161,6 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, StoreResults(cgm, cpm, kSizeM, alpha, beta); } -// ================================================================================================= -// The upper-triangular and lower-triangular kernels are only used in special cases -#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) - -// Main entry point of the kernel. This is the upper-triangular version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -void XgemmUpper(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Skip these threads if they do not contain threads contributing to the upper-triangle - if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in global memory - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); - #endif -} - -// Main entry point of the kernel. This is the lower-triangular version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -void XgemmLower(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Skip these threads if they do not contain threads contributing to the lower-triangle - if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in global memory - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); - #endif -} - -// ================================================================================================= -// If not using a triangular version, include the regular kernel -#else - -// Main entry point of the kernel. This is the regular full version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm, - const int b_offset, const int c_offset) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Adds the offsets (in case of use of a single temporary buffer for A, B, and C) - bgm = &bgm[b_offset]; - cgm = &cgm[c_offset]; - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in global memory - #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); - #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); - #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); - #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); - #endif -} - -#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/level3/xgemm_part4.opencl b/src/kernels/level3/xgemm_part4.opencl new file mode 100644 index 00000000..e581cd84 --- /dev/null +++ b/src/kernels/level3/xgemm_part4.opencl @@ -0,0 +1,140 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This is part 4 of 4 of the GEMM kernel. See part 1 for more information. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +// The upper-triangular and lower-triangular kernels are only used in special cases +#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + +// Main entry point of the kernel. This is the upper-triangular version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmUpper(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the upper-triangle + if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in global memory + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); + #endif +} + +// Main entry point of the kernel. This is the lower-triangular version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmLower(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the lower-triangle + if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in global memory + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); + #endif +} + +// ================================================================================================= +// If not using a triangular version, include the regular kernel +#else + +// Main entry point of the kernel. This is the regular full version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm, + const int b_offset, const int c_offset) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Adds the offsets (in case of use of a single temporary buffer for A, B, and C) + bgm = &bgm[b_offset]; + cgm = &cgm[c_offset]; + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in global memory + #if SA == 1 && SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm, blm); + #elif SA == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, alm); + #elif SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta, blm); + #else + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, alpha, beta); + #endif +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 94392dd0..edba1f00 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -41,6 +41,7 @@ Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_part4.opencl" }) { } diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index 2aed2781..3010dea8 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -31,6 +31,7 @@ Xher2k::Xher2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_part4.opencl" }) { } diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index d982859e..38a8420f 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -31,6 +31,7 @@ Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_part4.opencl" }) { } diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 7900eb74..5ff4769d 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -31,6 +31,7 @@ Xsyr2k::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_part4.opencl" }) { } diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 9588c28c..79e3a4ab 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -31,6 +31,7 @@ Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_part4.opencl" }) { } diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 152e7194..8ce2dedc 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -38,6 +38,7 @@ XgemmBatched::XgemmBatched(Queue &queue, EventPointer event, const std::strin #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" #include "../../kernels/level3/xgemm_part3.opencl" + #include "../../kernels/level3/xgemm_part4.opencl" , // separated in multiple parts to prevent C1091 in MSVC 2013 #include "../../kernels/level3/xgemm_batched.opencl" #include "../../kernels/level3/xgemm_direct_batched.opencl" diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 16e32988..d38ce077 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -52,6 +52,7 @@ class TuneXgemm { #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" #include "../src/kernels/level3/xgemm_part3.opencl" +#include "../src/kernels/level3/xgemm_part4.opencl" ; // Buffer sizes -- cgit v1.2.3