From a0f0f6c8ceac5e0dac9b4460708c01f6b72467c4 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sun, 19 Jul 2015 16:36:12 +0200 Subject: Triangular GEMM kernels are only compiled when needed --- src/kernels/xgemm.opencl | 69 +++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl index b689fa1e..8db0f557 100644 --- a/src/kernels/xgemm.opencl +++ b/src/kernels/xgemm.opencl @@ -561,14 +561,21 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, } // ================================================================================================= +// The upper-triangular and lower-triangular kernels are only used in special cases +#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) -// Main entry point of the kernel. This is the regular full version. +// Main entry point of the kernel. This is the upper-triangular version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { +__kernel void XgemmUpper(const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + + // Skip these threads if they do not contain threads contributing to the upper-triangle + if (get_group_id(1)*NWG < get_group_id(0)*MWG) { + return; + } // Allocates workgroup-private memory (local memory) #if SA == 1 @@ -581,31 +588,29 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, // Computes the matrix-multiplication and stores the result in register memory realM cpm[NWI][MWI/VWM]; #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); #endif // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeM, alpha, beta); + StoreResults(cgm, cpm, kSizeN, alpha, beta); } -// ================================================================================================= - -// Main entry point of the kernel. This is the upper-triangular version. +// Main entry point of the kernel. This is the lower-triangular version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmUpper(const int kSizeN, const int kSizeK, +__kernel void XgemmLower(const int kSizeN, const int kSizeK, const real alpha, const real beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { - // Skip these threads if they do not contain threads contributing to the upper-triangle - if (get_group_id(1)*NWG < get_group_id(0)*MWG) { + // Skip these threads if they do not contain threads contributing to the lower-triangle + if (get_group_id(1)*NWG > get_group_id(0)*MWG) { return; } @@ -634,19 +639,16 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK, } // ================================================================================================= +// If not using a triangular version, include the regular kernel +#else -// Main entry point of the kernel. This is the lower-triangular version. +// Main entry point of the kernel. This is the regular full version. __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmLower(const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - - // Skip these threads if they do not contain threads contributing to the lower-triangle - if (get_group_id(1)*NWG > get_group_id(0)*MWG) { - return; - } +__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { // Allocates workgroup-private memory (local memory) #if SA == 1 @@ -659,19 +661,20 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK, // Computes the matrix-multiplication and stores the result in register memory realM cpm[NWI][MWI/VWM]; #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); #endif // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); + StoreResults(cgm, cpm, kSizeM, alpha, beta); } +#endif // ================================================================================================= // End of the C++11 raw string literal -- cgit v1.2.3