From a0f0f6c8ceac5e0dac9b4460708c01f6b72467c4 Mon Sep 17 00:00:00 2001
From: CNugteren <web@cedricnugteren.nl>
Date: Sun, 19 Jul 2015 16:36:12 +0200
Subject: Triangular GEMM kernels are only compiled when needed

---
 src/kernels/xgemm.opencl | 69 +++++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 33 deletions(-)

(limited to 'src')

diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
index b689fa1e..8db0f557 100644
--- a/src/kernels/xgemm.opencl
+++ b/src/kernels/xgemm.opencl
@@ -561,14 +561,21 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
 }
 
 // =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
 
-// Main entry point of the kernel. This is the regular full version.
+// Main entry point of the kernel. This is the upper-triangular version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const real alpha, const real beta,
-                    const __global realM* restrict agm,
-                    const __global realN* restrict bgm,
-                    __global realM* cgm) {
+__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+                         const real alpha, const real beta,
+                         const __global realM* restrict agm,
+                         const __global realN* restrict bgm,
+                         __global realM* cgm) {
+
+  // Skip these threads if they do not contain threads contributing to the upper-triangle
+  if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
+    return;
+  }
 
   // Allocates workgroup-private memory (local memory)
   #if SA == 1
@@ -581,31 +588,29 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
   // Computes the matrix-multiplication and stores the result in register memory
   realM cpm[NWI][MWI/VWM];
   #if SA == 1 && SB == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
   #elif SA == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
   #elif SB == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
   #else
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
   #endif
 
   // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeM, alpha, beta);
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
 }
 
-// =================================================================================================
-
-// Main entry point of the kernel. This is the upper-triangular version.
+// Main entry point of the kernel. This is the lower-triangular version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+__kernel void XgemmLower(const int kSizeN, const int kSizeK,
                          const real alpha, const real beta,
                          const __global realM* restrict agm,
                          const __global realN* restrict bgm,
                          __global realM* cgm) {
 
-  // Skip these threads if they do not contain threads contributing to the upper-triangle
-  if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
+  // Skip these threads if they do not contain threads contributing to the lower-triangle
+  if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
     return;
   }
 
@@ -634,19 +639,16 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
 }
 
 // =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
 
-// Main entry point of the kernel. This is the lower-triangular version.
+// Main entry point of the kernel. This is the regular full version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
-                         const real alpha, const real beta,
-                         const __global realM* restrict agm,
-                         const __global realN* restrict bgm,
-                         __global realM* cgm) {
-
-  // Skip these threads if they do not contain threads contributing to the lower-triangle
-  if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
-    return;
-  }
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+                    const real alpha, const real beta,
+                    const __global realM* restrict agm,
+                    const __global realN* restrict bgm,
+                    __global realM* cgm) {
 
   // Allocates workgroup-private memory (local memory)
   #if SA == 1
@@ -659,19 +661,20 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
   // Computes the matrix-multiplication and stores the result in register memory
   realM cpm[NWI][MWI/VWM];
   #if SA == 1 && SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
   #elif SA == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
   #elif SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
   #else
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
   #endif
 
   // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+  StoreResults(cgm, cpm, kSizeM, alpha, beta);
 }
 
+#endif
 // =================================================================================================
 
 // End of the C++11 raw string literal
-- 
cgit v1.2.3