summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-07-19 16:36:12 +0200
committerCNugteren <web@cedricnugteren.nl>2015-07-19 16:36:12 +0200
commita0f0f6c8ceac5e0dac9b4460708c01f6b72467c4 (patch)
treeeb11393e55a4815e5d04a884880163859cfba3ea /src
parent48e2e96f1ba44e9b12a2449390bbbbd5e02777a3 (diff)
Triangular GEMM kernels are only compiled when needed
Diffstat (limited to 'src')
-rw-r--r--src/kernels/xgemm.opencl69
1 files changed, 36 insertions, 33 deletions
diff --git a/src/kernels/xgemm.opencl b/src/kernels/xgemm.opencl
index b689fa1e..8db0f557 100644
--- a/src/kernels/xgemm.opencl
+++ b/src/kernels/xgemm.opencl
@@ -561,14 +561,21 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
}
// =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
-// Main entry point of the kernel. This is the regular full version.
+// Main entry point of the kernel. This is the upper-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
- const real alpha, const real beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
+__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+ const real alpha, const real beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+
+ // Skip these threads if they do not contain threads contributing to the upper-triangle
+ if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
+ return;
+ }
// Allocates workgroup-private memory (local memory)
#if SA == 1
@@ -581,31 +588,29 @@ __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeM, alpha, beta);
+ StoreResults(cgm, cpm, kSizeN, alpha, beta);
}
-// =================================================================================================
-
-// Main entry point of the kernel. This is the upper-triangular version.
+// Main entry point of the kernel. This is the lower-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
+__kernel void XgemmLower(const int kSizeN, const int kSizeK,
const real alpha, const real beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
- // Skip these threads if they do not contain threads contributing to the upper-triangle
- if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
+ // Skip these threads if they do not contain threads contributing to the lower-triangle
+ if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
return;
}
@@ -634,19 +639,16 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
}
// =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
-// Main entry point of the kernel. This is the lower-triangular version.
+// Main entry point of the kernel. This is the regular full version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
- const real alpha, const real beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
-
- // Skip these threads if they do not contain threads contributing to the lower-triangle
- if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
- return;
- }
+__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real alpha, const real beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
// Allocates workgroup-private memory (local memory)
#if SA == 1
@@ -659,19 +661,20 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
// Computes the matrix-multiplication and stores the result in register memory
realM cpm[NWI][MWI/VWM];
#if SA == 1 && SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
#elif SA == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
#elif SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
#else
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
#endif
// Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeN, alpha, beta);
+ StoreResults(cgm, cpm, kSizeM, alpha, beta);
}
+#endif
// =================================================================================================
// End of the C++11 raw string literal