summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-09-12 22:13:16 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-09-12 22:13:16 +0200
commit4ce584a01404055fdb23f78b4ac359394b559ea1 (patch)
tree079e72c7f35e6330213f694a059982fc97e99aef /src
parent9fb7a0efe1687f7441c169be9a49a965fa84f493 (diff)
Split the XGEMM kernel further up: now in 3 parts. This is done because MSVC can't handle long strings
Diffstat (limited to 'src')
-rw-r--r--src/kernels/level3/xgemm_part1.opencl2
-rw-r--r--src/kernels/level3/xgemm_part2.opencl208
-rw-r--r--src/kernels/level3/xgemm_part3.opencl229
-rw-r--r--src/routines/level3/xgemm.cpp1
-rw-r--r--src/routines/level3/xher2k.cpp1
-rw-r--r--src/routines/level3/xherk.cpp1
-rw-r--r--src/routines/level3/xsyr2k.cpp1
-rw-r--r--src/routines/level3/xsyrk.cpp1
-rw-r--r--src/tuning/kernels/xgemm.cpp1
9 files changed, 237 insertions, 208 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 1ad0a558..d0ce06ad 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -31,7 +31,7 @@
// o-------o o-----o
//
//
-// This kernel is seperated into two files. This is part 1 out of 2.
+// This kernel is seperated into three files. This is part 1 out of 3.
//
// =================================================================================================
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index faf17e49..e8234a29 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
//
// =================================================================================================
@@ -227,212 +227,6 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
// =================================================================================================
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
- const __global realM* restrict agm, const __global realN* restrict bgm,
- __global realM* cgm, realM cpm[NWI][MWI/VWM]
- #if SA == 1 && SB == 1
- , __local realM* alm, __local realN* blm
- #elif SA == 1
- , __local realM* alm
- #elif SB == 1
- , __local realN* blm
- #endif
- ) {
-
- // Allocates workitem-private memory (registers)
- realM apm[MWI/VWM];
- realN bpm[NWI/VWN];
-
- // Combined thread identifier (volatile to disable caching)
- #if SA == 1 || SB == 1
- volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
- #endif
-
- // Initializes the accumulation registers
- InitAccRegisters(cpm);
-
- // Loops over all workgroup tiles
- for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
-
- // Loads data: off-chip --> local (matrix A)
- #if SA == 1
- GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
- #endif
- // Loads data: off-chip --> local (matrix B)
- #if SB == 1
- GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
- #endif
- #if SA == 1 || SB == 1
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
-
- // Loops over all workitem tiles, unrolled by a factor KWI
- for (int pwi=0; pwi<KWG; pwi+=KWI) {
- #pragma unroll
- for (int pit=0; pit<KWI; ++pit) {
- #if SA == 0 || SB == 0
- int idk = kwg + pwi + pit;
- #endif
- #if SA == 1 || SB == 1
- int kg = pwi+pit;
- #endif
-
- // Loads data: local --> private (matrix A)
- #if SA == 1
- LocalToPrivateA(alm, apm, kg);
- // Loads data: off-chip --> private (matrix A)
- #else
- GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
- #endif
-
- // Loads data: local --> private (matrix B)
- #if SB == 1
- LocalToPrivateB(blm, bpm, kg);
- // Loads data: off-chip --> private (matrix B)
- #else
- GlobalToPrivateB(bgm, bpm, kSizeN, idk);
- #endif
-
- // Performs the accumulation (Cpm += Apm * Bpm)
- MultiplyAccumulate(cpm, apm, bpm);
- }
- }
- #if SA == 1 || SB == 1
- barrier(CLK_LOCAL_MEM_FENCE);
- #endif
- }
- #if GLOBAL_MEM_FENCE == 1
- barrier(CLK_GLOBAL_MEM_FENCE);
- #endif
-}
-
-// =================================================================================================
-// The upper-triangular and lower-triangular kernels are only used in special cases
-#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
-
-// Main entry point of the kernel. This is the upper-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-void XgemmUpper(const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = GetRealArg(arg_alpha);
- const real beta = GetRealArg(arg_beta);
-
- // Skip these threads if they do not contain threads contributing to the upper-triangle
- if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
- return;
- }
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// Main entry point of the kernel. This is the lower-triangular version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-void XgemmLower(const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = GetRealArg(arg_alpha);
- const real beta = GetRealArg(arg_beta);
-
- // Skip these threads if they do not contain threads contributing to the lower-triangle
- if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
- return;
- }
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// =================================================================================================
-// If not using a triangular version, include the regular kernel
-#else
-
-// Main entry point of the kernel. This is the regular full version.
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realM* restrict agm,
- const __global realN* restrict bgm,
- __global realM* cgm) {
- const real alpha = GetRealArg(arg_alpha);
- const real beta = GetRealArg(arg_beta);
-
- // Allocates workgroup-private memory (local memory)
- #if SA == 1
- __local realM alm[KWG * MWG/VWM];
- #endif
- #if SB == 1
- __local realN blm[KWG * NWG/VWN];
- #endif
-
- // Computes the matrix-multiplication and stores the result in register memory
- realM cpm[NWI][MWI/VWM];
- #if SA == 1 && SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
- #elif SA == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
- #elif SB == 1
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
- #else
- XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
- #endif
-
- // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
- StoreResults(cgm, cpm, kSizeM, alpha, beta);
-}
-
-#endif
-// =================================================================================================
-
// End of the C++11 raw string literal
)"
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
new file mode 100644
index 00000000..a5faef5a
--- /dev/null
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -0,0 +1,229 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+ const __global realM* restrict agm, const __global realN* restrict bgm,
+ __global realM* cgm, realM cpm[NWI][MWI/VWM]
+ #if SA == 1 && SB == 1
+ , __local realM* alm, __local realN* blm
+ #elif SA == 1
+ , __local realM* alm
+ #elif SB == 1
+ , __local realN* blm
+ #endif
+ ) {
+
+ // Allocates workitem-private memory (registers)
+ realM apm[MWI/VWM];
+ realN bpm[NWI/VWN];
+
+ // Combined thread identifier (volatile to disable caching)
+ #if SA == 1 || SB == 1
+ volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+ #endif
+
+ // Initializes the accumulation registers
+ InitAccRegisters(cpm);
+
+ // Loops over all workgroup tiles
+ for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+ // Loads data: off-chip --> local (matrix A)
+ #if SA == 1
+ GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+ #endif
+ // Loads data: off-chip --> local (matrix B)
+ #if SB == 1
+ GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+ #endif
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+
+ // Loops over all workitem tiles, unrolled by a factor KWI
+ for (int pwi=0; pwi<KWG; pwi+=KWI) {
+ #pragma unroll
+ for (int pit=0; pit<KWI; ++pit) {
+ #if SA == 0 || SB == 0
+ int idk = kwg + pwi + pit;
+ #endif
+ #if SA == 1 || SB == 1
+ int kg = pwi+pit;
+ #endif
+
+ // Loads data: local --> private (matrix A)
+ #if SA == 1
+ LocalToPrivateA(alm, apm, kg);
+ // Loads data: off-chip --> private (matrix A)
+ #else
+ GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+ #endif
+
+ // Loads data: local --> private (matrix B)
+ #if SB == 1
+ LocalToPrivateB(blm, bpm, kg);
+ // Loads data: off-chip --> private (matrix B)
+ #else
+ GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+ #endif
+
+ // Performs the accumulation (Cpm += Apm * Bpm)
+ MultiplyAccumulate(cpm, apm, bpm);
+ }
+ }
+ #if SA == 1 || SB == 1
+ barrier(CLK_LOCAL_MEM_FENCE);
+ #endif
+ }
+ #if GLOBAL_MEM_FENCE == 1
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ #endif
+}
+
+// =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+
+// Main entry point of the kernel. This is the upper-triangular version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmUpper(const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Skip these threads if they do not contain threads contributing to the upper-triangle
+ if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
+ return;
+ }
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// Main entry point of the kernel. This is the lower-triangular version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmLower(const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Skip these threads if they do not contain threads contributing to the lower-triangle
+ if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
+ return;
+ }
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
+
+// Main entry point of the kernel. This is the regular full version.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realM* restrict agm,
+ const __global realN* restrict bgm,
+ __global realM* cgm) {
+ const real alpha = GetRealArg(arg_alpha);
+ const real beta = GetRealArg(arg_beta);
+
+ // Allocates workgroup-private memory (local memory)
+ #if SA == 1
+ __local realM alm[KWG * MWG/VWM];
+ #endif
+ #if SB == 1
+ __local realN blm[KWG * NWG/VWN];
+ #endif
+
+ // Computes the matrix-multiplication and stores the result in register memory
+ realM cpm[NWI][MWI/VWM];
+ #if SA == 1 && SB == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+ #elif SA == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+ #elif SB == 1
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+ #else
+ XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+ #endif
+
+ // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+ StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index fce59622..0b8e768f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -34,6 +34,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/convert_hermitian.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ #include "../../kernels/level3/xgemm_part3.opencl"
;
}
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index 1ba6080f..ba770065 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -31,6 +31,7 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ #include "../../kernels/level3/xgemm_part3.opencl"
;
}
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 0fa1b7b1..3063f3bc 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -31,6 +31,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ #include "../../kernels/level3/xgemm_part3.opencl"
;
}
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index 5a90a5a2..158cd9e5 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -31,6 +31,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ #include "../../kernels/level3/xgemm_part3.opencl"
;
}
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index 46b96b76..e1a72ef6 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -31,6 +31,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
#include "../../kernels/level3/transpose_pad.opencl"
#include "../../kernels/level3/xgemm_part1.opencl"
#include "../../kernels/level3/xgemm_part2.opencl"
+ #include "../../kernels/level3/xgemm_part3.opencl"
;
}
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 7c9ac76a..4cb7fd00 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -35,6 +35,7 @@ class TuneXgemm {
#include "../src/kernels/common.opencl"
#include "../src/kernels/level3/xgemm_part1.opencl"
#include "../src/kernels/level3/xgemm_part2.opencl"
+ #include "../src/kernels/level3/xgemm_part3.opencl"
;
}