summaryrefslogtreecommitdiff
path: root/src/kernels
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2018-04-24 21:32:42 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2018-04-24 21:32:42 +0200
commit 2965b87dda80ca22bf12527755ef9f3cb5fed46b (patch)
tree f8ccc7a509eb36fd96908ba53060baa326238f8c /src/kernels
parent 2b1e0295e6dba8d8d9f85ca65b6232a89e6cceae (diff)
Added Intel subgroup shuffle support to the 2D register caching GEMM kernel
Diffstat (limited to 'src/kernels')
-rw-r--r-- src/kernels/level3/xgemm_part1.opencl 12
-rw-r--r-- src/kernels/level3/xgemm_part3.opencl 34
2 files changed, 38 insertions, 8 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index d15dafc8..99d64c91 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,6 +114,18 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance
#endif
+// Intel subgroup shuffling configuration (extension spec: https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#ifndef USE_SUBGROUP_SHUFFLING
+ #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs; defaults to off unless defined before this point
+#endif
+#if USE_SUBGROUP_SHUFFLING == 1
+ #define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
+#endif
+#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE // NOTE(review): an undefined SUBGROUP_SIZE evaluates as 0 in this test - confirm it is not defined elsewhere
+ #undef USE_SUBGROUP_SHUFFLING // Drops any earlier value so the redefinition below does not clash
+ #define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions (NWI equals SUBGROUP_SIZE, MDIMC at least SUBGROUP_SIZE) don't hold
+#endif
+
// =================================================================================================
// Data-widths in dimension M
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index c25c3001..c3920cb5 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -37,8 +37,13 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#pragma promote_to_registers
realN bpm[NWI/VWN]; // 1 * NWI
#elif GEMMK == 1
- #pragma promote_to_registers
- realN apm[NWI*(KREG/VWN)]; // NWI * KREG
+ #if USE_SUBGROUP_SHUFFLING == 1
+ #pragma promote_to_registers
+ realN apm[KREG/VWN]; // KREG (subgroup shuffling in NWI dimension)
+ #else
+ #pragma promote_to_registers
+ realN apm[NWI*(KREG/VWN)]; // NWI * KREG
+ #endif
#pragma promote_to_registers
realM bpm[KREG*(MWI/VWM)]; // KREG * MWI
#endif
@@ -123,14 +128,23 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#endif
}
#elif GEMMK == 1
- // Loads data: 2D global --> 2D private (matrix A)
- #pragma unroll
- for (int _ni = 0; _ni < NWI; _ni += 1) {
+ // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups
+ #if USE_SUBGROUP_SHUFFLING == 1
+ const int _ni = get_sub_group_local_id();
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
- apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
+ apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
}
- }
+ // Loads data: 2D global --> 2D private (matrix A)
+ #else
+ #pragma unroll
+ for (int _ni = 0; _ni < NWI; _ni += 1) {
+ #pragma unroll
+ for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
+ apm[_ni * (KREG/VWN) + _ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
+ }
+ }
+ #endif
#endif
// Performs the accumulation (Cpm += Apm * Bpm)
@@ -187,7 +201,11 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
const int index = _ni * (MWI/VWM) + _mi;
- const realN aval = apm[_ni * (KREG/VWN) + _ki];
+ #if USE_SUBGROUP_SHUFFLING == 1
+ const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
+ #else
+ const realN aval = apm[_ni * (KREG/VWN) + _ki];
+ #endif
#if VWN == 1
cpm[index] = MultiplyAddVector(cpm[index], bpm[(VWN * _ki + 0) * (MWI/VWM) + _mi], aval);
#elif VWN == 2