author    Cedric Nugteren <web@cedricnugteren.nl>  2018-07-29 10:26:34 +0200
committer Cedric Nugteren <web@cedricnugteren.nl>  2018-07-29 10:26:34 +0200
commit    5903820ba2736ec4a5f338c2325f59f511b4a0e8 (patch)
tree      08eaca8cde2e45682807dcfa6db8252ee520d1bf /src/kernels
parent    1c9a74147073234da953b84f0bbafefbcf5ffb4f (diff)
parent    dda1e567f872d3d89f2f7cd890fb5b29ff98537c (diff)
Merge branch 'master' into CLBlast-267-convgemm
Diffstat (limited to 'src/kernels')
-rw-r--r--  src/kernels/common.opencl              |  2
-rw-r--r--  src/kernels/level3/xgemm_part1.opencl  | 20
-rw-r--r--  src/kernels/level3/xgemm_part3.opencl  | 46
3 files changed, 61 insertions(+), 7 deletions(-)
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 4a476a8b..0ad38919 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -260,7 +260,7 @@ R"(
// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formulas are taken from:
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
// More details: https://github.com/CNugteren/CLBlast/issues/53
-#if USE_STAGGERED_INDICES == 1
+#if USE_STAGGERED_INDICES == 1 && GEMMK == 0
INLINE_FUNC int GetGroupIDFlat() {
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
}
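
The staggered-indices comment above refers to a diagonal remapping of the 2D work-group index, so that neighbouring work-groups touch different memory partitions. For context, a sketch of what the companion helpers GetGroupID0() and GetGroupID1() (used further down in this diff, but not shown in this hunk) could look like under that scheme; the exact formulas are illustrative only, following the diagonal reordering in the referenced transpose document:

INLINE_FUNC int GetGroupID1() {
  // Diagonal remapping: the y-index cycles through the flat group ID
  return GetGroupIDFlat() % get_num_groups(1);
}
INLINE_FUNC int GetGroupID0() {
  // The x-index is shifted by the y-index, staggering accesses across partitions
  return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
}
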
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 99d64c91..3cfc5dfb 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potentially better performance
#endif
-// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
+#endif
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
+#endif
+#ifndef SUBGROUP_SHUFFLING_INTEL
+ #define SUBGROUP_SHUFFLING_INTEL 0
+#endif
#ifndef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
#endif
-#if USE_SUBGROUP_SHUFFLING == 1
+
+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
#endif
+
+// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
+ #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
+#endif
+
#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
#undef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold
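
The SUBGROUP_SHUFFLING_* switches above are plain compile-time defines. As a hypothetical host-side sketch (not taken from this diff or from CLBlast's host code), such flags could be passed as build options through the standard OpenCL C API:

/* Hypothetical host-side sketch: 'program' and 'device' are assumed to exist already. */
const char* build_options =
    "-DUSE_SUBGROUP_SHUFFLING=1 "
    "-DSUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA=1";  /* or -DSUBGROUP_SHUFFLING_INTEL=1 */
cl_int status = clBuildProgram(program, 1, &device, build_options, NULL, NULL);
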
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index c3920cb5..90de0b3b 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,6 +17,44 @@ R"(
// =================================================================================================
+// A common interface for subgroup functions
+
+#if USE_SUBGROUP_SHUFFLING == 1
+
+INLINE_FUNC int clblast_get_sub_group_local_id() {
+
+ // Intel extension
+ #if SUBGROUP_SHUFFLING_INTEL == 1
+ return get_sub_group_local_id();
+
+ // Nvidia inline PTX
+ #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
+ int ret;
+ asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
+ return ret;
+ #endif
+}
+
+INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
+
+ // Intel extension
+ #if SUBGROUP_SHUFFLING_INTEL == 1
+ return intel_sub_group_shuffle(reg, src);
+
+ // Nvidia inline PTX
+ // Volta and later requires .sync shuffle instructions with an extra mask arg
+ #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
+ realN ret;
+ #if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
+ asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
+ #else
+ asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
+ #endif
+ return ret;
+ #endif
+}
+#endif
+
// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
@@ -53,8 +91,8 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#if GEMMK == 1
const __global real* restrict a_ptr = (const __global real* restrict) &agm[0];
const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0];
- const int tid_x = get_global_id(0);
- const int tid_y = get_global_id(1);
+ const int tid_x = get_local_id(0) + MDIMC * GetGroupID0();
+ const int tid_y = get_local_id(1) + NDIMC * GetGroupID1();
#endif
// Combined thread identifier (volatile to disable caching)
@@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#elif GEMMK == 1
// Loads data: 2D global --> 2D private (matrix A). Partly shuffled later among subgroups
#if USE_SUBGROUP_SHUFFLING == 1
- const int _ni = get_sub_group_local_id();
+ const int _ni = clblast_get_sub_group_local_id();
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
@@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
const int index = _ni * (MWI/VWM) + _mi;
#if USE_SUBGROUP_SHUFFLING == 1
- const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
+ const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
#else
const realN aval = apm[_ni * (KREG/VWN) + _ki];
#endif
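
Taken together, clblast_get_sub_group_local_id() and clblast_sub_group_shuffle() let a work-item read a value held in another lane's registers without a round trip through local memory, on both Intel (extension) and NVIDIA (inline PTX) hardware. A minimal, hypothetical kernel (not part of CLBlast) illustrating the same pattern in isolation, assuming the definitions from the kernel headers above are in scope:

#if USE_SUBGROUP_SHUFFLING == 1
// Hypothetical illustration: broadcast the value held by lane 0 of each subgroup
__kernel void BroadcastLane0Example(const __global realN* restrict in,
                                    __global realN* restrict out) {
  const int gid = get_global_id(0);
  const realN mine = in[gid];                              // one value per work-item
  const realN lane0 = clblast_sub_group_shuffle(mine, 0);  // read lane 0's register
  out[gid] = lane0;
}
#endif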