summaryrefslogtreecommitdiff
path: root/src/kernels/level3/xgemm_part1.opencl
diff options
context:
space:
mode:
Diffstat (limited to 'src/kernels/level3/xgemm_part1.opencl')
-rw-r--r--src/kernels/level3/xgemm_part1.opencl20
1 files changed, 18 insertions, 2 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 99d64c91..3cfc5dfb 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance
#endif
-// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
+#endif
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
+#endif
+#ifndef SUBGROUP_SHUFFLING_INTEL
+ #define SUBGROUP_SHUFFLING_INTEL 0
+#endif
#ifndef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
#endif
-#if USE_SUBGROUP_SHUFFLING == 1
+
+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
#endif
+
+// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
+ #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
+#endif
+
#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
#undef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold