summaryrefslogtreecommitdiff
path: root/src/kernels/level3/xgemm_part1.opencl
diff options
context:
space:
mode:
authorTyler Sorensen <tylersorensen3221@hotmail.com>2018-07-11 15:12:22 -0400
committerTyler Sorensen <tylersorensen3221@hotmail.com>2018-07-11 15:12:22 -0400
commit7f2e98a1406da6c5293f0c988df95edc246ef88d (patch)
tree5fdb94393d54cff495f7f2e6a2f0edc6df1eebd8 /src/kernels/level3/xgemm_part1.opencl
parent7bae54f61f8a2b589421afd57c9da6c8775155ef (diff)
added inline ptx to support shuffle on Nvidia GPUs
Diffstat (limited to 'src/kernels/level3/xgemm_part1.opencl')
-rw-r--r--src/kernels/level3/xgemm_part1.opencl24
1 files changed, 20 insertions, 4 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 99d64c91..9e483b3e 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance
#endif
-// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#ifndef USE_SUBGROUP_SHUFFLING
- #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
+#ifndef NVIDIA_WARPS_AS_SUBGROUPS
+ #define NVIDIA_WARPS_AS_SUBGROUPS 0
+#endif
+#ifndef NVIDIA_POST_VOLTA
+ #define NVIDIA_POST_VOLTA 0
#endif
-#if USE_SUBGROUP_SHUFFLING == 1
+#ifndef INTEL_SUBGROUP_EXTENSION
+ #define INTEL_SUBGROUP_EXTENSION 0
+#endif
+//#ifndef USE_SUBGROUP_SHUFFLING
+ #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
+//#endif
+
+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
#endif
+
+// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
+#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+ #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
+#endif
+
#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
#undef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold