summaryrefslogtreecommitdiff
path: root/src/kernels/level3/xgemm_part1.opencl
diff options
context:
space:
mode:
authorTyler Sorensen <tylersorensen3221@hotmail.com>2018-07-14 19:50:47 -0400
committerTyler Sorensen <tylersorensen3221@hotmail.com>2018-07-14 19:50:47 -0400
commit7709a7308bce5492e06d8867a4dd9dff5b2ba950 (patch)
treeed35acf41257752ec165480c2298edf17080da4c /src/kernels/level3/xgemm_part1.opencl
parent36093429fd444d0a1fc7de25dfaf7f2f775cfabc (diff)
Applied feedback from Cedric from first pull request
Diffstat (limited to 'src/kernels/level3/xgemm_part1.opencl')
-rw-r--r--src/kernels/level3/xgemm_part1.opencl21
1 files changed, 11 insertions, 10 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 9e483b3e..32386312 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,26 +114,27 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance
#endif
-#ifndef NVIDIA_WARPS_AS_SUBGROUPS
- #define NVIDIA_WARPS_AS_SUBGROUPS 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
#endif
-#ifndef NVIDIA_POST_VOLTA
- #define NVIDIA_POST_VOLTA 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
#endif
-#ifndef INTEL_SUBGROUP_EXTENSION
- #define INTEL_SUBGROUP_EXTENSION 0
+#ifndef SUBGROUP_SHUFFLING_INTEL
+ #define SUBGROUP_SHUFFLING_INTEL 0
#endif
-//#ifndef USE_SUBGROUP_SHUFFLING
+#ifndef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
-//#endif
+#endif
// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
#endif
// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
-#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || \
+ SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
#define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
#endif