diff options
Diffstat (limited to 'src/kernels/level3')
-rw-r--r-- | src/kernels/level3/xgemm_part1.opencl | 8 |
1 files changed, 5 insertions, 3 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 80a60107..cc03696e 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -126,13 +126,15 @@ R"(
 #endif
 
 // Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL == 1
   #define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
 #endif
 
 // NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
-#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
-  #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
+#if USE_SUBGROUP_SHUFFLING == 1
+  #if SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
+    #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
+  #endif
 #endif
 
 #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE