Applied feedback from Cedric from first pull request

author: Tyler Sorensen <tylersorensen3221@hotmail.com> 2018-07-14 19:50:47 -0400
committer: Tyler Sorensen <tylersorensen3221@hotmail.com> 2018-07-14 19:50:47 -0400
commit: 7709a7308bce5492e06d8867a4dd9dff5b2ba950 (patch)
tree: ed35acf41257752ec165480c2298edf17080da4c /src/kernels
parent: 36093429fd444d0a1fc7de25dfaf7f2f775cfabc (diff)
2 files changed, 16 insertions, 15 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 9e483b3e..32386312 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,26 +114,27 @@ R"(
   #define GLOBAL_MEM_FENCE 0    // Global synchronisation barrier for potential better performance
 #endif
 
-#ifndef NVIDIA_WARPS_AS_SUBGROUPS
-  #define NVIDIA_WARPS_AS_SUBGROUPS 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
+  #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
 #endif
-#ifndef NVIDIA_POST_VOLTA
-  #define NVIDIA_POST_VOLTA 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
+  #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
 #endif
-#ifndef INTEL_SUBGROUP_EXTENSION
-  #define INTEL_SUBGROUP_EXTENSION 0
+#ifndef SUBGROUP_SHUFFLING_INTEL
+  #define SUBGROUP_SHUFFLING_INTEL 0
 #endif
-//#ifndef USE_SUBGROUP_SHUFFLING
+#ifndef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0     // Optionally enables subgroup shuffling (Intel and NVIDIA GPUs)
-//#endif
+#endif
 
 // Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
   #define SUBGROUP_SIZE 8              // Assumes subgroup size is always 8 on Intel GPUs
 #endif
 
 // NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
-#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || \
+                                    SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
   #define SUBGROUP_SIZE 32              // Assumes subgroup size is always 32 on NVIDIA GPUs
 #endif
 
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 8e20b1b8..35ec735c 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -24,11 +24,11 @@ R"(
 INLINE_FUNC int clblast_get_sub_group_local_id() {
   
   // Intel extension 
-  #if INTEL_SUBGROUP_EXTENSION == 1
+  #if SUBGROUP_SHUFFLING_INTEL == 1
   return get_sub_group_local_id();
   
   // Nvidia inline PTX
-  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
   int ret;
   asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
   return ret;
@@ -38,14 +38,14 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
 INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
   
   // Intel extension 
-  #if INTEL_SUBGROUP_EXTENSION == 1
+  #if SUBGROUP_SHUFFLING_INTEL == 1
   return intel_sub_group_shuffle(reg, src);
   
   // Nvidia inline PTX
   // Volta and later requires .sync shuffle instructions with an extra mask arg
-  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
   realN ret;
-    #if NVIDIA_POST_VOLTA == 1
+    #if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
     asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
     #else
     asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
author	Tyler Sorensen <tylersorensen3221@hotmail.com>	2018-07-14 19:50:47 -0400
committer	Tyler Sorensen <tylersorensen3221@hotmail.com>	2018-07-14 19:50:47 -0400
commit	7709a7308bce5492e06d8867a4dd9dff5b2ba950 (patch)
tree	ed35acf41257752ec165480c2298edf17080da4c /src/kernels
parent	36093429fd444d0a1fc7de25dfaf7f2f775cfabc (diff)