added inline ptx to support shuffle on Nvidia GPUs

author: Tyler Sorensen <tylersorensen3221@hotmail.com> 2018-07-11 15:12:22 -0400
committer: Tyler Sorensen <tylersorensen3221@hotmail.com> 2018-07-11 15:12:22 -0400
commit: 7f2e98a1406da6c5293f0c988df95edc246ef88d (patch)
tree: 5fdb94393d54cff495f7f2e6a2f0edc6df1eebd8 /src/kernels
parent: 7bae54f61f8a2b589421afd57c9da6c8775155ef (diff)
2 files changed, 60 insertions, 6 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 99d64c91..9e483b3e 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
   #define GLOBAL_MEM_FENCE 0    // Global synchronisation barrier for potential better performance
 #endif
 
-// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#ifndef USE_SUBGROUP_SHUFFLING
-  #define USE_SUBGROUP_SHUFFLING 0     // Optionally enables subgroup shuffling for Intel GPUs
+#ifndef NVIDIA_WARPS_AS_SUBGROUPS
+  #define NVIDIA_WARPS_AS_SUBGROUPS 0  // Set to 1 to select the NVIDIA inline-PTX subgroup path
+#endif
+#ifndef NVIDIA_POST_VOLTA
+  #define NVIDIA_POST_VOLTA 0          // Set to 1 to select the .sync shuffle form (Volta and later)
 #endif
-#if USE_SUBGROUP_SHUFFLING == 1
+#ifndef INTEL_SUBGROUP_EXTENSION
+  #define INTEL_SUBGROUP_EXTENSION 0   // Set to 1 to select the Intel subgroup-extension path
+#endif
+#ifndef USE_SUBGROUP_SHUFFLING
+  #define USE_SUBGROUP_SHUFFLING 0     // Optionally enables subgroup shuffling (Intel or NVIDIA GPUs)
+#endif
+
+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
+  #define SUBGROUP_SIZE 8              // Assumes subgroup size is always 8 on Intel GPUs
 #endif
+// NOTE(review): if both vendor macros are non-zero, SUBGROUP_SIZE gets defined twice -- confirm exclusive
+// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
+#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+  #define SUBGROUP_SIZE 32              // Assumes subgroup size is always 32 on NVIDIA GPUs
+#endif
+
 #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
   #undef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0     // Disables subgroups in case the assumptions don't hold
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index c3920cb5..8e20b1b8 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,6 +17,44 @@ R"(
 
 // =================================================================================================
 
+// A common interface for subgroup functions
+
+#if USE_SUBGROUP_SHUFFLING == 1
+
+INLINE_FUNC int clblast_get_sub_group_local_id() {
+  // Returns this work-item's index within its subgroup (Intel) or its lane within the warp (NVIDIA)
+  // Intel: forwards to the built-in provided by the subgroup extension
+  #if INTEL_SUBGROUP_EXTENSION == 1
+  return get_sub_group_local_id();
+  
+  // NVIDIA: reads the %laneid register through inline PTX
+  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  int ret;
+  asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
+  return ret;
+  #endif // NOTE(review): no return is emitted if neither macro is 1 -- confirm that case is excluded
+}
+
+INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
+  // Returns the copy of 'reg' held by the subgroup member / warp lane whose index is 'src'
+  // Intel: forwards to the shuffle built-in provided by the subgroup extension
+  #if INTEL_SUBGROUP_EXTENSION == 1
+  return intel_sub_group_shuffle(reg, src);
+  
+  // NVIDIA: index-mode warp shuffle through inline PTX
+  // Volta and later require the .sync form, which takes an extra member-mask operand (0xffffffff)
+  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  realN ret;  // NOTE(review): "f" constraints and .b32 assume realN is a 32-bit float -- confirm
+    #if NVIDIA_POST_VOLTA == 1
+    asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
+    #else
+    asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
+    #endif
+  return ret;
+  #endif
+}
+#endif
+
 // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
 INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
                            const __global realM* restrict agm, const __global realN* restrict bgm,
@@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
         #elif GEMMK == 1
           // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups
           #if USE_SUBGROUP_SHUFFLING == 1
-            const int _ni = get_sub_group_local_id();
+            const int _ni = clblast_get_sub_group_local_id();
             #pragma unroll
             for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
               apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
@@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
               for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
                 const int index =  _ni * (MWI/VWM) + _mi;
                 #if USE_SUBGROUP_SHUFFLING == 1
-                  const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
+                  const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
                 #else
                   const realN aval = apm[_ni * (KREG/VWN) + _ki];
                 #endif
author	Tyler Sorensen <tylersorensen3221@hotmail.com>	2018-07-11 15:12:22 -0400
committer	Tyler Sorensen <tylersorensen3221@hotmail.com>	2018-07-11 15:12:22 -0400
commit	7f2e98a1406da6c5293f0c988df95edc246ef88d (patch)
tree	5fdb94393d54cff495f7f2e6a2f0edc6df1eebd8 /src/kernels
parent	7bae54f61f8a2b589421afd57c9da6c8775155ef (diff)