diff options
author | Tyler Sorensen <tylersorensen3221@hotmail.com> | 2018-07-11 15:12:22 -0400 |
---|---|---|
committer | Tyler Sorensen <tylersorensen3221@hotmail.com> | 2018-07-11 15:12:22 -0400 |
commit | 7f2e98a1406da6c5293f0c988df95edc246ef88d (patch) | |
tree | 5fdb94393d54cff495f7f2e6a2f0edc6df1eebd8 /src/kernels | |
parent | 7bae54f61f8a2b589421afd57c9da6c8775155ef (diff) |
added inline ptx to support shuffle on Nvidia GPUs
Diffstat (limited to 'src/kernels')
-rw-r--r-- | src/kernels/level3/xgemm_part1.opencl | 24 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part3.opencl | 42 |
2 files changed, 60 insertions, 6 deletions
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 99d64c91..9e483b3e 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -114,13 +114,29 @@ R"( #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance #endif -// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) -#ifndef USE_SUBGROUP_SHUFFLING - #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs +#ifndef NVIDIA_WARPS_AS_SUBGROUPS + #define NVIDIA_WARPS_AS_SUBGROUPS 0 +#endif +#ifndef NVIDIA_POST_VOLTA + #define NVIDIA_POST_VOLTA 0 #endif -#if USE_SUBGROUP_SHUFFLING == 1 +#ifndef INTEL_SUBGROUP_EXTENSION + #define INTEL_SUBGROUP_EXTENSION 0 +#endif +//#ifndef USE_SUBGROUP_SHUFFLING + #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs +//#endif + +// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) +#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION #define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs #endif + +// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html) +#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS + #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs +#endif + #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE #undef USE_SUBGROUP_SHUFFLING #define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index c3920cb5..8e20b1b8 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -17,6 +17,44 @@ R"( // ================================================================================================= +// A common interface for subgroup functions + +#if USE_SUBGROUP_SHUFFLING == 1 + +INLINE_FUNC int clblast_get_sub_group_local_id() { + + // Intel extension + #if INTEL_SUBGROUP_EXTENSION == 1 + return get_sub_group_local_id(); + + // Nvidia inline PTX + #elif NVIDIA_WARPS_AS_SUBGROUPS == 1 + int ret; + asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) ); + return ret; + #endif +} + +INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) { + + // Intel extension + #if INTEL_SUBGROUP_EXTENSION == 1 + return intel_sub_group_shuffle(reg, src); + + // Nvidia inline PTX + // Volta and later requires .sync shuffle instructions with an extra mask arg + #elif NVIDIA_WARPS_AS_SUBGROUPS == 1 + realN ret; + #if NVIDIA_POST_VOLTA == 1 + asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src)); + #else + asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src)); + #endif + return ret; + #endif +} +#endif + // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions. INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, const __global realM* restrict agm, const __global realN* restrict bgm, @@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #elif GEMMK == 1 // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups #if USE_SUBGROUP_SHUFFLING == 1 - const int _ni = get_sub_group_local_id(); + const int _ni = clblast_get_sub_group_local_id(); #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); @@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { const int index = _ni * (MWI/VWM) + _mi; #if USE_SUBGROUP_SHUFFLING == 1 - const realN aval = intel_sub_group_shuffle(apm[_ki], _ni); + const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni); #else const realN aval = apm[_ni * (KREG/VWN) + _ki]; #endif |