author     Cedric Nugteren <web@cedricnugteren.nl>    2018-07-29 10:26:34 +0200
committer  Cedric Nugteren <web@cedricnugteren.nl>    2018-07-29 10:26:34 +0200
commit     5903820ba2736ec4a5f338c2325f59f511b4a0e8 (patch)
tree       08eaca8cde2e45682807dcfa6db8252ee520d1bf /src/kernels
parent     1c9a74147073234da953b84f0bbafefbcf5ffb4f (diff)
parent     dda1e567f872d3d89f2f7cd890fb5b29ff98537c (diff)
Merge branch 'master' into CLBlast-267-convgemm
Diffstat (limited to 'src/kernels')
-rw-r--r--  src/kernels/common.opencl               |  2
-rw-r--r--  src/kernels/level3/xgemm_part1.opencl   | 20
-rw-r--r--  src/kernels/level3/xgemm_part3.opencl   | 46
3 files changed, 61 insertions, 7 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 4a476a8b..0ad38919 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -260,7 +260,7 @@ R"(
 // Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from:
 // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
 // More details: https://github.com/CNugteren/CLBlast/issues/53
-#if USE_STAGGERED_INDICES == 1
+#if USE_STAGGERED_INDICES == 1 && GEMMK == 0
   INLINE_FUNC int GetGroupIDFlat() {
     return get_group_id(0) + get_num_groups(0) * get_group_id(1);
   }
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 99d64c91..3cfc5dfb 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
   #define GLOBAL_MEM_FENCE 0      // Global synchronisation barrier for potential better performance
 #endif
 
-// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
+  #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
+#endif
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
+  #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
+#endif
+#ifndef SUBGROUP_SHUFFLING_INTEL
+  #define SUBGROUP_SHUFFLING_INTEL 0
+#endif
 #ifndef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0      // Optionally enables subgroup shuffling for Intel GPUs
 #endif
-#if USE_SUBGROUP_SHUFFLING == 1
+
+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
   #define SUBGROUP_SIZE 8               // Assumes subgroup size is always 8 on Intel GPUs
 #endif
+
+// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
+  #define SUBGROUP_SIZE 32              // Assumes subgroup size is always 32 on NVIDIA GPUs
+#endif
+
 #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
   #undef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0      // Disables subgroups in case the assumptions don't hold
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index c3920cb5..90de0b3b 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,6 +17,44 @@ R"(
 
 // =================================================================================================
 
+// A common interface for subgroup functions
+
+#if USE_SUBGROUP_SHUFFLING == 1
+
+INLINE_FUNC int clblast_get_sub_group_local_id() {
+
+  // Intel extension
+  #if SUBGROUP_SHUFFLING_INTEL == 1
+    return get_sub_group_local_id();
+
+  // Nvidia inline PTX
+  #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
+    int ret;
+    asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
+    return ret;
+  #endif
+}
+
+INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
+
+  // Intel extension
+  #if SUBGROUP_SHUFFLING_INTEL == 1
+    return intel_sub_group_shuffle(reg, src);
+
+  // Nvidia inline PTX
+  // Volta and later requires .sync shuffle instructions with an extra mask arg
+  #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
+    realN ret;
+    #if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
+      asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
+    #else
+      asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
+    #endif
+    return ret;
+  #endif
+}
+#endif
+
 // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
 INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
                            const __global realM* restrict agm, const __global realN* restrict bgm,
@@ -53,8 +91,8 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
   #if GEMMK == 1
     const __global real* restrict a_ptr = (const __global real* restrict) &agm[0];
     const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0];
-    const int tid_x = get_global_id(0);
-    const int tid_y = get_global_id(1);
+    const int tid_x = get_local_id(0) + MDIMC * GetGroupID0();
+    const int tid_y = get_local_id(1) + NDIMC * GetGroupID1();
   #endif
 
   // Combined thread identifier (volatile to disable caching)
@@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
       #elif GEMMK == 1
         // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups
        #if USE_SUBGROUP_SHUFFLING == 1
-          const int _ni = get_sub_group_local_id();
+          const int _ni = clblast_get_sub_group_local_id();
          #pragma unroll
          for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
            apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
@@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
          for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
            const int index = _ni * (MWI/VWM) + _mi;
            #if USE_SUBGROUP_SHUFFLING == 1
-              const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
+              const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
            #else
              const realN aval = apm[_ni * (KREG/VWN) + _ki];
            #endif