summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTyler Sorensen <tylersorensen3221@hotmail.com>2018-07-11 15:12:22 -0400
committerTyler Sorensen <tylersorensen3221@hotmail.com>2018-07-11 15:12:22 -0400
commit7f2e98a1406da6c5293f0c988df95edc246ef88d (patch)
tree5fdb94393d54cff495f7f2e6a2f0edc6df1eebd8
parent7bae54f61f8a2b589421afd57c9da6c8775155ef (diff)
added inline ptx to support shuffle on Nvidia GPUs
-rw-r--r--src/clpp11.hpp7
-rw-r--r--src/kernels/level3/xgemm_part1.opencl24
-rw-r--r--src/kernels/level3/xgemm_part3.opencl42
-rw-r--r--src/tuning/kernels/xgemm.cpp4
-rw-r--r--src/tuning/kernels/xgemm.hpp20
-rw-r--r--src/utilities/compile.cpp22
6 files changed, 107 insertions, 12 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 8d6a1127..690f8c49 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -44,6 +44,7 @@
#include <numeric> // std::accumulate
#include <cstring> // std::strlen
#include <cstdio> // fprintf, stderr
+#include "assert.h"
// OpenCL
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
@@ -355,6 +356,12 @@ class Device {
std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
}
+ // Returns whether the Nvidia chip is a Volta or later architecture (sm_70 or higher)
+ bool IsPostNVIDIAVolta() const {
+ assert(HasExtension("cl_nv_device_attribute_query"));
+ return GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV) >= 7;
+ }
+
// Retrieves the above extra information (if present)
std::string GetExtraInfo() const {
if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 99d64c91..9e483b3e 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance
#endif
-// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#ifndef USE_SUBGROUP_SHUFFLING
- #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
+#ifndef NVIDIA_WARPS_AS_SUBGROUPS
+ #define NVIDIA_WARPS_AS_SUBGROUPS 0
+#endif
+#ifndef NVIDIA_POST_VOLTA
+ #define NVIDIA_POST_VOLTA 0
#endif
-#if USE_SUBGROUP_SHUFFLING == 1
+#ifndef INTEL_SUBGROUP_EXTENSION
+ #define INTEL_SUBGROUP_EXTENSION 0
+#endif
+//#ifndef USE_SUBGROUP_SHUFFLING
+ #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
+//#endif
+
+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
#endif
+
+// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
+#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+ #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
+#endif
+
#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
#undef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index c3920cb5..8e20b1b8 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,6 +17,44 @@ R"(
// =================================================================================================
+// A common interface for subgroup functions
+
+#if USE_SUBGROUP_SHUFFLING == 1
+
+INLINE_FUNC int clblast_get_sub_group_local_id() {
+
+ // Intel extension
+ #if INTEL_SUBGROUP_EXTENSION == 1
+ return get_sub_group_local_id();
+
+ // Nvidia inline PTX
+ #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+ int ret;
+ asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
+ return ret;
+ #endif
+}
+
+INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
+
+ // Intel extension
+ #if INTEL_SUBGROUP_EXTENSION == 1
+ return intel_sub_group_shuffle(reg, src);
+
+ // Nvidia inline PTX
+ // Volta and later requires .sync shuffle instructions with an extra mask arg
+ #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+ realN ret;
+ #if NVIDIA_POST_VOLTA == 1
+ asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
+ #else
+ asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
+ #endif
+ return ret;
+ #endif
+}
+#endif
+
// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
@@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#elif GEMMK == 1
// Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups
#if USE_SUBGROUP_SHUFFLING == 1
- const int _ni = get_sub_group_local_id();
+ const int _ni = clblast_get_sub_group_local_id();
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
@@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
const int index = _ni * (MWI/VWM) + _mi;
#if USE_SUBGROUP_SHUFFLING == 1
- const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
+ const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
#else
const realN aval = apm[_ni * (KREG/VWN) + _ki];
#endif
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 75e776e6..10164c41 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) {
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
- StartVariation<1>(argc, argv);
- StartVariation<2>(argc, argv);
+ //StartVariation<1>(argc, argv);
+ //StartVariation<2>(argc, argv);
StartVariation<11>(argc, argv);
StartVariation<12>(argc, argv);
return 0;
diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp
index 9a538c1b..c1b048b7 100644
--- a/src/tuning/kernels/xgemm.hpp
+++ b/src/tuning/kernels/xgemm.hpp
@@ -116,7 +116,7 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
};
}
else if (V == 11) { // Kernel 1: limited subset of tuning parameters - but explorable exhaustively
- settings.parameters = {
+ /*settings.parameters = {
{"GEMMK", {1}},
{"MWG", {16, 32, 64}},
{"NWG", {16, 32, 64}},
@@ -133,6 +133,24 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
{"SA", {0}},
{"SB", {0}},
{"KREG", {1, 2, 4}}
+ };*/
+ settings.parameters = {
+ { "GEMMK",{ 1 } },
+ { "MWG",{ 16, 32, 64 } },
+ { "NWG",{ 64 } }, // This divided by NDIMC needs to be 32
+ { "KWG",{ 1 } },
+ { "MDIMC",{ 64 } }, // This needs to be greater than 32
+ { "NDIMC",{ 2 } },
+ { "MDIMA",{ 64 } }, // This needs to be equal to MDIMC
+ { "NDIMB",{ 2 } }, // This needs to be equal to NDIMC
+ { "KWI",{ 1 } },
+ { "VWM",{ 1, 2, 4, 8 } },
+ { "VWN",{ 1, 2, 4 } },
+ { "STRM",{ 0 } },
+ { "STRN",{ 0 } },
+ { "SA",{ 0 } },
+ { "SB",{ 0 } },
+ { "KREG",{ 1, 2, 4 } }
};
}
else if (V == 12) { // Kernel 1: a lot more tuning parameters - has to be sampled randomly, too much to test all
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 05c29944..cd0b3d2b 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -58,11 +58,27 @@ std::shared_ptr<Program> CompileFromSource(
header_string += "#define GLOBAL_MEM_FENCE 1\n";
}
- // For Intel GPUs with subgroup support, use subgroup shuffling.
- if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
+ // For GPUs with subgroup support, use subgroup shuffling.
+ // Currently these are Intel (via an extension) and Nvidia (via inline PTX, restricted to 32-bit precision)
+ if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) ||
+ (device.IsNVIDIA() && static_cast<int>(precision) == 32))) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
- }
+ // Define the flavor of subgroup
+ if (device.IsNVIDIA()) {
+ header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n";
+
+ // Nvidia additionally needs to distinguish between pre- and post-Volta devices, since
+ // post-Volta requires the new .sync shuffle instructions
+ if (device.IsPostNVIDIAVolta()) {
+ header_string += "#define NVIDIA_POST_VOLTA 1\n";
+ }
+ }
+ else if (device.HasExtension(kKhronosIntelSubgroups)) {
+ header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n";
+ }
+ }
+
// Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API
header_string +=