summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Tyler Sorensen <tylersorensen3221@hotmail.com> 2018-07-14 19:50:47 -0400
committer Tyler Sorensen <tylersorensen3221@hotmail.com> 2018-07-14 19:50:47 -0400
commit 7709a7308bce5492e06d8867a4dd9dff5b2ba950 (patch)
tree ed35acf41257752ec165480c2298edf17080da4c
parent 36093429fd444d0a1fc7de25dfaf7f2f775cfabc (diff)
Applied feedback from Cedric from first pull request
-rw-r--r-- src/clpp11.hpp 2
-rw-r--r-- src/cupp11.hpp 5
-rw-r--r-- src/kernels/level3/xgemm_part1.opencl 21
-rw-r--r-- src/kernels/level3/xgemm_part3.opencl 10
-rw-r--r-- src/tuning/kernels/xgemm.cpp 4
-rw-r--r-- src/utilities/compile.cpp 26
6 files changed, 36 insertions, 32 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 690f8c49..8ac0523f 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -44,7 +44,7 @@
#include <numeric> // std::accumulate
#include <cstring> // std::strlen
#include <cstdio> // fprintf, stderr
-#include "assert.h"
+#include <assert.h>
// OpenCL
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index a1cb1614..ce765844 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -327,6 +327,11 @@ public:
std::string AMDBoardName() const { return ""; }
std::string NVIDIAComputeCapability() const { return Capabilities(); }
+ // Returns whether the NVIDIA chip is of the Volta architecture or later (major version 7 or higher)
+ bool IsPostNVIDIAVolta() const {
+ return GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 7;
+ }
+
// Retrieves the above extra information
std::string GetExtraInfo() const { return NVIDIAComputeCapability(); }
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 9e483b3e..32386312 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,26 +114,27 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance
#endif
-#ifndef NVIDIA_WARPS_AS_SUBGROUPS
- #define NVIDIA_WARPS_AS_SUBGROUPS 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
#endif
-#ifndef NVIDIA_POST_VOLTA
- #define NVIDIA_POST_VOLTA 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
+ #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
#endif
-#ifndef INTEL_SUBGROUP_EXTENSION
- #define INTEL_SUBGROUP_EXTENSION 0
+#ifndef SUBGROUP_SHUFFLING_INTEL
+ #define SUBGROUP_SHUFFLING_INTEL 0
#endif
-//#ifndef USE_SUBGROUP_SHUFFLING
+#ifndef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
-//#endif
+#endif
// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
#endif
// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
-#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || \
+ SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
#define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
#endif
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 8e20b1b8..35ec735c 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -24,11 +24,11 @@ R"(
INLINE_FUNC int clblast_get_sub_group_local_id() {
// Intel extension
- #if INTEL_SUBGROUP_EXTENSION == 1
+ #if SUBGROUP_SHUFFLING_INTEL == 1
return get_sub_group_local_id();
// Nvidia inline PTX
- #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+ #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
int ret;
asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
return ret;
@@ -38,14 +38,14 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
// Intel extension
- #if INTEL_SUBGROUP_EXTENSION == 1
+ #if SUBGROUP_SHUFFLING_INTEL == 1
return intel_sub_group_shuffle(reg, src);
// Nvidia inline PTX
// Volta and later requires .sync shuffle instructions with an extra mask arg
- #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+ #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
realN ret;
- #if NVIDIA_POST_VOLTA == 1
+ #if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
#else
asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 75e776e6..10164c41 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) {
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
- StartVariation<1>(argc, argv);
- StartVariation<2>(argc, argv);
+ //StartVariation<1>(argc, argv);
+ //StartVariation<2>(argc, argv);
StartVariation<11>(argc, argv);
StartVariation<12>(argc, argv);
return 0;
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index cd0b3d2b..835f54b4 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -58,24 +58,22 @@ std::shared_ptr<Program> CompileFromSource(
header_string += "#define GLOBAL_MEM_FENCE 1\n";
}
- // For GPUs with subgroup support, use subgroup shuffling.
- // Currently these are Intel via an extension and Nvidia using inline PTX (restricted to 32 bit)
- if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) ||
- (device.IsNVIDIA() && static_cast<int>(precision) == 32))) {
+ // For Intel GPUs with subgroup support, use subgroup shuffling.
+ if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
+ header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
+ }
- // Define the flavor of subgroup
- if (device.IsNVIDIA()) {
- header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n";
+ // For NVIDIA GPUs, inline PTX can provide subgroup support
+ if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) {
+ header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
- // Nvidia additionally needs to check pre or post volta due to new
- // shuffle commands
- if (device.IsPostNVIDIAVolta()) {
- header_string += "#define NVIDIA_POST_VOLTA 1\n";
- }
+ // Nvidia needs to check pre or post volta due to new shuffle commands
+ if (device.IsPostNVIDIAVolta()) {
+ header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n";
}
- else if (device.HasExtension(kKhronosIntelSubgroups)) {
- header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n";
+ else {
+ header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
}
}