summary | refs | log | tree | commit | diff
path: root/src/utilities
diff options
context:
space:
mode:
Diffstat (limited to 'src/utilities')
-rw-r--r-- src/utilities/compile.cpp 12
-rw-r--r-- src/utilities/utilities.cpp 3
-rw-r--r-- src/utilities/utilities.hpp 1
3 files changed, 13 insertions, 3 deletions
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index aeb7a3e2..59aa6107 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -37,13 +37,13 @@ std::shared_ptr<Program> CompileFromSource(
// Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
// which it is known to work with all OpenCL platforms.
- if (device.IsNVIDIA() || device.IsARM()) {
+ if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) {
header_string += "#define USE_INLINE_KEYWORD 1\n";
}
// For specific devices, use the non-IEEE 754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in reduced accuracy.
- if (device.IsAMD() && device.IsGPU()) {
+ if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
header_string += "#define USE_CL_MAD 1\n";
}
@@ -54,7 +54,7 @@ std::shared_ptr<Program> CompileFromSource(
// For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
// performance through better cache behaviour
- if (device.IsARM() && device.IsGPU()) {
+ if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
header_string += "#define GLOBAL_MEM_FENCE 1\n";
}
@@ -77,6 +77,12 @@ std::shared_ptr<Program> CompileFromSource(
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
}
}
+
+ // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
+ // This option compiles without the workgroup size requirement and does not affect correctness.
+ if (device.IsQualcomm()) {
+ header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
+ }
// Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index 29161e74..fbdcf9c2 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -463,6 +463,9 @@ std::string GetDeviceArchitecture(const Device& device) {
else if (device.HasExtension(kKhronosAttributesAMD)) {
device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm
}
+ else if ((device.IsQualcomm() && device.IsGPU())) { // queries the Adreno GPU architecture version
+ device_architecture = device.AdrenoVersion();
+ }
// Note: no else - 'device_architecture' might be the empty string
#endif
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index b66df118..3ed51dd6 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -133,6 +133,7 @@ constexpr auto kBufMatB = "B";
constexpr auto kBufMatC = "C";
constexpr auto kBufMatAP = "AP";
constexpr auto kBufScalar = "Scalar";
+constexpr auto kBufScalarUint = "ScalarUint";
// =================================================================================================