Implemented changes to boost Adreno performance according to https://jira-dc.qualcomm.com/jira/browse/OSR-8731

author: Angus, Alexander <aangus@qti.qualcomm.com> 2023-01-03 10:56:04 -0800
committer: Angus, Alexander <aangus@qti.qualcomm.com> 2023-01-03 10:56:04 -0800
commit: 4f394608a28f419dfd6091c704148d9e638a26f0 (patch)
tree: 4c0e042109c4d249ff5b700fc49a862169edec5a /src/utilities
parent: 03cffa83c5f7742f8ec0c5e762bb7048e38952f3 (diff)
2 files changed, 12 insertions, 3 deletions
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index aeb7a3e2..7170c30a 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -37,13 +37,13 @@ std::shared_ptr<Program> CompileFromSource(
 
   // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
   // which it is known to work with all OpenCL platforms.
-  if (device.IsNVIDIA() || device.IsARM()) {
+  if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) {
     header_string += "#define USE_INLINE_KEYWORD 1\n";
   }
 
   // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
   // performance, but might result in a reduced accuracy.
-  if (device.IsAMD() && device.IsGPU()) {
+  if ((device.IsAMD() && device.IsGPU()) || device.IsQualcomm()) {
     header_string += "#define USE_CL_MAD 1\n";
   }
 
@@ -54,7 +54,7 @@ std::shared_ptr<Program> CompileFromSource(
 
   // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
   // performance through better cache behaviour
-  if (device.IsARM() && device.IsGPU()) {
+  if ((device.IsARM() && device.IsGPU()) || device.IsQualcomm()) {
     header_string += "#define GLOBAL_MEM_FENCE 1\n";
   }
 
@@ -77,6 +77,12 @@ std::shared_ptr<Program> CompileFromSource(
       header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
     }
   }
+
+  // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
+  // This option compiles without the workgroup size requirement and does not affect correctness.
+  if (device.IsQualcomm()) {
+    header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
+  }
   
   // Optionally adds a translation header from OpenCL kernels to CUDA kernels
   #ifdef CUDA_API
diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp
index 29161e74..32de2e2e 100644
--- a/src/utilities/utilities.cpp
+++ b/src/utilities/utilities.cpp
@@ -463,6 +463,9 @@ std::string GetDeviceArchitecture(const Device& device) {
     else if (device.HasExtension(kKhronosAttributesAMD)) {
       device_architecture = device.Name(); // Name is architecture for AMD APP and AMD ROCm
     }
+    else if (device.IsQualcomm()) { // queries the Adreno GPU architecture version
+      device_architecture = device.AdrenoVersion();
+    }
     // Note: no else - 'device_architecture' might be the empty string
   #endif
author	Angus, Alexander <aangus@qti.qualcomm.com>	2023-01-03 10:56:04 -0800
committer	Angus, Alexander <aangus@qti.qualcomm.com>	2023-01-03 10:56:04 -0800
commit	4f394608a28f419dfd6091c704148d9e638a26f0 (patch)
tree	4c0e042109c4d249ff5b700fc49a862169edec5a /src/utilities
parent	03cffa83c5f7742f8ec0c5e762bb7048e38952f3 (diff)