diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2023-01-21 20:28:32 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-01-21 20:28:32 +0100 |
commit | e72f87ae5eca5e2ea8aea4f2ce49408c1faa0521 (patch) | |
tree | 588a426b6350a5c982d89d98749ae78667fd23b4 /src/utilities/compile.cpp | |
parent | 03cffa83c5f7742f8ec0c5e762bb7048e38952f3 (diff) | |
parent | 73f49e9b3d4abc4214122e4b8c07a736e01626ee (diff) |
Merge pull request #451 from CodeLinaro/master
CLBlast modifications to address Qualcomm Adreno performance
Diffstat (limited to 'src/utilities/compile.cpp')
-rw-r--r-- | src/utilities/compile.cpp | 12 |
1 files changed, 9 insertions, 3 deletions
```diff
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index aeb7a3e2..59aa6107 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -37,13 +37,13 @@ std::shared_ptr<Program> CompileFromSource(
 
   // Not all OpenCL compilers support the 'inline' keyword. The keyword is only used for devices on
   // which it is known to work with all OpenCL platforms.
-  if (device.IsNVIDIA() || device.IsARM()) {
+  if (device.IsNVIDIA() || device.IsARM() || device.IsQualcomm()) {
     header_string += "#define USE_INLINE_KEYWORD 1\n";
   }
 
   // For specific devices, use the non-IEE754 compliant OpenCL mad() instruction. This can improve
   // performance, but might result in a reduced accuracy.
-  if (device.IsAMD() && device.IsGPU()) {
+  if ((device.IsAMD() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
     header_string += "#define USE_CL_MAD 1\n";
   }
 
@@ -54,7 +54,7 @@ std::shared_ptr<Program> CompileFromSource(
 
   // For specific devices add a global synchronisation barrier to the GEMM kernel to optimize
   // performance through better cache behaviour
-  if (device.IsARM() && device.IsGPU()) {
+  if ((device.IsARM() && device.IsGPU()) || (device.IsQualcomm() && device.IsGPU())) {
     header_string += "#define GLOBAL_MEM_FENCE 1\n";
   }
 
@@ -77,6 +77,12 @@ std::shared_ptr<Program> CompileFromSource(
       header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
     }
   }
+
+  // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
+  // This option compiles without the workgroup size requirement and does not affect correctness.
+  if (device.IsQualcomm()) {
+    header_string += "#define RELAX_WORKGROUP_SIZE 1\n";
+  }
 
   // Optionally adds a translation header from OpenCL kernels to CUDA kernels
   #ifdef CUDA_API
```