diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2018-07-31 21:49:53 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-07-31 21:49:53 +0200 |
commit | bed10d273118e39ef49cf2aea7d69c4194a8384f (patch) | |
tree | 2e034a7c9751741bb6b2b03a0e619d2f54dcd4cf | |
parent | d749c4af7201a462f3ee2f2105528d14a9efa4d1 (diff) | |
parent | 2b76bfee97fb775128126506f695535e38ea41ef (diff) |
Merge pull request #308 from CNugteren/CLBlast-301-weird-AMD-Hainan-bug
Added workaround for AMD Southern Islands GPU issue
-rw-r--r-- | CMakeLists.txt | 6 | ||||
-rw-r--r-- | src/clpp11.hpp | 32 |
2 files changed, 35 insertions, 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index fea08287..fb62ae27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,12 @@ option(TESTS "Enable compilation of the correctness tests" OFF) option(NETLIB "Enable compilation of the CBLAS Netlib API" OFF) option(CUBLAS "Enables performance comparison against cuBLAS on NVIDIA GPUs" OFF) +# Workarounds for bugs +option(AMD_SI_EMPTY_KERNEL_WORKAROUND "Enables workaround for bug in AMD Southern Island GPUs" OFF) +if(AMD_SI_EMPTY_KERNEL_WORKAROUND) + add_definitions(-DAMD_SI_EMPTY_KERNEL_WORKAROUND) +endif() + # Select between an OpenCL API (default) or a CUDA API (beta) option(OPENCL "Build CLBlast with an OpenCL API (default)" ON) option(CUDA "Build CLBlast with a CUDA API (beta)" OFF) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 8ac0523f..70da2329 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -447,8 +447,14 @@ class Program { // Source-based constructor with memory management explicit Program(const Context &context, const std::string &source) { - const char *source_ptr = &source[0]; - const auto length = source.length(); + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + const std::string source_null_kernel = source + "\n__kernel void null_kernel() {}\n"; + const char *source_ptr = &source_null_kernel[0]; + const auto length = source_null_kernel.length(); + #else + const char *source_ptr = &source[0]; + const auto length = source.length(); + #endif auto status = CL_SUCCESS; program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status); CLCudaAPIError::Check(status, "clCreateProgramWithSource"); @@ -764,10 +770,21 @@ class Kernel { kernel_(new cl_kernel, [](cl_kernel* k) { if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } delete k; - }) { + }) + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + , null_kernel_(new cl_kernel, [](cl_kernel* k) { + if (*k) { CheckErrorDtor(clReleaseKernel(*k)); } + delete k; + }) + #endif + { auto status = CL_SUCCESS; *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status); CLCudaAPIError::Check(status, "clCreateKernel"); + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + *null_kernel_ = clCreateKernel(program->operator()(), "null_kernel", &status); + CLCudaAPIError::Check(status, "clCreateKernel"); + #endif } // Sets a kernel argument at the indicated position @@ -831,12 +848,21 @@ class Kernel { static_cast<cl_uint>(waitForEventsPlain.size()), !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr, event)); + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + const std::vector<size_t> nullRange = {1}; + CheckError(clEnqueueNDRangeKernel(queue(), *null_kernel_, static_cast<cl_uint>(nullRange.size()), + nullptr, nullRange.data(), nullptr, + 0, nullptr, nullptr)); + #endif } // Accessor to the private data-member const cl_kernel& operator()() const { return *kernel_; } private: std::shared_ptr<cl_kernel> kernel_; + #ifdef AMD_SI_EMPTY_KERNEL_WORKAROUND + std::shared_ptr<cl_kernel> null_kernel_; + #endif // Internal implementation for the recursive SetArguments function. template <typename T> |