diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2018-08-02 08:35:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-02 08:35:32 +0200 |
commit | 2bea758165cd8c784d1fafe60a949154c2e10000 (patch) | |
tree | 36a58d77e2900ed71c4f66de1bb32e68cbe7a21a | |
parent | bed10d273118e39ef49cf2aea7d69c4194a8384f (diff) | |
parent | 503ab74f020fe764fd2bd69d60ecd72f758b11a2 (diff) |
Merge pull request #309 from CNugteren/CLBlast-306-omatcopy-conjugate
Fixes bug in conjugate transpose not being executed
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | src/routines/common.hpp | 9 | ||||
-rw-r--r-- | test/correctness/testblas.cpp | 2 | ||||
-rw-r--r-- | test/routines/levelx/xomatcopy.hpp | 4 | ||||
-rw-r--r-- | test/test_utilities.cpp | 10 | ||||
-rw-r--r-- | test/test_utilities.hpp | 4 |
6 files changed, 25 insertions, 5 deletions
@@ -2,6 +2,7 @@ Development (next version) - Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah') - The tuners now check beforehand on invalid local thread sizes and skip those completely +- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel - Various minor fixes and enhancements diff --git a/src/routines/common.hpp b/src/routines/common.hpp index c30a2e0e..c6db0152 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, // Determines the right kernel auto kernel_name = std::string{}; + auto pad_kernel = false; if (do_transpose) { if (use_fast_kernel && IsMultiple(src_ld, db["TRA_WPT"]) && @@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; + pad_kernel = (do_pad || do_conjugate); + kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { @@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix"; + pad_kernel = do_pad; + kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix"; } } @@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); kernel.SetArgument(10, GetRealArg(alpha)); - if (do_pad) { + if (pad_kernel) { kernel.SetArgument(11, static_cast<int>(do_conjugate)); } else { diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index 3c92565e..d28aba40 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -239,7 +239,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st } // Tests the error count (should be zero) - TestErrorCount(errors, get_id1_(args)*get_id2_(args), args); + TestErrorCount(errors, get_id1_(args)*get_id2_(args) + kCanarySize, args); } TestEnd(); } diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp index ea35dbe2..4a93b29d 100644 --- a/test/routines/levelx/xomatcopy.hpp +++ b/test/routines/levelx/xomatcopy.hpp @@ -45,7 +45,9 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host) const auto b_two = (b_rotated) ? id1 : id2; const auto a_index = a_two * args.a_ld + a_one + args.a_offset; const auto b_index = b_two * args.b_ld + b_one + args.b_offset; - buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index]; + auto a_value = buffers_host.a_mat[a_index]; + if (args.a_transpose == Transpose::kConjugate) { a_value = ComplexConjugate(a_value); } + buffers_host.b_mat[b_index] = args.alpha * a_value; } } return StatusCode::kSuccess; diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp index 59ec949d..c43200b9 100644 --- a/test/test_utilities.cpp +++ b/test/test_utilities.cpp @@ -31,6 +31,16 @@ template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value // ================================================================================================= +// Performs a complex conjugate if complex +template <typename T> T ComplexConjugate(const T value) { return value; } +template half ComplexConjugate(const half); +template float ComplexConjugate(const float); +template double ComplexConjugate(const double); +template <> float2 ComplexConjugate(const float2 value) { return float2{value.real(), -value.imag()}; } +template <> double2 ComplexConjugate(const double2 value) { return double2{value.real(), -value.imag()}; } + +// ================================================================================================= + template <typename T, typename U> void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host, Queue &queue, const std::vector<std::string> &names) { diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp index 42660bdb..7bf5e65f 100644 --- a/test/test_utilities.hpp +++ b/test/test_utilities.hpp @@ -70,6 +70,10 @@ struct BuffersHost { // ================================================================================================= +template <typename T> T ComplexConjugate(const T value); + +// ================================================================================================= + // Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast // data-types such as the Layout and Transpose data-types. template <typename T> |