summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl>, 2018-08-02 08:35:32 +0200
committer: GitHub <noreply@github.com>, 2018-08-02 08:35:32 +0200
commit: 2bea758165cd8c784d1fafe60a949154c2e10000 (patch)
tree: 36a58d77e2900ed71c4f66de1bb32e68cbe7a21a
parent: bed10d273118e39ef49cf2aea7d69c4194a8384f (diff)
parent: 503ab74f020fe764fd2bd69d60ecd72f758b11a2 (diff)
Merge pull request #309 from CNugteren/CLBlast-306-omatcopy-conjugate
Fixes bug in conjugate transpose not being executed
-rw-r--r-- CHANGELOG | 1
-rw-r--r-- src/routines/common.hpp | 9
-rw-r--r-- test/correctness/testblas.cpp | 2
-rw-r--r-- test/routines/levelx/xomatcopy.hpp | 4
-rw-r--r-- test/test_utilities.cpp | 10
-rw-r--r-- test/test_utilities.hpp | 4
6 files changed, 25 insertions, 5 deletions
diff --git a/CHANGELOG b/CHANGELOG
index f6d05df3..3134e7bf 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@
Development (next version)
- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
- The tuners now check beforehand on invalid local thread sizes and skip those completely
+- Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
- Various minor fixes and enhancements
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index c30a2e0e..c6db0152 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -76,6 +76,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
// Determines the right kernel
auto kernel_name = std::string{};
+ auto pad_kernel = false;
if (do_transpose) {
if (use_fast_kernel &&
IsMultiple(src_ld, db["TRA_WPT"]) &&
@@ -85,7 +86,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
- kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+ pad_kernel = (do_pad || do_conjugate);
+ kernel_name = (pad_kernel) ? "TransposePadMatrix" : "TransposeMatrix";
}
}
else {
@@ -97,7 +99,8 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
}
else {
use_fast_kernel = false;
- kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+ pad_kernel = do_pad;
+ kernel_name = (pad_kernel) ? "CopyPadMatrix" : "CopyMatrix";
}
}
@@ -123,7 +126,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
kernel.SetArgument(10, GetRealArg(alpha));
- if (do_pad) {
+ if (pad_kernel) {
kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 3c92565e..d28aba40 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -239,7 +239,7 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
}
// Tests the error count (should be zero)
- TestErrorCount(errors, get_id1_(args)*get_id2_(args), args);
+ TestErrorCount(errors, get_id1_(args)*get_id2_(args) + kCanarySize, args);
}
TestEnd();
}
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
index ea35dbe2..4a93b29d 100644
--- a/test/routines/levelx/xomatcopy.hpp
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -45,7 +45,9 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host)
const auto b_two = (b_rotated) ? id1 : id2;
const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
- buffers_host.b_mat[b_index] = args.alpha * buffers_host.a_mat[a_index];
+ auto a_value = buffers_host.a_mat[a_index];
+ if (args.a_transpose == Transpose::kConjugate) { a_value = ComplexConjugate(a_value); }
+ buffers_host.b_mat[b_index] = args.alpha * a_value;
}
}
return StatusCode::kSuccess;
diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp
index 59ec949d..c43200b9 100644
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@@ -31,6 +31,16 @@ template <> bool IsCloseToZero(const double2 value) { return IsCloseToZero(value
// =================================================================================================
+// Returns the complex conjugate of 'value' for complex types; all other types are returned as-is
+template <typename T> T ComplexConjugate(const T value) { return value; } // generic version: no-op
+template half ComplexConjugate(const half); // explicit instantiations of the generic version
+template float ComplexConjugate(const float);
+template double ComplexConjugate(const double);
+template <> float2 ComplexConjugate(const float2 value) { return float2{value.real(), -value.imag()}; } // keeps the real part, negates the imaginary part
+template <> double2 ComplexConjugate(const double2 value) { return double2{value.real(), -value.imag()}; } // same as float2, for double2
+
+// =================================================================================================
+
template <typename T, typename U>
void DeviceToHost(const Arguments<U> &args, Buffers<T> &buffers, BuffersHost<T> &buffers_host,
Queue &queue, const std::vector<std::string> &names) {
diff --git a/test/test_utilities.hpp b/test/test_utilities.hpp
index 42660bdb..7bf5e65f 100644
--- a/test/test_utilities.hpp
+++ b/test/test_utilities.hpp
@@ -70,6 +70,10 @@ struct BuffersHost {
// =================================================================================================
+template <typename T> T ComplexConjugate(const T value);
+
+// =================================================================================================
+
// Converts a value (e.g. an integer) to a string. This also covers special cases for CLBlast
// data-types such as the Layout and Transpose data-types.
template <typename T>