From 5578d5ab282d63ad47a767dcbebb94b83195230d Mon Sep 17 00:00:00 2001 From: CNugteren Date: Wed, 8 Jul 2015 07:25:18 +0200 Subject: Added option to set the imaginary part of the diagonal to zero --- src/routine.cc | 4 +++- src/routines/xgemm.cc | 8 ++++---- src/routines/xsyr2k.cc | 11 ++++------- src/routines/xsyrk.cc | 11 ++++------- 4 files changed, 15 insertions(+), 19 deletions(-) (limited to 'src') diff --git a/src/routine.cc b/src/routine.cc index 4b7ece41..d11edb0f 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -211,12 +211,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr const Buffer &dest, const bool do_transpose, const bool do_conjugate, const bool pad, const bool upper, const bool lower, + const bool diagonal_imag_zero, const Program &program) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) && - (upper == false) && (lower == false); + (upper == false) && (lower == false) && (diagonal_imag_zero == false); // Determines the right kernel auto kernel_name = std::string{}; @@ -272,6 +273,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr else { kernel.SetArgument(10, static_cast(upper)); kernel.SetArgument(11, static_cast(lower)); + kernel.SetArgument(12, static_cast(diagonal_imag_zero)); } } diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc index 651ebb55..c8674282 100644 --- a/src/routines/xgemm.cc +++ b/src/routines/xgemm.cc @@ -108,18 +108,18 @@ StatusCode Xgemm::DoGemm(const Layout layout, // them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, temp_a, - a_do_transpose, a_conjugate, true, false, false, program); + a_do_transpose, a_conjugate, true, false, false, false, program); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - b_do_transpose, b_conjugate, true, false, false, program); + b_do_transpose, b_conjugate, true, false, false, false, program); if (ErrorIn(status)) { return status; } // Only necessary for matrix C if it used both as input and output if (beta != static_cast(0)) { status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_do_transpose, false, true, false, false, program); + c_do_transpose, false, true, false, false, false, program); if (ErrorIn(status)) { return status; } } @@ -151,7 +151,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Runs the post-processing kernel status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, c_one, c_two, c_ld, c_offset, c_buffer, - c_do_transpose, false, false, false, false, program); + c_do_transpose, false, false, false, false, false, program); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/xsyr2k.cc b/src/routines/xsyr2k.cc index a7aa6945..abb8b7eb 100644 --- a/src/routines/xsyr2k.cc +++ b/src/routines/xsyr2k.cc @@ -54,9 +54,6 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo); auto c_rotated = (layout == Layout::kRowMajor); - // In case of complex data-types, the transpose can also become a conjugate transpose - auto ab_conjugate = (ab_transpose == Transpose::kConjugate); - // Computes the first and second dimensions of the A and B matrices taking the layout into account auto ab_one = (ab_rotated) ? k : n; auto ab_two = (ab_rotated) ? n : k; @@ -95,18 +92,18 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // fill them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - ab_rotated, ab_conjugate, true, false, false, program); + ab_rotated, false, true, false, false, false, program); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - ab_rotated, ab_conjugate, true, false, false, program); + ab_rotated, false, true, false, false, false, program); if (ErrorIn(status)) { return status; } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, program); + c_rotated, false, true, false, false, false, program); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -148,7 +145,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons auto lower = (triangle == Triangle::kLower); status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, program); + c_rotated, false, false, upper, lower, false, program); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/xsyrk.cc b/src/routines/xsyrk.cc index d8c150fd..3efa0598 100644 --- a/src/routines/xsyrk.cc +++ b/src/routines/xsyrk.cc @@ -53,9 +53,6 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); auto c_rotated = (layout == Layout::kRowMajor); - // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); - // Computes the first and second dimensions of the A matrix taking the layout into account auto a_one = (a_rotated) ? k : n; auto a_two = (a_rotated) ? n : k; @@ -87,17 +84,17 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto& program = GetProgramFromCache(); // Runs the pre-processing kernel. This transposes the matrix A, but also pads zeros to - // fill them up until they reach a certain multiple of size (kernel parameter dependent). + // fill it up until it reaches a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - a_rotated, a_conjugate, true, false, false, program); + a_rotated, false, true, false, false, false, program); if (ErrorIn(status)) { return status; } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, program); + c_rotated, false, true, false, false, false, program); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -129,7 +126,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto lower = (triangle == Triangle::kLower); status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, program); + c_rotated, false, false, upper, lower, false, program); if (ErrorIn(status)) { return status; } // Successfully finished the computation -- cgit v1.2.3