diff options
-rw-r--r-- | include/internal/routine.h | 9 | ||||
-rw-r--r-- | src/routine.cc | 14 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cc | 8 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cc | 12 | ||||
-rw-r--r-- | src/routines/level3/xherk.cc | 8 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cc | 8 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cc | 6 |
7 files changed, 33 insertions, 32 deletions
diff --git a/include/internal/routine.h b/include/internal/routine.h index acc9a9c8..49a36c10 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -84,17 +84,18 @@ class Routine { StatusCode TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc, const size_t data_size); - // Copies/transposes a matrix and padds/unpads it + // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write + // to symmetric and triangular matrices through optional arguments. StatusCode PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer &dest, + const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, - const bool pad, const bool upper, const bool lower, - const bool diagonal_imag_zero, - const Program &program); + const bool upper = false, const bool lower = false, + const bool diagonal_imag_zero = false); // Queries the cache and retrieve either a matching program or a boolean whether a match exists. // The first assumes that the program is available in the cache and will throw an exception diff --git a/src/routine.cc b/src/routine.cc index d11edb0f..339027d4 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -202,17 +202,17 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size // ================================================================================================= -// Copies a matrix and pads it with zeros +// Copies or transposes a matrix and pads/unpads it with zeros StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer &src, const size_t dest_one, const size_t dest_two, const size_t dest_ld, const size_t dest_offset, const Buffer &dest, + const Program &program, const bool do_pad, const bool do_transpose, const bool do_conjugate, - const bool pad, const bool upper, const bool lower, - const bool diagonal_imag_zero, - const Program &program) { + const bool upper, const bool lower, + const bool diagonal_imag_zero) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && @@ -230,7 +230,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr } else { use_fast_kernel = false; - kernel_name = (pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix"; + kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix"; } } else { @@ -242,7 +242,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr } else { use_fast_kernel = false; - kernel_name = (pad) ? "PadMatrix" : "UnPadMatrix"; + kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix"; } } @@ -267,7 +267,7 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); - if (pad) { + if (do_pad) { kernel.SetArgument(10, static_cast<int>(do_conjugate)); } else { diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index f4a9f737..7a854741 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, temp_a, - a_do_transpose, a_conjugate, true, false, false, false, program); + program, true, a_do_transpose, a_conjugate); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - b_do_transpose, b_conjugate, true, false, false, false, program); + program, true, b_do_transpose, b_conjugate); if (ErrorIn(status)) { return status; } // Only necessary for matrix C if it used both as input and output if (beta != static_cast<T>(0)) { status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, temp_c, - c_do_transpose, false, true, false, false, false, program); + program, true, c_do_transpose, false); if (ErrorIn(status)) { return status; } } @@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // Runs the post-processing kernel status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c, c_one, c_two, c_ld, c_offset, c_buffer, - c_do_transpose, false, false, false, false, false, program); + program, false, c_do_transpose, false); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 6d33a0e1..ec435d8e 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -96,25 +96,25 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // fill them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a1, - ab_rotated, ab_conjugate, true, false, false, false, program); + program, true, ab_rotated, ab_conjugate); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a2, - ab_rotated, !ab_conjugate, true, false, false, false, program); + program, true, ab_rotated, !ab_conjugate); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b1, - ab_rotated, ab_conjugate, true, false, false, false, program); + program, true, ab_rotated, ab_conjugate); status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b2, - ab_rotated, !ab_conjugate, true, false, false, false, program); + program, true, ab_rotated, !ab_conjugate); if (ErrorIn(status)) { return status; } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); + program, true, c_rotated, false); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -159,7 +159,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co auto lower = (triangle == Triangle::kLower); status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, true, program); + program, false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 8fae294f..8ad64162 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -92,18 +92,18 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // creates two copies: status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - a_rotated, a_conjugate, true, false, false, false, program); + program, true, a_rotated, a_conjugate); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - a_rotated, b_conjugate, true, false, false, false, program); + program, true, a_rotated, b_conjugate); if (ErrorIn(status)) { return status; } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); + program, true, c_rotated, false); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -137,7 +137,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons auto lower = (triangle == Triangle::kLower); status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, true, program); + program, false, c_rotated, false, upper, lower, true); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index d54f2fc1..651bc524 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -92,18 +92,18 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // fill them up until they reach a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - ab_rotated, false, true, false, false, false, program); + program, true, ab_rotated, false); if (ErrorIn(status)) { return status; } status = PadCopyTransposeMatrix(ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_b, - ab_rotated, false, true, false, false, false, program); + program, true, ab_rotated, false); if (ErrorIn(status)) { return status; } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); + program, true, c_rotated, false); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -145,7 +145,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons auto lower = (triangle == Triangle::kLower); status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, false, program); + program, false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } // Successfully finished the computation diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index bb952410..e10b7689 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -87,14 +87,14 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // fill it up until it reaches a certain multiple of size (kernel parameter dependent). status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, temp_a, - a_rotated, false, true, false, false, false, program); + program, true, a_rotated, false); if (ErrorIn(status)) { return status; } // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. status = PadCopyTransposeMatrix(n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, temp_c, - c_rotated, false, true, false, false, false, program); + program, true, c_rotated, false); if (ErrorIn(status)) { return status; } // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary @@ -126,7 +126,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const auto lower = (triangle == Triangle::kLower); status = PadCopyTransposeMatrix(n_ceiled, n_ceiled, n_ceiled, 0, temp_c, n, n, c_ld, c_offset, c_buffer, - c_rotated, false, false, upper, lower, false, program); + program, false, c_rotated, false, upper, lower, false); if (ErrorIn(status)) { return status; } // Successfully finished the computation |