From 5bcd92f2974d94e8add31816d3b9d48a42289500 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sat, 26 Nov 2016 20:53:42 +0300 Subject: Routine, Cache: generalize, reduce amount of copying in fast path Implement a generalized Cache. Two variants are provided: the first one is based on std::map, using C++14-specific transparent std::less<> and generalized std::map::find() to allow searching by tuple of references. The second one is based on std::vector and O(n) lookup, but remains C++11-compliant. --- src/routines/level3/xsyrk.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'src/routines/level3/xsyrk.cpp') diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index bd6c4b25..9588c28c 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -74,9 +74,6 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false; @@ -97,7 +94,7 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne(), program, + ConstantOne(), program_, true, a_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -108,12 +105,12 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne(), program, + ConstantOne(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); @@ -142,7 +139,7 @@ void Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne(), program, + ConstantOne(), program_, false, c_rotated, false, upper, lower, false); } -- cgit v1.2.3