diff options
author | Ivan Shapovalov <intelfx@intelfx.name> | 2016-11-26 20:53:42 +0300 |
---|---|---|
committer | Ivan Shapovalov <intelfx@intelfx.name> | 2017-01-24 11:56:15 +0300 |
commit | 5bcd92f2974d94e8add31816d3b9d48a42289500 (patch) | |
tree | 51b24e302a08d62058311ead32ab626ce4c11263 /src/routines/level3/xsyr2k.cpp | |
parent | e943fe77d64f42ed1e57c9919de8ca6787760f2b (diff) |
Routine, Cache: generalize, reduce amount of copying in fast path
Implement a generalized Cache<K, V>. Two variants are provided: the
first one is based on std::map, using the C++14-specific transparent
comparator std::less<> and the heterogeneous std::map::find() overload
to allow searching by a tuple of references. The second one is based on
std::vector with O(n) lookup, but remains C++11-compliant.
Diffstat (limited to 'src/routines/level3/xsyr2k.cpp')
-rw-r--r-- | src/routines/level3/xsyr2k.cpp | 13 |
1 file changed, 5 insertions, 8 deletions
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index cb0e0461..fdef43dc 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -77,9 +77,6 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false; @@ -103,7 +100,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -112,7 +109,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessB); } @@ -123,12 +120,12 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the 
compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -168,7 +165,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } |