diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-01-24 20:16:57 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-01-24 20:16:57 +0100 |
commit | 5e7d140d59fd53789934ec78d8589ff84ce1ef09 (patch) | |
tree | 55f356ce1dc2bf8c8c94a6442ef6ed7c6bc5c63d /src/routines | |
parent | e943fe77d64f42ed1e57c9919de8ca6787760f2b (diff) | |
parent | 5fb1da1a0f09d825e5ddfd1e9bd422a8d74f2c5f (diff) |
Merge pull request #132 from intelfx/cache
Refactor cache subsystem
Diffstat (limited to 'src/routines')
-rw-r--r-- | src/routines/level1/xamax.cpp | 5 | ||||
-rw-r--r-- | src/routines/level1/xasum.cpp | 5 | ||||
-rw-r--r-- | src/routines/level1/xaxpy.cpp | 3 | ||||
-rw-r--r-- | src/routines/level1/xcopy.cpp | 3 | ||||
-rw-r--r-- | src/routines/level1/xdot.cpp | 5 | ||||
-rw-r--r-- | src/routines/level1/xnrm2.cpp | 5 | ||||
-rw-r--r-- | src/routines/level1/xscal.cpp | 3 | ||||
-rw-r--r-- | src/routines/level1/xswap.cpp | 3 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cpp | 3 | ||||
-rw-r--r-- | src/routines/level2/xger.cpp | 3 | ||||
-rw-r--r-- | src/routines/level2/xher.cpp | 3 | ||||
-rw-r--r-- | src/routines/level2/xher2.cpp | 3 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cpp | 18 | ||||
-rw-r--r-- | src/routines/level3/xhemm.cpp | 3 | ||||
-rw-r--r-- | src/routines/level3/xhemm.hpp | 1 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cpp | 17 | ||||
-rw-r--r-- | src/routines/level3/xherk.cpp | 13 | ||||
-rw-r--r-- | src/routines/level3/xsymm.cpp | 15 | ||||
-rw-r--r-- | src/routines/level3/xsymm.hpp | 1 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cpp | 13 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cpp | 11 | ||||
-rw-r--r-- | src/routines/level3/xtrmm.cpp | 3 | ||||
-rw-r--r-- | src/routines/level3/xtrmm.hpp | 1 | ||||
-rw-r--r-- | src/routines/levelx/xomatcopy.cpp | 5 |
24 files changed, 56 insertions, 89 deletions
diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp index e9efa1a7..40a66517 100644 --- a/src/routines/level1/xamax.cpp +++ b/src/routines/level1/xamax.cpp @@ -43,9 +43,8 @@ void Xamax<T>::DoAmax(const size_t n, TestVectorIndex(1, imax_buffer, imax_offset); // Retrieves the Xamax kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); + auto kernel1 = Kernel(program_, "Xamax"); + auto kernel2 = Kernel(program_, "XamaxEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp index a242a5fa..b93b271c 100644 --- a/src/routines/level1/xasum.cpp +++ b/src/routines/level1/xasum.cpp @@ -43,9 +43,8 @@ void Xasum<T>::DoAsum(const size_t n, TestVectorScalar(1, asum_buffer, asum_offset); // Retrieves the Xasum kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, "XasumEpilogue"); + auto kernel1 = Kernel(program_, "Xasum"); + auto kernel2 = Kernel(program_, "XasumEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5436c5b7..39f61ef4 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -52,8 +52,7 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp index d86200c0..62889764 100644 --- a/src/routines/level1/xcopy.cpp +++ b/src/routines/level1/xcopy.cpp @@ -52,8 +52,7 @@ void Xcopy<T>::DoCopy(const size_t n, auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; // Retrieves the Xcopy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp index 9d718913..9f9c0590 100644 --- a/src/routines/level1/xdot.cpp +++ b/src/routines/level1/xdot.cpp @@ -46,9 +46,8 @@ void Xdot<T>::DoDot(const size_t n, TestVectorScalar(1, dot_buffer, dot_offset); // Retrieves the Xdot kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); + auto kernel1 = Kernel(program_, "Xdot"); + auto kernel2 = Kernel(program_, "XdotEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp index 373820a4..aa341aff 100644 --- a/src/routines/level1/xnrm2.cpp +++ b/src/routines/level1/xnrm2.cpp @@ -43,9 +43,8 @@ void Xnrm2<T>::DoNrm2(const size_t n, TestVectorScalar(1, nrm2_buffer, nrm2_offset); // Retrieves the Xnrm2 kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + auto kernel1 = Kernel(program_, "Xnrm2"); + auto kernel2 = Kernel(program_, "Xnrm2Epilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index 0521b1e5..9bc096e5 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -49,8 +49,7 @@ void Xscal<T>::DoScal(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal"; // Retrieves the Xscal kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp index c9b97dc9..f046575f 100644 --- a/src/routines/level1/xswap.cpp +++ b/src/routines/level1/xswap.cpp @@ -52,8 +52,7 @@ void Xswap<T>::DoSwap(const size_t n, auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; // Retrieves the Xswap kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 7b4c2e8f..9e9c2db4 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -122,8 +122,7 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, } // Retrieves the Xgemv kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_real)); diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index d16ebd11..9ec156a1 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -53,8 +53,7 @@ void Xger<T>::DoGer(const Layout layout, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xger"); + auto kernel = Kernel(program_, "Xger"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(a_one)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index 6c334e63..ba12a3ef 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -67,8 +67,7 @@ void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, const auto matching_alpha = GetAlpha(alpha); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher"); + auto kernel = Kernel(program_, "Xher"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 11e2c871..a420e693 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -54,8 +54,7 @@ void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher2"); + auto kernel = Kernel(program_, "Xher2"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 0015b629..7bd388c1 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -150,9 +150,6 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && a_do_transpose == false && a_conjugate == false; @@ -178,7 +175,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_do_transpose, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -189,7 +186,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, b_do_transpose, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -200,13 +197,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, c_one_i, c_two_i, c_one_i, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_do_transpose, false); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary - auto kernel = Kernel(program, "Xgemm"); + auto kernel = Kernel(program_, "Xgemm"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_ceiled)); @@ -236,7 +233,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_do_transpose, false); } } @@ -255,13 +252,10 @@ void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate) { - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN"); - auto kernel = Kernel(program, name); + auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m)); diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp index e5b1502a..8629f3de 100644 --- a/src/routines/level3/xhemm.cpp +++ b/src/routines/level3/xhemm.cpp @@ -58,8 +58,7 @@ void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the hermitian-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp index 2385706e..7c011915 100644 --- a/src/routines/level3/xhemm.hpp +++ b/src/routines/level3/xhemm.hpp @@ -30,6 +30,7 @@ class Xhemm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index ee3bb8b8..2aed2781 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -81,9 +81,6 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false && ab_conjugate == false; @@ -116,7 +113,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessA1); } @@ -125,7 +122,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessA2); } @@ -134,7 +131,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessB1); } @@ -143,7 +140,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessB2); } @@ -154,12 +151,12 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -201,7 +198,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index ae8e9324..d982859e 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -79,9 +79,6 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false && a_conjugate == false; @@ -109,7 +106,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -118,7 +115,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -129,12 +126,12 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -163,7 +160,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp index d7f771d1..969edfc8 100644 --- a/src/routines/level3/xsymm.cpp +++ b/src/routines/level3/xsymm.cpp @@ -30,12 +30,12 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } @@ -58,8 +58,7 @@ void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the symmetric-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp index ee965364..7a584560 100644 --- a/src/routines/level3/xsymm.hpp +++ b/src/routines/level3/xsymm.hpp @@ -32,6 +32,7 @@ class Xsymm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index cb0e0461..fdef43dc 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -77,9 +77,6 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false; @@ -103,7 +100,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -112,7 +109,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessB); } @@ -123,12 +120,12 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -168,7 +165,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index bd6c4b25..9588c28c 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -74,9 +74,6 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false; @@ -97,7 +94,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -108,12 +105,12 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -142,7 +139,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index ed810e72..02c295ac 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -70,8 +70,7 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the triangular-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp index 967bf132..e77b7214 100644 --- a/src/routines/level3/xtrmm.hpp +++ b/src/routines/level3/xtrmm.hpp @@ -31,6 +31,7 @@ class Xtrmm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index 875ca7d2..4ae8c056 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -65,14 +65,11 @@ void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto emptyEventList = std::vector<Event>(); PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, - alpha, program, false, transpose, conjugate); + alpha, program_, false, transpose, conjugate); } // ================================================================================================= |