diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-05 22:18:59 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-05 22:18:59 +0100 |
commit | c248f900c036e1d1644e2cc744c45c94f61c5835 (patch) | |
tree | 9667cb393e82e8ed964ecc2ed5ae6296becf8511 /src/routines | |
parent | e7cbb5915aef16f3a64566292459eaede5a600e5 (diff) | |
parent | fd471e380c54d5496ca1e2b7304408d27a9c7649 (diff) |
Merge branch 'development' into triangular_solvers
Diffstat (limited to 'src/routines')
30 files changed, 67 insertions, 110 deletions
diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 8046c0be..bdea0086 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -19,8 +19,8 @@ #include <string> #include <vector> -#include "clblast.h" #include "clpp11.hpp" +#include "clblast.h" #include "database/database.hpp" namespace clblast { diff --git a/src/routines/level1/xamax.cpp b/src/routines/level1/xamax.cpp index e9efa1a7..40a66517 100644 --- a/src/routines/level1/xamax.cpp +++ b/src/routines/level1/xamax.cpp @@ -43,9 +43,8 @@ void Xamax<T>::DoAmax(const size_t n, TestVectorIndex(1, imax_buffer, imax_offset); // Retrieves the Xamax kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xamax"); - auto kernel2 = Kernel(program, "XamaxEpilogue"); + auto kernel1 = Kernel(program_, "Xamax"); + auto kernel2 = Kernel(program_, "XamaxEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xasum.cpp b/src/routines/level1/xasum.cpp index a242a5fa..b93b271c 100644 --- a/src/routines/level1/xasum.cpp +++ b/src/routines/level1/xasum.cpp @@ -43,9 +43,8 @@ void Xasum<T>::DoAsum(const size_t n, TestVectorScalar(1, asum_buffer, asum_offset); // Retrieves the Xasum kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xasum"); - auto kernel2 = Kernel(program, "XasumEpilogue"); + auto kernel1 = Kernel(program_, "Xasum"); + auto kernel2 = Kernel(program_, "XasumEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5436c5b7..39f61ef4 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -52,8 +52,7 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy"; // Retrieves the Xaxpy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xcopy.cpp b/src/routines/level1/xcopy.cpp index d86200c0..62889764 100644 --- a/src/routines/level1/xcopy.cpp +++ b/src/routines/level1/xcopy.cpp @@ -52,8 +52,7 @@ void Xcopy<T>::DoCopy(const size_t n, auto kernel_name = (use_fast_kernel) ? "XcopyFast" : "Xcopy"; // Retrieves the Xcopy kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xdot.cpp b/src/routines/level1/xdot.cpp index 9d718913..9f9c0590 100644 --- a/src/routines/level1/xdot.cpp +++ b/src/routines/level1/xdot.cpp @@ -46,9 +46,8 @@ void Xdot<T>::DoDot(const size_t n, TestVectorScalar(1, dot_buffer, dot_offset); // Retrieves the Xdot kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xdot"); - auto kernel2 = Kernel(program, "XdotEpilogue"); + auto kernel1 = Kernel(program_, "Xdot"); + auto kernel2 = Kernel(program_, "XdotEpilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xnrm2.cpp b/src/routines/level1/xnrm2.cpp index 373820a4..aa341aff 100644 --- a/src/routines/level1/xnrm2.cpp +++ b/src/routines/level1/xnrm2.cpp @@ -43,9 +43,8 @@ void Xnrm2<T>::DoNrm2(const size_t n, TestVectorScalar(1, nrm2_buffer, nrm2_offset); // Retrieves the Xnrm2 kernels from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel1 = Kernel(program, "Xnrm2"); - auto kernel2 = Kernel(program, "Xnrm2Epilogue"); + auto kernel1 = Kernel(program_, "Xnrm2"); + auto kernel2 = Kernel(program_, "Xnrm2Epilogue"); // Creates the buffer for intermediate values auto temp_size = 2*db_["WGS2"]; diff --git a/src/routines/level1/xscal.cpp b/src/routines/level1/xscal.cpp index 0521b1e5..9bc096e5 100644 --- a/src/routines/level1/xscal.cpp +++ b/src/routines/level1/xscal.cpp @@ -49,8 +49,7 @@ void Xscal<T>::DoScal(const size_t n, const T alpha, auto kernel_name = (use_fast_kernel) ? "XscalFast" : "Xscal"; // Retrieves the Xscal kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level1/xswap.cpp b/src/routines/level1/xswap.cpp index c9b97dc9..f046575f 100644 --- a/src/routines/level1/xswap.cpp +++ b/src/routines/level1/xswap.cpp @@ -52,8 +52,7 @@ void Xswap<T>::DoSwap(const size_t n, auto kernel_name = (use_fast_kernel) ? "XswapFast" : "Xswap"; // Retrieves the Xswap kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments if (use_fast_kernel) { diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 52e66de6..7d2e5f60 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -123,8 +123,7 @@ void Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, } // Retrieves the Xgemv kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_real)); diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index d16ebd11..9ec156a1 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -53,8 +53,7 @@ void Xger<T>::DoGer(const Layout layout, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xger"); + auto kernel = Kernel(program_, "Xger"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(a_one)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index 6c334e63..ba12a3ef 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -67,8 +67,7 @@ void Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, const auto matching_alpha = GetAlpha(alpha); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher"); + auto kernel = Kernel(program_, "Xher"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 11e2c871..a420e693 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -54,8 +54,7 @@ void Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, TestVectorY(n, y_buffer, y_offset, y_inc); // Retrieves the kernel from the compiled binary - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, "Xher2"); + auto kernel = Kernel(program_, "Xher2"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); diff --git a/src/routines/level2/xtrsv.cpp b/src/routines/level2/xtrsv.cpp index b0e4c5ae..d5d009ff 100644 --- a/src/routines/level2/xtrsv.cpp +++ b/src/routines/level2/xtrsv.cpp @@ -37,9 +37,6 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, if (n > db_["TRSV_BLOCK_SIZE"]) { throw BLASError(StatusCode::kUnexpectedError); }; - // Retrieves the program from the cache - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV"); - // Translates CLBlast arguments to 0/1 integers for the OpenCL kernel const auto is_unit_diagonal = (diagonal == Diagonal::kNonUnit) ? 0 : 1; const auto is_transposed = ((a_transpose == Transpose::kNo && layout == Layout::kColMajor) || @@ -52,7 +49,7 @@ void Xtrsv<T>::Substitution(const Layout layout, const Triangle triangle, // Retrieves the kernel from the compiled binary const auto kernel_name = (is_upper) ? "trsv_backward" : "trsv_forward"; - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); @@ -94,9 +91,6 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, TestMatrixA(n, n, a_buffer, a_offset, a_ld); TestVectorX(n, b_buffer, b_offset, b_inc); - // Retrieves the program from the cache - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSV"); - // Creates a copy of B to avoid overwriting input while computing output // TODO: Make x with 0 offset and unit increment by creating custom copy-to and copy-from kernels const auto x_offset = b_offset; @@ -108,7 +102,7 @@ void Xtrsv<T>::DoTrsv(const Layout layout, const Triangle triangle, // Fills the output buffer with zeros auto eventWaitList = std::vector<Event>(); auto fill_vector_event = Event(); - FillVector(queue_, device_, program, db_, fill_vector_event.pointer(), eventWaitList, + FillVector(queue_, device_, program_, db_, fill_vector_event.pointer(), eventWaitList, n, x_inc, x_offset, x_buffer, ConstantZero<T>()); fill_vector_event.WaitForCompletion(); diff --git a/src/routines/level2/xtrsv.hpp b/src/routines/level2/xtrsv.hpp index dc3f32f0..67e626a1 100644 --- a/src/routines/level2/xtrsv.hpp +++ b/src/routines/level2/xtrsv.hpp @@ -27,11 +27,11 @@ class Xtrsv: public Xgemv<T> { public: // Uses the generic matrix-vector routine - using Xgemv<T>::routine_name_; using Xgemv<T>::queue_; using Xgemv<T>::context_; using Xgemv<T>::device_; using Xgemv<T>::db_; + using Xgemv<T>::program_; using Xgemv<T>::DoGemv; // Constructor diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 0015b629..7bd388c1 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -150,9 +150,6 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && a_do_transpose == false && a_conjugate == false; @@ -178,7 +175,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_do_transpose, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -189,7 +186,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, b_do_transpose, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -200,13 +197,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, c_one_i, c_two_i, c_one_i, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_do_transpose, false); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary - auto kernel = Kernel(program, "Xgemm"); + auto kernel = Kernel(program_, "Xgemm"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_ceiled)); @@ -236,7 +233,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_do_transpose, false); } } @@ -255,13 +252,10 @@ void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate) { - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN"); - auto kernel = Kernel(program, name); + auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m)); diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp index e5b1502a..8629f3de 100644 --- a/src/routines/level3/xhemm.cpp +++ b/src/routines/level3/xhemm.cpp @@ -58,8 +58,7 @@ void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the hermitian-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp index 2385706e..7c011915 100644 --- a/src/routines/level3/xhemm.hpp +++ b/src/routines/level3/xhemm.hpp @@ -30,6 +30,7 @@ class Xhemm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index ee3bb8b8..2aed2781 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -81,9 +81,6 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false && ab_conjugate == false; @@ -116,7 +113,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessA1); } @@ -125,7 +122,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessA2); } @@ -134,7 +131,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessB1); } @@ -143,7 +140,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessB2); } @@ -154,12 +151,12 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -201,7 +198,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index ae8e9324..d982859e 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -79,9 +79,6 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false && a_conjugate == false; @@ -109,7 +106,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -118,7 +115,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -129,12 +126,12 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -163,7 +160,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp index d7f771d1..969edfc8 100644 --- a/src/routines/level3/xsymm.cpp +++ b/src/routines/level3/xsymm.cpp @@ -30,12 +30,12 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } @@ -58,8 +58,7 @@ void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the symmetric-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp index ee965364..7a584560 100644 --- a/src/routines/level3/xsymm.hpp +++ b/src/routines/level3/xsymm.hpp @@ -32,6 +32,7 @@ class Xsymm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index cb0e0461..fdef43dc 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -77,9 +77,6 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false; @@ -103,7 +100,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -112,7 +109,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessB); } @@ -123,12 +120,12 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -168,7 +165,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index bd6c4b25..9588c28c 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -74,9 +74,6 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false; @@ -97,7 +94,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -108,12 +105,12 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -142,7 +139,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index ed810e72..02c295ac 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -70,8 +70,7 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the triangular-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp index 967bf132..e77b7214 100644 --- a/src/routines/level3/xtrmm.hpp +++ b/src/routines/level3/xtrmm.hpp @@ -31,6 +31,7 @@ class Xtrmm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp index 8061b508..3a910261 100644 --- a/src/routines/level3/xtrsm.cpp +++ b/src/routines/level3/xtrsm.cpp @@ -79,9 +79,8 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian // Fills the output buffer with zeros auto eventWaitList = std::vector<Event>(); - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSM"); auto fill_matrix_event = Event(); - FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), eventWaitList, + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, x_one, x_ld, x_offset, x_buffer, ConstantZero<T>()); fill_matrix_event.WaitForCompletion(); diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp index 288e9d11..b9d5432a 100644 --- a/src/routines/level3/xtrsm.hpp +++ b/src/routines/level3/xtrsm.hpp @@ -26,11 +26,11 @@ class Xtrsm: public Xgemm<T> { public: // Uses methods and variables the Xgemm routine - using Xgemm<T>::routine_name_; using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; using Xgemm<T>::db_; + using Xgemm<T>::program_; using Xgemm<T>::DoGemm; // Constructor diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp index ffee9b7c..696e694a 100644 --- a/src/routines/levelx/xinvert.cpp +++ b/src/routines/levelx/xinvert.cpp @@ -69,18 +69,15 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle (triangle == Triangle::kLower && layout == Layout::kRowMajor)); const auto name_postfix = (is_upper) ? "Upper" : "Lower"; - // Retrieves the program from the cache - auto event_wait_list = std::vector<Event>(); - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "INVERT"); - // Fills the output buffer with zeros + auto event_wait_list = std::vector<Event>(); auto fill_matrix_event = Event(); - FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), event_wait_list, + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), event_wait_list, num_blocks * block_size, block_size, 0, dest, ConstantZero<T>()); event_wait_list.push_back(fill_matrix_event); // Inverts the diagonal IB by IB inner blocks of the matrix: one block per work-group - auto kernel = Kernel(program, "InvertDiagonalBlock"); + auto kernel = Kernel(program_, "InvertDiagonalBlock"); kernel.SetArgument(0, static_cast<int>(n)); kernel.SetArgument(1, src()); kernel.SetArgument(2, static_cast<int>(offset)); @@ -110,7 +107,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle const auto global = std::vector<size_t>{(current_size/local[1]), npages*(current_size/16)*local[1]}; // Part 1 - auto kernel1 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); + auto kernel1 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part1" + name_postfix); kernel1.SetArgument(0, static_cast<int>(n)); kernel1.SetArgument(1, src()); kernel1.SetArgument(2, static_cast<int>(offset)); @@ -125,7 +122,7 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle // Part 2 const bool is_last_kernel = (current_size * 2 >= block_size); - auto kernel2 = Kernel(program, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); + auto kernel2 = Kernel(program_, "TripleMatMul" + ToString(current_size) + "Part2" + name_postfix); kernel2.SetArgument(0, static_cast<int>(n)); kernel2.SetArgument(1, dest()); kernel2.SetArgument(2, static_cast<int>(current_size)); diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index 875ca7d2..4ae8c056 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -65,14 +65,11 @@ void Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose, TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld); - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto emptyEventList = std::vector<Event>(); PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, - alpha, program, false, transpose, conjugate); + alpha, program_, false, transpose, conjugate); } // ================================================================================================= |