diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-05 22:18:59 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-02-05 22:18:59 +0100 |
commit | c248f900c036e1d1644e2cc744c45c94f61c5835 (patch) | |
tree | 9667cb393e82e8ed964ecc2ed5ae6296becf8511 /src/routines/level3 | |
parent | e7cbb5915aef16f3a64566292459eaede5a600e5 (diff) | |
parent | fd471e380c54d5496ca1e2b7304408d27a9c7649 (diff) |
Merge branch 'development' into triangular_solvers
Diffstat (limited to 'src/routines/level3')
-rw-r--r-- | src/routines/level3/xgemm.cpp | 18 | ||||
-rw-r--r-- | src/routines/level3/xhemm.cpp | 3 | ||||
-rw-r--r-- | src/routines/level3/xhemm.hpp | 1 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cpp | 17 | ||||
-rw-r--r-- | src/routines/level3/xherk.cpp | 13 | ||||
-rw-r--r-- | src/routines/level3/xsymm.cpp | 15 | ||||
-rw-r--r-- | src/routines/level3/xsymm.hpp | 1 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cpp | 13 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cpp | 11 | ||||
-rw-r--r-- | src/routines/level3/xtrmm.cpp | 3 | ||||
-rw-r--r-- | src/routines/level3/xtrmm.hpp | 1 | ||||
-rw-r--r-- | src/routines/level3/xtrsm.cpp | 3 | ||||
-rw-r--r-- | src/routines/level3/xtrsm.hpp | 2 |
13 files changed, 41 insertions, 60 deletions
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 0015b629..7bd388c1 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -150,9 +150,6 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && a_do_transpose == false && a_conjugate == false; @@ -178,7 +175,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, a_one_i, a_two_i, a_one_i, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_do_transpose, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -189,7 +186,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, b_one_i, b_two_i, b_one_i, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, b_do_transpose, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -200,13 +197,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, c_one_i, c_two_i, c_one_i, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_do_transpose, false); eventWaitList.push_back(eventProcessC); } // Retrieves the Xgemm kernel from the compiled binary - auto kernel = Kernel(program, "Xgemm"); + auto kernel = Kernel(program_, "Xgemm"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_ceiled)); @@ -236,7 +233,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k, PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, c_one_i, c_two_i, c_one_i, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_do_transpose, false); } } @@ -255,13 +252,10 @@ void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k, const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, const bool a_conjugate, const bool b_conjugate) { - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Retrieves the proper XgemmDirect kernel from the compiled binary const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") : (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN"); - auto kernel = Kernel(program, name); + auto kernel = Kernel(program_, name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m)); diff --git a/src/routines/level3/xhemm.cpp b/src/routines/level3/xhemm.cpp index e5b1502a..8629f3de 100644 --- a/src/routines/level3/xhemm.cpp +++ b/src/routines/level3/xhemm.cpp @@ -58,8 +58,7 @@ void Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the hermitian-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xhemm.hpp b/src/routines/level3/xhemm.hpp index 2385706e..7c011915 100644 --- a/src/routines/level3/xhemm.hpp +++ b/src/routines/level3/xhemm.hpp @@ -30,6 +30,7 @@ class Xhemm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index ee3bb8b8..2aed2781 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -81,9 +81,6 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false && ab_conjugate == false; @@ -116,7 +113,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessA1); } @@ -125,7 +122,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessA2); } @@ -134,7 +131,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, ab_conjugate); eventWaitList.push_back(eventProcessB1); } @@ -143,7 +140,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, !ab_conjugate); eventWaitList.push_back(eventProcessB2); } @@ -154,12 +151,12 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -201,7 +198,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index ae8e9324..d982859e 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -79,9 +79,6 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false && a_conjugate == false; @@ -109,7 +106,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, a_conjugate); eventWaitList.push_back(eventProcessA); } @@ -118,7 +115,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, b_conjugate); eventWaitList.push_back(eventProcessB); } @@ -129,12 +126,12 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -163,7 +160,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, true); } diff --git a/src/routines/level3/xsymm.cpp b/src/routines/level3/xsymm.cpp index d7f771d1..969edfc8 100644 --- a/src/routines/level3/xsymm.cpp +++ b/src/routines/level3/xsymm.cpp @@ -30,12 +30,12 @@ Xsymm<T>::Xsymm(Queue &queue, EventPointer event, const std::string &name): // The main routine template <typename T> void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, - const size_t m, const size_t n, - const T alpha, - const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, - const T beta, - const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { + const size_t m, const size_t n, + const T alpha, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { throw BLASError(StatusCode::kInvalidDimension); } @@ -58,8 +58,7 @@ void Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the symmetric-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xsymm.hpp b/src/routines/level3/xsymm.hpp index ee965364..7a584560 100644 --- a/src/routines/level3/xsymm.hpp +++ b/src/routines/level3/xsymm.hpp @@ -32,6 +32,7 @@ class Xsymm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index cb0e0461..fdef43dc 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -77,9 +77,6 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && ab_rotated == false; @@ -103,7 +100,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -112,7 +109,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, ab_rotated, false); eventWaitList.push_back(eventProcessB); } @@ -123,12 +120,12 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -168,7 +165,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index bd6c4b25..9588c28c 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -74,9 +74,6 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp // Decides which kernel to run: the upper-triangular or lower-triangular version auto kernel_name = (triangle == Triangle::kUpper) ? "XgemmUpper" : "XgemmLower"; - // Loads the program from the database - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && a_rotated == false; @@ -97,7 +94,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, a_rotated, false); eventWaitList.push_back(eventProcessA); } @@ -108,12 +105,12 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, true, c_rotated, false); eventWaitList.push_back(eventProcessC); // Retrieves the XgemmUpper or XgemmLower kernel from the compiled binary - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); @@ -142,7 +139,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, - ConstantOne<T>(), program, + ConstantOne<T>(), program_, false, c_rotated, false, upper, lower, false); } diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index ed810e72..02c295ac 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -70,8 +70,7 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); - auto kernel = Kernel(program, kernel_name); + auto kernel = Kernel(program_, kernel_name); // Sets the arguments for the triangular-to-squared kernel kernel.SetArgument(0, static_cast<int>(k)); diff --git a/src/routines/level3/xtrmm.hpp b/src/routines/level3/xtrmm.hpp index 967bf132..e77b7214 100644 --- a/src/routines/level3/xtrmm.hpp +++ b/src/routines/level3/xtrmm.hpp @@ -31,6 +31,7 @@ class Xtrmm: public Xgemm<T> { using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; + using Xgemm<T>::program_; using Xgemm<T>::db_; using Xgemm<T>::DoGemm; diff --git a/src/routines/level3/xtrsm.cpp b/src/routines/level3/xtrsm.cpp index 8061b508..3a910261 100644 --- a/src/routines/level3/xtrsm.cpp +++ b/src/routines/level3/xtrsm.cpp @@ -79,9 +79,8 @@ void Xtrsm<T>::DoTrsm(const Layout layout, const Side side, const Triangle trian // Fills the output buffer with zeros auto eventWaitList = std::vector<Event>(); - const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), "TRSM"); auto fill_matrix_event = Event(); - FillMatrix(queue_, device_, program, db_, fill_matrix_event.pointer(), eventWaitList, + FillMatrix(queue_, device_, program_, db_, fill_matrix_event.pointer(), eventWaitList, x_one, x_ld, x_offset, x_buffer, ConstantZero<T>()); fill_matrix_event.WaitForCompletion(); diff --git a/src/routines/level3/xtrsm.hpp b/src/routines/level3/xtrsm.hpp index 288e9d11..b9d5432a 100644 --- a/src/routines/level3/xtrsm.hpp +++ b/src/routines/level3/xtrsm.hpp @@ -26,11 +26,11 @@ class Xtrsm: public Xgemm<T> { public: // Uses methods and variables the Xgemm routine - using Xgemm<T>::routine_name_; using Xgemm<T>::queue_; using Xgemm<T>::context_; using Xgemm<T>::device_; using Xgemm<T>::db_; + using Xgemm<T>::program_; using Xgemm<T>::DoGemm; // Constructor |