From 798d32edad091b6faaa1627a7514868fc28c5fd9 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 17 Jul 2016 14:36:51 +0200 Subject: Improved the GEMM direct kernel by adding register blocking. Still not fast though --- src/routines/level3/xgemm.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'src/routines') diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 4bdf3192..d6ba2c32 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -284,10 +284,13 @@ StatusCode Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, kernel.SetArgument(18, static_cast<int>(b_conjugate)); // Computes the global and local thread sizes - const auto m_ceiled = Ceil(m, 16); - const auto n_ceiled = Ceil(n, 16); - const auto global = std::vector<size_t>{m_ceiled, n_ceiled}; - const auto local = std::vector<size_t>{16, 16}; + const auto m_ceiled = Ceil(m, db_["MWG"]); + const auto n_ceiled = Ceil(n, db_["NWG"]); + const auto global = std::vector<size_t>{ + (m_ceiled * db_["MDIMC"]) / db_["MWG"], + (n_ceiled * db_["NDIMC"]) / db_["NWG"] + }; + const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel auto status = RunKernel(kernel, queue_, device_, global, local, event_); -- cgit v1.2.3