summary | refs | log | tree | commit | diff
path: root/src/routines
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2016-07-17 14:36:51 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2016-07-17 14:36:51 +0200
commit 798d32edad091b6faaa1627a7514868fc28c5fd9 (patch)
tree 0de0a50532b354a82f083af7694df30d40e362e4 /src/routines
parent eaa348735ee5cee396f9ec629f1486ebb3dbeff7 (diff)
Improved the GEMM direct kernel by adding register blocking. Still not fast though
Diffstat (limited to 'src/routines')
-rw-r--r-- src/routines/level3/xgemm.cpp 11
1 file changed, 7 insertions, 4 deletions
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 4bdf3192..d6ba2c32 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -284,10 +284,13 @@ StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
kernel.SetArgument(18, static_cast<int>(b_conjugate));
// Computes the global and local thread sizes
- const auto m_ceiled = Ceil(m, 16);
- const auto n_ceiled = Ceil(n, 16);
- const auto global = std::vector<size_t>{m_ceiled, n_ceiled};
- const auto local = std::vector<size_t>{16, 16};
+ const auto m_ceiled = Ceil(m, db_["MWG"]);
+ const auto n_ceiled = Ceil(n, db_["NWG"]);
+ const auto global = std::vector<size_t>{
+ (m_ceiled * db_["MDIMC"]) / db_["MWG"],
+ (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+ };
+ const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
// Launches the kernel
auto status = RunKernel(kernel, queue_, device_, global, local, event_);