diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2018-11-30 20:23:26 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2018-11-30 20:23:26 +0100 |
commit | c0e41b87cb772d43ab8bf35d650d7a98037f155d (patch) | |
tree | 3140ec86cefbfe0ade6cb5a3fc02afebe86780d4 | |
parent | bca1506e870ccc256e7dcf8aaf19f900652e91ba (diff) |
Fixed an issue for unequal MWG and NWG and the new GEMMK == 1 kernel
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cpp | 6 |
2 files changed, 5 insertions, 2 deletions
@@ -7,6 +7,7 @@ Development (next version)
 - Fixed an issue with conjugate transpose not being executed in certain cases for a.o. XOMATCOPY
 - Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
 - Fixed an issue with the preprocessor and the new GEMMK == 1 kernel
+- Fixed an issue for unequal MWG and NWG and the new GEMMK == 1 kernel
 - Fixed an issue for certain parameters for AXPY's 'XaxpyFaster' kernel
 - Various minor fixes and enhancements
 - Added non-BLAS routines:
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index cb24460a..6daa0fcf 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -216,9 +216,11 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
 kernel.SetArgument(9, static_cast<int>(c_temp_offset / db_["VWM"]));
 // Computes the global and local thread sizes
+ const auto global_divider_one = c_want_rotated_(db_["GEMMK"]) ? db_["NWG"] : db_["MWG"];
+ const auto global_divider_two = c_want_rotated_(db_["GEMMK"]) ? db_["MWG"] : db_["NWG"];
 const auto global = std::vector<size_t>{
- (c_one_i * db_["MDIMC"]) / db_["MWG"],
- (c_two_i * db_["NDIMC"]) / db_["NWG"]
+ (c_one_i * db_["MDIMC"]) / global_divider_one,
+ (c_two_i * db_["NDIMC"]) / global_divider_two
 };
 const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
|