diff options
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part3.opencl | 2 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xherk.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cpp | 2 |
6 files changed, 6 insertions, 5 deletions
@@ -3,6 +3,7 @@ Development version (next release) - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header - Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation - Fixed a bug in the tests and samples related to waiting for an invalid event +- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters - Added support for compilation under Visual Studio 2013 (MSVC++ 12.0) - Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS - Added an option to run tuned kernels multiple times to average execution times diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index a5faef5a..3b26e943 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK, const real beta = GetRealArg(arg_beta); // Skip these threads if they do not contain threads contributing to the lower-triangle - if (GetGroupID1()*NWG > GetGroupID0()*MWG) { + if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) { return; } diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index ba770065..bf328729 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -79,7 +79,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co if (ErrorIn(status)) { return status; } // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 3063f3bc..77422526 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -76,7 +76,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons if (ErrorIn(status)) { return status; } // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 158cd9e5..badf3100 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -75,7 +75,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons if (ErrorIn(status)) { return status; } // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index e1a72ef6..438aa218 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -71,7 +71,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const if (ErrorIn(status)) { return status; } // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version |