diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.pc.in | 10 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part3.opencl | 4 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xherk.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cpp | 2 |
6 files changed, 16 insertions, 6 deletions
diff --git a/src/clblast.pc.in b/src/clblast.pc.in new file mode 100644 index 00000000..2538add8 --- /dev/null +++ b/src/clblast.pc.in @@ -0,0 +1,10 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +includedir=${prefix}/include +libdir=${exec_prefix}/lib + +Name: CLBlast +Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11 +Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@ +Libs: -L${libdir} -lclblast +Cflags: -I${includedir} diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index a5faef5a..8ac3a3a8 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -113,7 +113,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK, const real beta = GetRealArg(arg_beta); // Skip these threads if they do not contain threads contributing to the upper-triangle - if (GetGroupID1()*NWG < GetGroupID0()*MWG) { + if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) { return; } @@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK, const real beta = GetRealArg(arg_beta); // Skip these threads if they do not contain threads contributing to the lower-triangle - if (GetGroupID1()*NWG > GetGroupID0()*MWG) { + if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) { return; } diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index 7244c848..ee3bb8b8 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -75,7 +75,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 865c6c37..ae8e9324 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -73,7 +73,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 826854a8..cb0e0461 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -71,7 +71,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 9aa8ca2d..bd6c4b25 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -68,7 +68,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp TestMatrixC(n, n, c_buffer, c_offset, c_ld); // Calculates the ceiled versions of n and k - auto n_ceiled = Ceil(n, db_["NWG"]); + auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]); auto k_ceiled = Ceil(k, db_["KWG"]); // Decides which kernel to run: the upper-triangular or lower-triangular version |