summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--  src/clblast.pc.in                      10
-rw-r--r--  src/kernels/level3/xgemm_part3.opencl   4
-rw-r--r--  src/routines/level3/xher2k.cpp          2
-rw-r--r--  src/routines/level3/xherk.cpp           2
-rw-r--r--  src/routines/level3/xsyr2k.cpp          2
-rw-r--r--  src/routines/level3/xsyrk.cpp           2
6 files changed, 16 insertions, 6 deletions
diff --git a/src/clblast.pc.in b/src/clblast.pc.in
new file mode 100644
index 00000000..2538add8
--- /dev/null
+++ b/src/clblast.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+includedir=${prefix}/include
+libdir=${exec_prefix}/lib
+
+Name: CLBlast
+Description: CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11
+Version: @clblast_VERSION_MAJOR@.@clblast_VERSION_MINOR@.@clblast_VERSION_PATCH@
+Libs: -L${libdir} -lclblast
+Cflags: -I${includedir}
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index a5faef5a..8ac3a3a8 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -113,7 +113,7 @@ void XgemmUpper(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the upper-triangle
- if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
+ if ((GetGroupID1() + 1)*NWG < GetGroupID0()*MWG) {
return;
}
@@ -153,7 +153,7 @@ void XgemmLower(const int kSizeN, const int kSizeK,
const real beta = GetRealArg(arg_beta);
// Skip these threads if they do not contain threads contributing to the lower-triangle
- if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
+ if (GetGroupID1()*NWG > (GetGroupID0() + 1)*MWG) {
return;
}
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index 7244c848..ee3bb8b8 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -75,7 +75,7 @@ void Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Tr
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 865c6c37..ae8e9324 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -73,7 +73,7 @@ void Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Tran
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index 826854a8..cb0e0461 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -71,7 +71,7 @@ void Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Tran
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index 9aa8ca2d..bd6c4b25 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -68,7 +68,7 @@ void Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transp
TestMatrixC(n, n, c_buffer, c_offset, c_ld);
// Calculates the ceiled versions of n and k
- auto n_ceiled = Ceil(n, db_["NWG"]);
+ auto n_ceiled = Ceil(Ceil(n, db_["MWG"]), db_["NWG"]);
auto k_ceiled = Ceil(k, db_["KWG"]);
// Decides which kernel to run: the upper-triangular or lower-triangular version