summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG3
-rw-r--r--src/kernels/level3/xgemm_direct_part2.opencl4
-rw-r--r--src/kernels/level3/xgemm_direct_part3.opencl4
-rw-r--r--src/routines/level3/xgemm.cpp34
4 files changed, 24 insertions, 21 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 32a05b00..089e3fd8 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,7 @@
+Development version (next release)
+- Fixed a bug when using offsets in the direct version of the GEMM kernels
+
Version 0.10.0
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
index d77cbf65..fc09307e 100644
--- a/src/kernels/level3/xgemm_direct_part2.opencl
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -42,7 +42,7 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re
int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
- const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+ const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)];
#if VWMD == 1
alm[kg*(WGD + PADA) + mg] = avec;
#elif VWMD == 2
@@ -113,7 +113,7 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re
int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
// Loads the data from global memory into the local memory
- const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+ const realND bvec = bgm[idk*(b_ld/VWND) + idn + (b_offset/VWND)];
#if VWND == 1
blm[kg*(WGD + PADB) + ng] = bvec;
#elif VWND == 2
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
index a9350e00..c04cdeb8 100644
--- a/src/kernels/level3/xgemm_direct_part3.opencl
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -53,13 +53,13 @@ inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
// Loads data: off-chip --> local (matrix A and B)
- if (a_ld % VWMD == 0) {
+ if (a_ld % VWMD == 0 && a_offset % VWMD == 0) {
GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
else {
GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
}
- if (b_ld % VWND == 0) {
+ if (b_ld % VWND == 0 && b_offset % VWND == 0) {
GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
}
else {
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 4f70dc7a..0015b629 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -126,16 +126,16 @@ void Xgemm<T>::DoGemm(const Layout layout,
// overhead of these extra kernels might not be ideal for certain devices/arguments.
template <typename T>
void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
- const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
- const bool a_conjugate, const bool b_conjugate,
- const size_t a_one, const size_t a_two, const bool a_want_rotated,
- const size_t b_one, const size_t b_two, const bool b_want_rotated,
- const size_t c_one, const size_t c_two, const bool c_want_rotated) {
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate,
+ const size_t a_one, const size_t a_two, const bool a_want_rotated,
+ const size_t b_one, const size_t b_two, const bool b_want_rotated,
+ const size_t c_one, const size_t c_two, const bool c_want_rotated) {
// Calculates the ceiled versions of m, n, and k
const auto m_ceiled = Ceil(m, db_["MWG"]);
const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -247,13 +247,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
template <typename T>
void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
- const T beta,
- const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
- const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
- const bool a_conjugate, const bool b_conjugate) {
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+ const T beta,
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+ const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+ const bool a_conjugate, const bool b_conjugate) {
// Loads the program from the database
const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);