4 files changed, 24 insertions, 21 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 32a05b00..089e3fd8 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,7 @@
 
+Development version (next release)
+- Fixed a bug in the direct version of the GEMM kernels when using non-zero matrix offsets with vectorized loads
+
 Version 0.10.0
 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
 - Changed the enums in the C API to avoid potential name clashes with external code
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
index d77cbf65..fc09307e 100644
--- a/src/kernels/level3/xgemm_direct_part2.opencl
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -42,7 +42,7 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re
       int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
 
       // Loads the data from global memory into the local memory
-      const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+      const realMD avec = agm[idk*(a_ld/VWMD) + idm + (a_offset/VWMD)];
       #if VWMD == 1
          alm[kg*(WGD + PADA) + mg] = avec;
       #elif VWMD == 2
@@ -113,7 +113,7 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re
       int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
 
       // Loads the data from global memory into the local memory
-      const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+      const realND bvec = bgm[idk*(b_ld/VWND) + idn + (b_offset/VWND)];
       #if VWND == 1
          blm[kg*(WGD + PADB) + ng] = bvec;
       #elif VWND == 2
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
index a9350e00..c04cdeb8 100644
--- a/src/kernels/level3/xgemm_direct_part3.opencl
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -53,13 +53,13 @@ inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
     for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
 
       // Loads data: off-chip --> local (matrix A and B)
-      if (a_ld % VWMD == 0) {
+      if (a_ld % VWMD == 0 && a_offset % VWMD == 0) {
         GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
       }
       else {
         GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
       }
-      if (b_ld % VWND == 0) {
+      if (b_ld % VWND == 0 && b_offset % VWND == 0) {
         GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
       }
       else {
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 4f70dc7a..0015b629 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -126,16 +126,16 @@ void Xgemm<T>::DoGemm(const Layout layout,
 // overhead of these extra kernels might not be ideal for certain devices/arguments.
 template <typename T>
 void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
-                                  const T alpha,
-                                  const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                                  const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                                  const T beta,
-                                  const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
-                                  const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
-                                  const bool a_conjugate, const bool b_conjugate,
-                                  const size_t a_one, const size_t a_two, const bool a_want_rotated,
-                                  const size_t b_one, const size_t b_two, const bool b_want_rotated,
-                                  const size_t c_one, const size_t c_two, const bool c_want_rotated) {
+                            const T alpha,
+                            const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                            const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                            const T beta,
+                            const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                            const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                            const bool a_conjugate, const bool b_conjugate,
+                            const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                            const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                            const size_t c_one, const size_t c_two, const bool c_want_rotated) {
   // Calculates the ceiled versions of m, n, and k
   const auto m_ceiled = Ceil(m, db_["MWG"]);
   const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -247,13 +247,13 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
 // The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
 template <typename T>
 void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
-                                const T alpha,
-                                const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
-                                const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
-                                const T beta,
-                                const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
-                                const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
-                                const bool a_conjugate, const bool b_conjugate) {
+                          const T alpha,
+                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                          const T beta,
+                          const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                          const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                          const bool a_conjugate, const bool b_conjugate) {
 
   // Loads the program from the database
   const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);