Merge pull request #304 from CNugteren/CLBlast-300-fix-staggered-indices-AMD-GEMMK1

Fix staggered indices on AMD GPUs for GEMMK == 1 kernel
author: Cedric Nugteren <web@cedricnugteren.nl> 2018-07-28 15:29:16 +0200
committer: GitHub <noreply@github.com> 2018-07-28 15:29:16 +0200
commit: dda1e567f872d3d89f2f7cd890fb5b29ff98537c (patch)
tree: 944ee44ac071bd8ca6da9f4a16b9b8c9bba1889f
parent: f8fb707fa440d1ce8b319bec8efe3c20d21dcd37 (diff)
parent: 0f0baa561b6c215a1052b5c70d72215e2ab38745 (diff)
4 files changed, 18 insertions, 4 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 4ad70a95..c1c639e1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,9 @@
 
+Development (next version)
+- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
+- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
+- Various minor fixes and enhancements
+
 Version 1.4.1
 - Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded
 - Fixed an issue with double cl_program release in the CLBlast caching system
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 4a476a8b..0ad38919 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -260,7 +260,7 @@ R"(
 // Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from:
 // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
 // More details: https://github.com/CNugteren/CLBlast/issues/53
-#if USE_STAGGERED_INDICES == 1
+#if USE_STAGGERED_INDICES == 1 && GEMMK == 0
   INLINE_FUNC int GetGroupIDFlat() {
     return get_group_id(0) + get_num_groups(0) * get_group_id(1);
   }
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 35ec735c..90de0b3b 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -91,8 +91,8 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
   #if GEMMK == 1
     const __global real* restrict a_ptr = (const __global real* restrict) &agm[0];
     const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0];
-    const int tid_x = get_global_id(0);
-    const int tid_y = get_global_id(1);
+    const int tid_x = get_local_id(0) + MDIMC * GetGroupID0();
+    const int tid_y = get_local_id(1) + NDIMC * GetGroupID1();
   #endif
 
   // Combined thread identifier (volatile to disable caching)
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index 216f4b31..822f8851 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -342,8 +342,17 @@ void Tuner(int argc, char* argv[], const int V,
   const auto best_time_ms = best_configuration->score;
   if (best_time_ms == 0.0) { return; }
 
-  // Also prints the performance of the best-case in terms of GB/s or GFLOPS
+  // Computes and prints some other statistics
+  auto average_ms = 0.0;
+  for (const auto result : results) { average_ms += result.score; }
+  average_ms /= results.size();
   printf("\n");
+  printf("* Got average result of %.2lf ms", average_ms);
+  printf(": %.1lf %s\n", settings.metric_amount / (average_ms * 1.0e6),
+         settings.performance_unit.c_str());
+
+
+  // Also prints the performance of the best-case in terms of GB/s or GFLOPS
   printf("* Found best result %.2lf ms", best_time_ms);
   printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
          settings.performance_unit.c_str());
author	Cedric Nugteren <web@cedricnugteren.nl>	2018-07-28 15:29:16 +0200
committer	GitHub <noreply@github.com>	2018-07-28 15:29:16 +0200
commit	dda1e567f872d3d89f2f7cd890fb5b29ff98537c (patch)
tree	944ee44ac071bd8ca6da9f4a16b9b8c9bba1889f
parent	f8fb707fa440d1ce8b319bec8efe3c20d21dcd37 (diff)
parent	0f0baa561b6c215a1052b5c70d72215e2ab38745 (diff)