summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2018-07-28 15:29:16 +0200
committerGitHub <noreply@github.com>2018-07-28 15:29:16 +0200
commitdda1e567f872d3d89f2f7cd890fb5b29ff98537c (patch)
tree944ee44ac071bd8ca6da9f4a16b9b8c9bba1889f
parentf8fb707fa440d1ce8b319bec8efe3c20d21dcd37 (diff)
parent0f0baa561b6c215a1052b5c70d72215e2ab38745 (diff)
Merge pull request #304 from CNugteren/CLBlast-300-fix-staggered-indices-AMD-GEMMK1
Fix staggered indices on AMD GPUs for GEMMK == 1 kernel
-rw-r--r--CHANGELOG5
-rw-r--r--src/kernels/common.opencl2
-rw-r--r--src/kernels/level3/xgemm_part3.opencl4
-rw-r--r--src/tuning/tuning.cpp11
4 files changed, 18 insertions, 4 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 4ad70a95..c1c639e1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,9 @@
+Development (next version)
+- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah')
+- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel
+- Various minor fixes and enhancements
+
Version 1.4.1
- Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded
- Fixed an issue with double cl_program release in the CLBlast caching system
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 4a476a8b..0ad38919 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -260,7 +260,7 @@ R"(
// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from:
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
// More details: https://github.com/CNugteren/CLBlast/issues/53
-#if USE_STAGGERED_INDICES == 1
+#if USE_STAGGERED_INDICES == 1 && GEMMK == 0
INLINE_FUNC int GetGroupIDFlat() {
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
}
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 35ec735c..90de0b3b 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -91,8 +91,8 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#if GEMMK == 1
const __global real* restrict a_ptr = (const __global real* restrict) &agm[0];
const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0];
- const int tid_x = get_global_id(0);
- const int tid_y = get_global_id(1);
+ const int tid_x = get_local_id(0) + MDIMC * GetGroupID0();
+ const int tid_y = get_local_id(1) + NDIMC * GetGroupID1();
#endif
// Combined thread identifier (volatile to disable caching)
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index 216f4b31..822f8851 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -342,8 +342,17 @@ void Tuner(int argc, char* argv[], const int V,
const auto best_time_ms = best_configuration->score;
if (best_time_ms == 0.0) { return; }
- // Also prints the performance of the best-case in terms of GB/s or GFLOPS
+ // Computes and prints some other statistics
+ auto average_ms = 0.0;
+ for (const auto result : results) { average_ms += result.score; }
+ average_ms /= results.size();
printf("\n");
+ printf("* Got average result of %.2lf ms", average_ms);
+ printf(": %.1lf %s\n", settings.metric_amount / (average_ms * 1.0e6),
+ settings.performance_unit.c_str());
+
+
+ // Also prints the performance of the best-case in terms of GB/s or GFLOPS
printf("* Found best result %.2lf ms", best_time_ms);
printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6),
settings.performance_unit.c_str());