diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2018-07-28 15:29:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-07-28 15:29:16 +0200 |
commit | dda1e567f872d3d89f2f7cd890fb5b29ff98537c (patch) | |
tree | 944ee44ac071bd8ca6da9f4a16b9b8c9bba1889f | |
parent | f8fb707fa440d1ce8b319bec8efe3c20d21dcd37 (diff) | |
parent | 0f0baa561b6c215a1052b5c70d72215e2ab38745 (diff) |
Merge pull request #304 from CNugteren/CLBlast-300-fix-staggered-indices-AMD-GEMMK1
Fix staggered indices on AMD GPUs for GEMMK == 1 kernel
-rw-r--r-- | CHANGELOG | 5 | ||||
-rw-r--r-- | src/kernels/common.opencl | 2 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part3.opencl | 4 | ||||
-rw-r--r-- | src/tuning/tuning.cpp | 11 |
4 files changed, 18 insertions, 4 deletions
@@ -1,4 +1,9 @@ +Development (next version) +- Added support for shuffle instructions for NVIDIA GPUs (thanks to 'tyler-utah') +- Fixed an issue with AMD GPUs and the new GEMMK == 1 kernel +- Various minor fixes and enhancements + Version 1.4.1 - Fixed an access violation under Windows upon releasing the OpenCL program when the driver is already unloaded - Fixed an issue with double cl_program release in the CLBlast caching system diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 4a476a8b..0ad38919 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -260,7 +260,7 @@ R"( // Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from: // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf // More details: https://github.com/CNugteren/CLBlast/issues/53 -#if USE_STAGGERED_INDICES == 1 +#if USE_STAGGERED_INDICES == 1 && GEMMK == 0 INLINE_FUNC int GetGroupIDFlat() { return get_group_id(0) + get_num_groups(0) * get_group_id(1); } diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index 35ec735c..90de0b3b 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -91,8 +91,8 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #if GEMMK == 1 const __global real* restrict a_ptr = (const __global real* restrict) &agm[0]; const __global real* restrict b_ptr = (const __global real* restrict) &bgm[0]; - const int tid_x = get_global_id(0); - const int tid_y = get_global_id(1); + const int tid_x = get_local_id(0) + MDIMC * GetGroupID0(); + const int tid_y = get_local_id(1) + NDIMC * GetGroupID1(); #endif // Combined thread identifier (volatile to disable caching) diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp index 216f4b31..822f8851 100644 --- a/src/tuning/tuning.cpp +++ b/src/tuning/tuning.cpp @@ -342,8 +342,17 @@ void Tuner(int argc, char* argv[], const int V, const auto best_time_ms = best_configuration->score; if (best_time_ms == 0.0) { return; } - // Also prints the performance of the best-case in terms of GB/s or GFLOPS + // Computes and prints some other statistics + auto average_ms = 0.0; + for (const auto result : results) { average_ms += result.score; } + average_ms /= results.size(); printf("\n"); + printf("* Got average result of %.2lf ms", average_ms); + printf(": %.1lf %s\n", settings.metric_amount / (average_ms * 1.0e6), + settings.performance_unit.c_str()); + + + // Also prints the performance of the best-case in terms of GB/s or GFLOPS printf("* Found best result %.2lf ms", best_time_ms); printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6), settings.performance_unit.c_str()); |