From 489c5d76cfe95a97542dfeaa6d8b19cd9100919a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 18 May 2016 21:32:56 +0200 Subject: Merged in latest changes from 0.7.1 release --- src/kernels/common.opencl | 26 +++++++ src/kernels/level3/xgemm_part1.opencl | 8 +- src/kernels/level3/xgemm_part2.opencl | 138 +++++++++++++++++----------------- src/routine.cc | 11 ++- src/routines/level3/xgemm.cc | 5 +- 5 files changed, 114 insertions(+), 74 deletions(-) (limited to 'src') diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 01605f6e..08c47d87 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -192,6 +192,32 @@ R"( // ================================================================================================= +// Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is +// enabled (see src/routine.cc). +#ifndef USE_STAGGERED_INDICES + #define USE_STAGGERED_INDICES 0 +#endif + +// Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from: +// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf +// More details: https://github.com/CNugteren/CLBlast/issues/53 +#if USE_STAGGERED_INDICES == 1 + inline size_t GetGroupIDFlat() { + return get_group_id(0) + get_num_groups(0) * get_group_id(1); + } + inline size_t GetGroupID1() { + return (GetGroupIDFlat()) % get_num_groups(1); + } + inline size_t GetGroupID0() { + return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0); + } +#else + inline size_t GetGroupID1() { return get_group_id(1); } + inline size_t GetGroupID0() { return get_group_id(0); } +#endif + +// ================================================================================================= + // End of the C++11 raw string literal )" diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 4cb0585b..a2a555de 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -199,7 +199,7 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al // Computes the indices for the global memory int kg = kia + la1*KWA; - int idm = mg + get_group_id(0)*(MWG/VWM); + int idm = mg + GetGroupID0() * (MWG/VWM); int idk = kg + kwg; // Loads the data from global memory (not transposed) into the local memory @@ -229,7 +229,7 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl // Computes the indices for the global memory int kg = kib + lb1*KWB; - int idn = ng + get_group_id(1)*(NWG/VWN); + int idn = ng + GetGroupID1() * (NWG/VWN); int idk = kg + kwg; // Loads the data from global memory (transposed) into the local memory @@ -257,7 +257,7 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V #endif // Computes the indices for the global memory - int idm = mg + get_group_id(0)*(MWG/VWM); + int idm = mg + GetGroupID0() * (MWG/VWM); // Loads the data from global memory (not transposed) and stores into registers apm[mi] = agm[idk*(kSizeM/VWM) + idm]; @@ -280,7 +280,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V #endif // Computes the indices for the global memory - int idn = ng + get_group_id(1)*(NWG/VWN); + int idn = ng + GetGroupID1() * (NWG/VWN); // Loads the data from global memory (transposed) and stores into registers bpm[ni] = bgm[idk*(kSizeN/VWN) + idn]; diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index a8c8ebf5..56ccdb96 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -69,42 +69,43 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real for (int ni=0; ni get_group_id(0)*MWG) { + if (GetGroupID1()*NWG > GetGroupID0()*MWG) { return; } diff --git a/src/routine.cc b/src/routine.cc index 5f9b1c89..11c4281e 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -88,12 +88,21 @@ StatusCode Routine::SetUp() { // Adds the name of the routine as a define defines += "#define ROUTINE_"+routine_name_+"\n"; + // Determines whether this is a specific device + const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc."; + const auto isGPU = device_.Type() == "GPU"; + // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve // performance, but might result in a reduced accuracy. - if (device_.Vendor() == "AMD") { + if (isAMD && isGPU) { defines += "#define USE_CL_MAD 1\n"; } + // For specific devices, use staggered/shuffled workgroup indices. + if (isAMD && isGPU) { + defines += "#define USE_STAGGERED_INDICES 1\n"; + } + // Combines everything together into a single source string auto source_string = defines + common_header + source_string_; diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 5395667a..ab36076c 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -191,12 +191,13 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Launches the kernel auto eventKernel = Event(); - status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList); + auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_; + status = RunKernel(kernel, global, local, eventPointer, eventWaitList); if (ErrorIn(status)) { return status; } - eventWaitList.push_back(eventKernel); // Runs the post-processing kernel if needed if (!c_no_temp) { + eventWaitList.push_back(eventKernel); status = PadCopyTransposeMatrix(event_, eventWaitList, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, -- cgit v1.2.3