Added workgroup shuffle option to transpose kernel for AMD GPUs

author: CNugteren <web@cedricnugteren.nl> 2015-07-22 07:31:16 +0200
committer: CNugteren <web@cedricnugteren.nl> 2015-07-22 07:31:16 +0200
commit: 4dcecfe93476c2cbd1148bdbb3d327d9ad444af2 (patch)
tree: 2b0467ba54bcd00fcf8a3dd1ce0bd80e2d2d70c7 /src/kernels
parent: d93efa31691e1a34e16865b7b96624fca670bab2 (diff)
1 files changed, 20 insertions, 8 deletions
diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl
index 418e0ff9..2aa53bb8 100644
--- a/src/kernels/transpose.opencl
+++ b/src/kernels/transpose.opencl
@@ -20,13 +20,16 @@ R"(
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library.
 #ifndef TRA_DIM
-  #define TRA_DIM 8    // Number of local threads in the two dimensions (x,y)
+  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
 #endif
 #ifndef TRA_WPT
-  #define TRA_WPT 1    // Work per thread in one dimension and vector-width in the other
+  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
 #endif
 #ifndef TRA_PAD
-  #define TRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
+  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
+#endif
+#ifndef TRA_SHUFFLE
+  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
 #endif
 
 // =================================================================================================
@@ -53,17 +56,26 @@ __kernel void TransposeMatrix(const int ld,
                               __global const realT* restrict src,
                               __global realT* dest) {
 
+  // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
+  // way over workgroups, breaking memory-bank dependencies.
+  const int gid0 = get_group_id(0);
+  #if TRA_SHUFFLE == 1
+    const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0);
+  #else
+    const int gid1 = get_group_id(1);
+  #endif
+
   // Local memory to store a tile of the matrix (for coalescing)
   __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD];
 
-  // Loop over the work per thread
+  // Loops over the work per thread
   #pragma unroll
   for (int w_one=0; w_one<TRA_WPT; ++w_one) {
 
     // Computes the identifiers for the source matrix. Note that the local and global dimensions
     // do not correspond to each other!
-    const int id_one = get_group_id(1) * TRA_DIM + get_local_id(0);
-    const int id_two = (get_group_id(0) * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
+    const int id_one = gid1 * TRA_DIM + get_local_id(0);
+    const int id_two = (gid0 * TRA_DIM + get_local_id(1))*TRA_WPT + w_one;
 
     // Loads data into the local memory
     realT value = src[id_two*(ld/TRA_WPT) + id_one];
@@ -123,8 +135,8 @@ __kernel void TransposeMatrix(const int ld,
   // Stores the results into the destination matrix
   #pragma unroll
   for (int w_two=0; w_two<TRA_WPT; ++w_two) {
-    const int id_one = get_global_id(0);
-    const int id_two = get_global_id(1)*TRA_WPT + w_two;
+    const int id_one = gid0*TRA_DIM + get_local_id(0);
+    const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
     dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
   }
 }
author	CNugteren <web@cedricnugteren.nl>	2015-07-22 07:31:16 +0200
committer	CNugteren <web@cedricnugteren.nl>	2015-07-22 07:31:16 +0200
commit	4dcecfe93476c2cbd1148bdbb3d327d9ad444af2 (patch)
tree	2b0467ba54bcd00fcf8a3dd1ce0bd80e2d2d70c7 /src/kernels
parent	d93efa31691e1a34e16865b7b96624fca670bab2 (diff)