diff options
author | CNugteren <web@cedricnugteren.nl> | 2015-06-23 08:09:07 +0200 |
---|---|---|
committer | CNugteren <web@cedricnugteren.nl> | 2015-06-23 08:09:07 +0200 |
commit | 20eb3506d63e21725974e16ae392cf0dd4bf4df5 (patch) | |
tree | 8742994285f53f94f727b1853404325103104393 /src | |
parent | 4c2a166bc5406b194108d3b31238e55ac6b99e3c (diff) |
Added a condition to update only lower/upper triangular parts in the un-pad kernels
Diffstat (limited to 'src')
-rw-r--r-- | src/kernels/pad.opencl | 14 | ||||
-rw-r--r-- | src/kernels/padtranspose.opencl | 18 | ||||
-rw-r--r-- | src/routine.cc | 10 | ||||
-rw-r--r-- | src/routines/xgemm.cc | 8 |
4 files changed, 36 insertions, 14 deletions
diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl
index 45eaef91..cce0c746 100644
--- a/src/kernels/pad.opencl
+++ b/src/kernels/pad.opencl
@@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
                           __global const real* restrict src,
                           const int dest_one, const int dest_two,
                           const int dest_ld, const int dest_offset,
-                          __global real* dest) {
+                          __global real* dest,
+                          const int upper, const int lower) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
     #pragma unroll
     for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
       const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
+
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_two >= id_one); }
+      else if (lower == 1) { condition = (id_two <= id_one); }
+      if (condition) {
 
         // Copies the value into the destination matrix. This is always within bounds of the source
         // matrix, as we know that the destination matrix is smaller than the source.
-        dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        if (id_two < dest_two && id_one < dest_one) {
+          dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+        }
       }
     }
   }
diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl
index 2f2aabd6..7e923392 100644
--- a/src/kernels/padtranspose.opencl
+++ b/src/kernels/padtranspose.opencl
@@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
                                    __global const real* restrict src,
                                    const int dest_one, const int dest_two,
                                    const int dest_ld, const int dest_offset,
-                                   __global real* dest) {
+                                   __global real* dest,
+                                   const int upper, const int lower) {
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
       const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
       const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
 
-      // Stores the transposed value in the destination matrix
-      if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
-        real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+      // Masking in case of triangular matrices: updates only the upper or lower part
+      bool condition = true;
+      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      if (condition) {
+
+        // Stores the transposed value in the destination matrix
+        if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+          real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        }
       }
     }
   }
diff --git a/src/routine.cc b/src/routine.cc
index a4e0bb37..4b7ece41 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
                                            const size_t dest_ld, const size_t dest_offset,
                                            const Buffer &dest,
                                            const bool do_transpose, const bool do_conjugate,
-                                           const bool pad, const Program &program) {
+                                           const bool pad, const bool upper, const bool lower,
+                                           const Program &program) {
 
   // Determines whether or not the fast-version could potentially be used
   auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false);
 
   // Determines the right kernel
   auto kernel_name = std::string{};
@@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
       if (pad) {
         kernel.SetArgument(10, static_cast<int>(do_conjugate));
       }
+      else {
+        kernel.SetArgument(10, static_cast<int>(upper));
+        kernel.SetArgument(11, static_cast<int>(lower));
+      }
     }
 
     // Launches the kernel and returns the error code. Uses global and local thread sizes based on
diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc
index 20cd2675..651ebb55 100644
--- a/src/routines/xgemm.cc
+++ b/src/routines/xgemm.cc
@@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // them up until they reach a certain multiple of size (kernel parameter dependent).
     status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
                                     m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
-                                    a_do_transpose, a_conjugate, true, program);
+                                    a_do_transpose, a_conjugate, true, false, false, program);
     if (ErrorIn(status)) { return status; }
     status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
                                     n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
-                                    b_do_transpose, b_conjugate, true, program);
+                                    b_do_transpose, b_conjugate, true, false, false, program);
     if (ErrorIn(status)) { return status; }
 
     // Only necessary for matrix C if it used both as input and output
     if (beta != static_cast<T>(0)) {
       status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
                                       m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
-                                      c_do_transpose, false, true, program);
+                                      c_do_transpose, false, true, false, false, program);
       if (ErrorIn(status)) { return status; }
     }
 
@@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // Runs the post-processing kernel
     status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
                                     c_one, c_two, c_ld, c_offset, c_buffer,
-                                    c_do_transpose, false, false, program);
+                                    c_do_transpose, false, false, false, false, program);
     if (ErrorIn(status)) { return status; }
 
     // Successfully finished the computation