From 2fd04dae83acb01933856e768a938db9ac808ce0 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 19 Mar 2017 15:57:44 +0100 Subject: Added batched versions of the pad/copy/transpose kernels --- src/kernels/level3/copy_pad.opencl | 110 ++++++++++++++++++++----- src/kernels/level3/transpose_pad.opencl | 138 +++++++++++++++++++++++++------- src/routines/common.hpp | 64 +++++++++++++++ 3 files changed, 260 insertions(+), 52 deletions(-) (limited to 'src') diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 29480b25..93b89187 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -24,16 +24,14 @@ R"( // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -void CopyPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { - const real alpha = GetRealArg(arg_alpha); +inline void _CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loops over the work per thread in both dimensions #pragma unroll @@ -60,22 +58,36 @@ void CopyPadMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { + const real alpha = GetRealArg(arg_alpha); + _CopyPadMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + // ================================================================================================= // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. -__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -void CopyMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { - const real alpha = GetRealArg(arg_alpha); +inline void _CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loops over the work per thread in both dimensions #pragma unroll @@ -105,6 +117,62 @@ void CopyMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { + const real alpha = GetRealArg(arg_alpha); + _CopyMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, upper, lower, diagonal_imag_zero); +} + +// ================================================================================================= +#if defined(ROUTINE_GEMMBATCHED) + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyPadMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest, + const int do_conjugate) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + _CopyPadMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + _CopyMatrix(src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, 0, 0, 0); +} + +#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index ba0b7062..fb60ce75 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,19 +24,15 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -void TransposePadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { - const real alpha = GetRealArg(arg_alpha); - - // Local memory to store a tile of the matrix (for coalescing) - __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; +inline void _TransposePadMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int do_conjugate) { // Loop over the work per thread #pragma unroll @@ -56,7 +52,9 @@ void TransposePadMatrix(const int src_one, const int src_two, if (id_src_two < src_two && id_src_one < src_one) { value = src[id_src_two*src_ld + id_src_one + src_offset]; } - tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value; + const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two; + tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value; } } @@ -75,7 +73,9 @@ void TransposePadMatrix(const int src_one, const int src_two, // Stores the transposed value in the destination matrix if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two; + real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0]; if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } @@ -83,25 +83,38 @@ void TransposePadMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposePadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { + const real alpha = GetRealArg(arg_alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + // ================================================================================================= // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. -__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -void TransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { - const real alpha = GetRealArg(arg_alpha); - - // Local memory to store a tile of the matrix (for coalescing) - __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; +inline void _TransposeMatrix(__local real* tile, + const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { // Loop over the work per thread #pragma unroll @@ -117,7 +130,9 @@ void TransposeMatrix(const int src_one, const int src_two, // Loads data into the local memory if the thread IDs are within bounds of the source matrix. if ((id_src_one < src_one) && (id_src_two < src_two)) { real value = src[id_src_two*src_ld + id_src_one + src_offset]; - tile[get_local_id(1)*PADTRA_WPT + w_two][get_local_id(0)*PADTRA_WPT + w_one] = value; + const int tile_id0 = get_local_id(0)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(1)*PADTRA_WPT + w_two; + tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0] = value; } } } @@ -145,7 +160,9 @@ void TransposeMatrix(const int src_one, const int src_two, // Stores the transposed value in the destination matrix if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + const int tile_id0 = get_local_id(1)*PADTRA_WPT + w_one; + const int tile_id1 = get_local_id(0)*PADTRA_WPT + w_two; + real value = tile[tile_id1 * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD) + tile_id0]; if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value); } @@ -154,6 +171,65 @@ void TransposeMatrix(const int src_one, const int src_two, } } +// Interface to the above function +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposeMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { + const real alpha = GetRealArg(arg_alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, upper, lower, diagonal_imag_zero); +} + +// ================================================================================================= +#if defined(ROUTINE_GEMMBATCHED) + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposePadMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest, + const int do_conjugate) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposePadMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, do_conjugate); +} + +// Batched version of the above +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposeMatrixBatched(const int src_one, const int src_two, + const int src_ld, const __constant int* src_offsets, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const __constant int* dest_offsets, + __global real* dest) { + const int batch = get_group_id(2); + const int src_offset = src_offsets[batch]; + const int dest_offset = dest_offsets[batch]; + real alpha; SetToOne(alpha); + __local real tile[(PADTRA_WPT*PADTRA_TILE) * (PADTRA_WPT*PADTRA_TILE + PADTRA_PAD)]; + _TransposeMatrix(tile, src_one, src_two, src_ld, src_offset, src, + dest_one, dest_two, dest_ld, dest_offset, dest, + alpha, 0, 0, 0); +} + +#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/routines/common.hpp b/src/routines/common.hpp index be6ac4ec..28a43da5 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -196,6 +196,70 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device, } } +// Batched version of the above +template +void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device, + const Databases &db, + EventPointer event, const std::vector &waitForEvents, + const size_t src_one, const size_t src_two, + const size_t src_ld, const Buffer &src_offsets, + const Buffer &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const Buffer &dest_offsets, + const Buffer &dest, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const size_t batch_count) { + + // Determines the right kernel + auto kernel_name = std::string{}; + if (do_transpose) { + kernel_name = (do_pad) ? "TransposePadMatrixBatched" : "TransposeMatrixBatched"; + } + else { + kernel_name = (do_pad) ? "CopyPadMatrixBatched" : "CopyMatrixBatched"; + } + + // Retrieves the kernel from the compiled binary + auto kernel = Kernel(program, kernel_name); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(src_one)); + kernel.SetArgument(1, static_cast(src_two)); + kernel.SetArgument(2, static_cast(src_ld)); + kernel.SetArgument(3, src_offsets()); + kernel.SetArgument(4, src()); + kernel.SetArgument(5, static_cast(dest_one)); + kernel.SetArgument(6, static_cast(dest_two)); + kernel.SetArgument(7, static_cast(dest_ld)); + kernel.SetArgument(8, dest_offsets()); + kernel.SetArgument(9, dest()); + if (do_pad) { + kernel.SetArgument(10, static_cast(do_conjugate)); + } + + // Launches the kernel and returns the error code. Uses global and local thread sizes based on + // parameters in the database. + if (do_transpose) { + const auto global = std::vector{ + Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"]), + batch_count + }; + const auto local = std::vector{db["PADTRA_TILE"], db["PADTRA_TILE"], 1}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } + else { + const auto global = std::vector{ + Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]), + Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"]), + batch_count + }; + const auto local = std::vector{db["PAD_DIMX"], db["PAD_DIMY"], 1}; + RunKernel(kernel, queue, device, global, local, event, waitForEvents); + } +} + // ================================================================================================= } // namespace clblast -- cgit v1.2.3