From b894611ad196fc9cac40bf5861a23b35c52c52b5 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 14 Jun 2016 18:17:58 +0200 Subject: Re-organised the level-3 supporting kernels (copy, pad, transpose, convert) and renamed files and functions appropriately --- src/kernels/level3/convert_hermitian.opencl | 106 ++++++++ src/kernels/level3/convert_symmetric.opencl | 94 +++++++ src/kernels/level3/convert_triangular.opencl | 98 ++++++++ src/kernels/level3/copy.opencl | 73 ------ src/kernels/level3/copy_fast.opencl | 56 +++++ src/kernels/level3/copy_pad.opencl | 109 +++++++++ src/kernels/level3/level3.opencl | 81 ++++++ src/kernels/level3/pad.opencl | 353 --------------------------- src/kernels/level3/padtranspose.opencl | 164 ------------- src/kernels/level3/transpose.opencl | 149 ----------- src/kernels/level3/transpose_fast.opencl | 134 ++++++++++ src/kernels/level3/transpose_pad.opencl | 158 ++++++++++++ 12 files changed, 836 insertions(+), 739 deletions(-) create mode 100644 src/kernels/level3/convert_hermitian.opencl create mode 100644 src/kernels/level3/convert_symmetric.opencl create mode 100644 src/kernels/level3/convert_triangular.opencl delete mode 100644 src/kernels/level3/copy.opencl create mode 100644 src/kernels/level3/copy_fast.opencl create mode 100644 src/kernels/level3/copy_pad.opencl create mode 100644 src/kernels/level3/level3.opencl delete mode 100644 src/kernels/level3/pad.opencl delete mode 100644 src/kernels/level3/padtranspose.opencl delete mode 100644 src/kernels/level3/transpose.opencl create mode 100644 src/kernels/level3/transpose_fast.opencl create mode 100644 src/kernels/level3/transpose_pad.opencl (limited to 'src/kernels') diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl new file mode 100644 index 00000000..53cc161a --- /dev/null +++ b/src/kernels/level3/convert_hermitian.opencl @@ -0,0 +1,106 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains kernels to convert hermitian matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464) + +// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void HermLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one +// +// This file contains kernels to convert symmetric matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_SYMM) + +// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void SymmLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one +// +// This file contains kernels to convert triangular matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_TRMM) + +// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void TriaLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one -// -// This file contains the common kernels shared among different BLAS routines. This file contains -// kernels to copy matrices. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef COPY_DIMX - #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef COPY_DIMY - #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef COPY_WPT - #define COPY_WPT 1 // Work per thread in the first dimension (x) -#endif -#ifndef COPY_VW - #define COPY_VW 1 // Vector width in the second dimension (y) -#endif - -// ================================================================================================= - -// Data-widths -#if COPY_VW == 1 - typedef real realC; -#elif COPY_VW == 2 - typedef real2 realC; -#elif COPY_VW == 4 - typedef real4 realC; -#elif COPY_VW == 8 - typedef real8 realC; -#elif COPY_VW == 16 - typedef real16 realC; -#endif - -// ================================================================================================= - -// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of -// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. -__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) -__kernel void CopyMatrix(const int ld, - __global const realC* restrict src, - __global realC* dest) { - #pragma unroll - for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Data-widths +#if COPY_VW == 1 + typedef real realC; +#elif COPY_VW == 2 + typedef real2 realC; +#elif COPY_VW == 4 + typedef real4 realC; +#elif COPY_VW == 8 + typedef real8 realC; +#elif COPY_VW == 16 + typedef real16 realC; +#endif + +// ================================================================================================= + +// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of +// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. +__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +__kernel void CopyMatrixFast(const int ld, + __global const realC* restrict src, + __global realC* dest) { + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to copy and pad matrices in various ways, including: +// 1) copying into a larger matrix by adding padding +// 2) copying into a smaller matrix by optionally removing padding. This is the general version +// without restrictions, see the 'copy.opencl' file for a faster but more restricted copy kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Copies a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld +// value and offset can be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int do_conjugate) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one= id_one); } + else if (lower == 1) { condition = (id_two <= id_one); } + #endif + if (condition) { + + // Copies the value into the destination matrix. This is always within bounds of the source + // matrix, as we know that the destination matrix is smaller or equal to the source. + if (id_two < dest_two && id_one < dest_one) { + real value = src[id_two*src_ld + id_one + src_offset]; + if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl new file mode 100644 index 00000000..bf14ab12 --- /dev/null +++ b/src/kernels/level3/level3.opencl @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the common functions and parameters specific for level 3 BLAS kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. + +// For the 'fast' copy kernel +#ifndef COPY_DIMX + #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef COPY_DIMY + #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef COPY_WPT + #define COPY_WPT 1 // Work per thread in the first dimension (x) +#endif +#ifndef COPY_VW + #define COPY_VW 1 // Vector width in the second dimension (y) +#endif + +// For the padding/copy kernels and the conversion kernels +#ifndef PAD_DIMX + #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef PAD_DIMY + #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef PAD_WPTX + #define PAD_WPTX 1 // Work per thread in the first dimension (x) +#endif +#ifndef PAD_WPTY + #define PAD_WPTY 1 // Work per thread in the second dimension (y) +#endif + +// For the 'fast' transpose kernel +#ifndef TRA_DIM + #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef TRA_WPT + #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other +#endif +#ifndef TRA_PAD + #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif +#ifndef TRA_SHUFFLE + #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts +#endif + +// For the padding/transpose kernels +#ifndef PADTRA_TILE + #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef PADTRA_WPT + #define PADTRA_WPT 1 // Amount of work per thread +#endif +#ifndef PADTRA_PAD + #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/pad.opencl b/src/kernels/level3/pad.opencl deleted file mode 100644 index eefddce4..00000000 --- a/src/kernels/level3/pad.opencl +++ /dev/null @@ -1,353 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common kernels shared among different BLAS routines. This file contains -// kernels to copy and pad matrices in various ways, including: -// 1) copying into a larger matrix by adding padding -// 2) copying into a smaller matrix by removing padding -// 3) from upper/lower triangle into a full matrix -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PAD_DIMX - #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef PAD_DIMY - #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef PAD_WPTX - #define PAD_WPTX 1 // Work per thread in the first dimension (x) -#endif -#ifndef PAD_WPTY - #define PAD_WPTY 1 // Work per thread in the second dimension (y) -#endif - -// ================================================================================================= - -// Copies a matrix from source to destination. The output is padded with zero values in case the -// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld -// value and offset can be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void PadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int do_conjugate) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one= id_one); } - else if (lower == 1) { condition = (id_two <= id_one); } - if (condition) { - - // Copies the value into the destination matrix. This is always within bounds of the source - // matrix, as we know that the destination matrix is smaller than the source. - if (id_two < dest_two && id_one < dest_one) { - real value = src[id_two*src_ld + id_one + src_offset]; - if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } - dest[id_two*dest_ld + id_one + dest_offset] = value; - } - } - } - } -} - -// ================================================================================================= -#if defined(ROUTINE_SYMM) - -// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is -// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one -// -// This file contains the common kernels shared among different BLAS functions. This file contains -// kernels to transpose matrices in various ways, including: -// 1) transposing into a larger matrix by adding padding -// 2) transposing into a smaller matrix by removing padding -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PADTRA_TILE - #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef PADTRA_WPT - #define PADTRA_WPT 1 // Amount of work per thread -#endif -#ifndef PADTRA_PAD - #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif - -// ================================================================================================= - -// Same as PadCopyMatrix, but now also does the transpose -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void PadTransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int do_conjugate) { - - // Local memory to store a tile of the matrix (for coalescing) - __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; - - // Loop over the work per thread - #pragma unroll - for (int w_one=0; w_one= id_dest_two); } - else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } - if (condition) { - - // Stores the transposed value in the destination matrix - if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; - if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } - dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; - } - } - } - } -} - -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/kernels/level3/transpose.opencl b/src/kernels/level3/transpose.opencl deleted file mode 100644 index d726f7ec..00000000 --- a/src/kernels/level3/transpose.opencl +++ /dev/null @@ -1,149 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common kernels shared among different BLAS functions. This file contains -// kernels to transpose matrices. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef TRA_DIM - #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef TRA_WPT - #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other -#endif -#ifndef TRA_PAD - #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif -#ifndef TRA_SHUFFLE - #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts -#endif - -// ================================================================================================= - -// Data-widths -#if TRA_WPT == 1 - typedef real realT; -#elif TRA_WPT == 2 - typedef real2 realT; -#elif TRA_WPT == 4 - typedef real4 realT; -#elif TRA_WPT == 8 - typedef real8 realT; -#elif TRA_WPT == 16 - typedef real16 realT; -#endif - -// ================================================================================================= - -// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without -// offset. A more general version is available in 'padtranspose.opencl'. -__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) -__kernel void TransposeMatrix(const int ld, - __global const realT* restrict src, - __global realT* dest) { - - // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different - // way over workgroups, breaking memory-bank dependencies. - const int gid0 = get_group_id(0); - #if TRA_SHUFFLE == 1 - const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); - #else - const int gid1 = get_group_id(1); - #endif - - // Local memory to store a tile of the matrix (for coalescing) - __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD]; - - // Loops over the work per thread - #pragma unroll - for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// a kernel to transpose matrices. This is a 'fast' version with restrictions, see the +// 'padtranspose.opencl' file for a general transpose kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Data-widths +#if TRA_WPT == 1 + typedef real realT; +#elif TRA_WPT == 2 + typedef real2 realT; +#elif TRA_WPT == 4 + typedef real4 realT; +#elif TRA_WPT == 8 + typedef real8 realT; +#elif TRA_WPT == 16 + typedef real16 realT; +#endif + +// ================================================================================================= + +// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without +// offset. A more general version is available in 'padtranspose.opencl'. +__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +__kernel void TransposeMatrixFast(const int ld, + __global const realT* restrict src, + __global realT* dest) { + + // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different + // way over workgroups, breaking memory-bank dependencies. + const int gid0 = get_group_id(0); + #if TRA_SHUFFLE == 1 + const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + #else + const int gid1 = get_group_id(1); + #endif + + // Local memory to store a tile of the matrix (for coalescing) + __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD]; + + // Loops over the work per thread + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to transpose matrices in various ways, including: +// 1) transposing into a larger matrix by adding padding +// 2) transposing into a smaller matrix by optionally removing padding. This is the general version +// without restrictions, see the 'transpose.opencl' file for a faster but more restricted +// transpose kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Transposes a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the transposed source matrix dimensions. +__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +__kernel void TransposePadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int do_conjugate) { + + // Local memory to store a tile of the matrix (for coalescing) + __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; + + // Loop over the work per thread + #pragma unroll + for (int w_one=0; w_one= id_dest_two); } + else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } + #endif + if (condition) { + + // Stores the transposed value in the destination matrix + if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { + real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } + dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + } + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= -- cgit v1.2.3