From b894611ad196fc9cac40bf5861a23b35c52c52b5 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 14 Jun 2016 18:17:58 +0200 Subject: Re-organised the level-3 supporting kernels (copy, pad, transpose, convert) and renamed files and functions appropriately --- src/kernels/level3/convert_hermitian.opencl | 106 ++++++++ src/kernels/level3/convert_symmetric.opencl | 94 +++++++ src/kernels/level3/convert_triangular.opencl | 98 ++++++++ src/kernels/level3/copy.opencl | 73 ------ src/kernels/level3/copy_fast.opencl | 56 +++++ src/kernels/level3/copy_pad.opencl | 109 +++++++++ src/kernels/level3/level3.opencl | 81 ++++++ src/kernels/level3/pad.opencl | 353 --------------------------- src/kernels/level3/padtranspose.opencl | 164 ------------- src/kernels/level3/transpose.opencl | 149 ----------- src/kernels/level3/transpose_fast.opencl | 134 ++++++++++ src/kernels/level3/transpose_pad.opencl | 158 ++++++++++++ src/routine.cc | 8 +- src/routines/level3/xgemm.cc | 12 +- src/routines/level3/xher2k.cc | 9 +- src/routines/level3/xherk.cc | 9 +- src/routines/level3/xsyr2k.cc | 9 +- src/routines/level3/xsyrk.cc | 9 +- src/routines/level3/xtrmm.cc | 2 +- src/tuning/copy.cc | 119 --------- src/tuning/copy_fast.cc | 120 +++++++++ src/tuning/copy_pad.cc | 128 ++++++++++ src/tuning/pad.cc | 127 ---------- src/tuning/padtranspose.cc | 131 ---------- src/tuning/transpose.cc | 124 ---------- src/tuning/transpose_fast.cc | 125 ++++++++++ src/tuning/transpose_pad.cc | 132 ++++++++++ 27 files changed, 1374 insertions(+), 1265 deletions(-) create mode 100644 src/kernels/level3/convert_hermitian.opencl create mode 100644 src/kernels/level3/convert_symmetric.opencl create mode 100644 src/kernels/level3/convert_triangular.opencl delete mode 100644 src/kernels/level3/copy.opencl create mode 100644 src/kernels/level3/copy_fast.opencl create mode 100644 src/kernels/level3/copy_pad.opencl create mode 100644 src/kernels/level3/level3.opencl delete mode 100644 src/kernels/level3/pad.opencl delete mode 100644 src/kernels/level3/padtranspose.opencl delete mode 100644 src/kernels/level3/transpose.opencl create mode 100644 src/kernels/level3/transpose_fast.opencl create mode 100644 src/kernels/level3/transpose_pad.opencl delete mode 100644 src/tuning/copy.cc create mode 100644 src/tuning/copy_fast.cc create mode 100644 src/tuning/copy_pad.cc delete mode 100644 src/tuning/pad.cc delete mode 100644 src/tuning/padtranspose.cc delete mode 100644 src/tuning/transpose.cc create mode 100644 src/tuning/transpose_fast.cc create mode 100644 src/tuning/transpose_pad.cc (limited to 'src') diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl new file mode 100644 index 00000000..53cc161a --- /dev/null +++ b/src/kernels/level3/convert_hermitian.opencl @@ -0,0 +1,106 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains kernels to convert hermitian matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464) + +// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void HermLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one +// +// This file contains kernels to convert symmetric matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_SYMM) + +// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void SymmLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one +// +// This file contains kernels to convert triangular matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_TRMM) + +// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void TriaLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one -// -// This file contains the common kernels shared among different BLAS routines. This file contains -// kernels to copy matrices. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef COPY_DIMX - #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef COPY_DIMY - #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef COPY_WPT - #define COPY_WPT 1 // Work per thread in the first dimension (x) -#endif -#ifndef COPY_VW - #define COPY_VW 1 // Vector width in the second dimension (y) -#endif - -// ================================================================================================= - -// Data-widths -#if COPY_VW == 1 - typedef real realC; -#elif COPY_VW == 2 - typedef real2 realC; -#elif COPY_VW == 4 - typedef real4 realC; -#elif COPY_VW == 8 - typedef real8 realC; -#elif COPY_VW == 16 - typedef real16 realC; -#endif - -// ================================================================================================= - -// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of -// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. -__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) -__kernel void CopyMatrix(const int ld, - __global const realC* restrict src, - __global realC* dest) { - #pragma unroll - for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Data-widths +#if COPY_VW == 1 + typedef real realC; +#elif COPY_VW == 2 + typedef real2 realC; +#elif COPY_VW == 4 + typedef real4 realC; +#elif COPY_VW == 8 + typedef real8 realC; +#elif COPY_VW == 16 + typedef real16 realC; +#endif + +// ================================================================================================= + +// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of +// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. +__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +__kernel void CopyMatrixFast(const int ld, + __global const realC* restrict src, + __global realC* dest) { + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to copy and pad matrices in various ways, including: +// 1) copying into a larger matrix by adding padding +// 2) copying into a smaller matrix by optionally removing padding. This is the general version +// without restrictions, see the 'copy.opencl' file for a faster but more restricted copy kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Copies a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld +// value and offset can be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int do_conjugate) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one= id_one); } + else if (lower == 1) { condition = (id_two <= id_one); } + #endif + if (condition) { + + // Copies the value into the destination matrix. This is always within bounds of the source + // matrix, as we know that the destination matrix is smaller or equal to the source. + if (id_two < dest_two && id_one < dest_one) { + real value = src[id_two*src_ld + id_one + src_offset]; + if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl new file mode 100644 index 00000000..bf14ab12 --- /dev/null +++ b/src/kernels/level3/level3.opencl @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the common functions and parameters specific for level 3 BLAS kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. + +// For the 'fast' copy kernel +#ifndef COPY_DIMX + #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef COPY_DIMY + #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef COPY_WPT + #define COPY_WPT 1 // Work per thread in the first dimension (x) +#endif +#ifndef COPY_VW + #define COPY_VW 1 // Vector width in the second dimension (y) +#endif + +// For the padding/copy kernels and the conversion kernels +#ifndef PAD_DIMX + #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef PAD_DIMY + #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef PAD_WPTX + #define PAD_WPTX 1 // Work per thread in the first dimension (x) +#endif +#ifndef PAD_WPTY + #define PAD_WPTY 1 // Work per thread in the second dimension (y) +#endif + +// For the 'fast' transpose kernel +#ifndef TRA_DIM + #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef TRA_WPT + #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other +#endif +#ifndef TRA_PAD + #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif +#ifndef TRA_SHUFFLE + #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts +#endif + +// For the padding/transpose kernels +#ifndef PADTRA_TILE + #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef PADTRA_WPT + #define PADTRA_WPT 1 // Amount of work per thread +#endif +#ifndef PADTRA_PAD + #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/pad.opencl b/src/kernels/level3/pad.opencl deleted file mode 100644 index eefddce4..00000000 --- a/src/kernels/level3/pad.opencl +++ /dev/null @@ -1,353 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common kernels shared among different BLAS routines. This file contains -// kernels to copy and pad matrices in various ways, including: -// 1) copying into a larger matrix by adding padding -// 2) copying into a smaller matrix by removing padding -// 3) from upper/lower triangle into a full matrix -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PAD_DIMX - #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef PAD_DIMY - #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef PAD_WPTX - #define PAD_WPTX 1 // Work per thread in the first dimension (x) -#endif -#ifndef PAD_WPTY - #define PAD_WPTY 1 // Work per thread in the second dimension (y) -#endif - -// ================================================================================================= - -// Copies a matrix from source to destination. The output is padded with zero values in case the -// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld -// value and offset can be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void PadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int do_conjugate) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one= id_one); } - else if (lower == 1) { condition = (id_two <= id_one); } - if (condition) { - - // Copies the value into the destination matrix. This is always within bounds of the source - // matrix, as we know that the destination matrix is smaller than the source. - if (id_two < dest_two && id_one < dest_one) { - real value = src[id_two*src_ld + id_one + src_offset]; - if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } - dest[id_two*dest_ld + id_one + dest_offset] = value; - } - } - } - } -} - -// ================================================================================================= -#if defined(ROUTINE_SYMM) - -// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is -// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one -// -// This file contains the common kernels shared among different BLAS functions. This file contains -// kernels to transpose matrices in various ways, including: -// 1) transposing into a larger matrix by adding padding -// 2) transposing into a smaller matrix by removing padding -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PADTRA_TILE - #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef PADTRA_WPT - #define PADTRA_WPT 1 // Amount of work per thread -#endif -#ifndef PADTRA_PAD - #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif - -// ================================================================================================= - -// Same as PadCopyMatrix, but now also does the transpose -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void PadTransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int do_conjugate) { - - // Local memory to store a tile of the matrix (for coalescing) - __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; - - // Loop over the work per thread - #pragma unroll - for (int w_one=0; w_one= id_dest_two); } - else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } - if (condition) { - - // Stores the transposed value in the destination matrix - if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; - if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } - dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; - } - } - } - } -} - -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/kernels/level3/transpose.opencl b/src/kernels/level3/transpose.opencl deleted file mode 100644 index d726f7ec..00000000 --- a/src/kernels/level3/transpose.opencl +++ /dev/null @@ -1,149 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common kernels shared among different BLAS functions. This file contains -// kernels to transpose matrices. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef TRA_DIM - #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef TRA_WPT - #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other -#endif -#ifndef TRA_PAD - #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif -#ifndef TRA_SHUFFLE - #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts -#endif - -// ================================================================================================= - -// Data-widths -#if TRA_WPT == 1 - typedef real realT; -#elif TRA_WPT == 2 - typedef real2 realT; -#elif TRA_WPT == 4 - typedef real4 realT; -#elif TRA_WPT == 8 - typedef real8 realT; -#elif TRA_WPT == 16 - typedef real16 realT; -#endif - -// ================================================================================================= - -// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without -// offset. A more general version is available in 'padtranspose.opencl'. -__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) -__kernel void TransposeMatrix(const int ld, - __global const realT* restrict src, - __global realT* dest) { - - // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different - // way over workgroups, breaking memory-bank dependencies. - const int gid0 = get_group_id(0); - #if TRA_SHUFFLE == 1 - const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); - #else - const int gid1 = get_group_id(1); - #endif - - // Local memory to store a tile of the matrix (for coalescing) - __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD]; - - // Loops over the work per thread - #pragma unroll - for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// a kernel to transpose matrices. This is a 'fast' version with restrictions, see the +// 'padtranspose.opencl' file for a general transpose kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Data-widths +#if TRA_WPT == 1 + typedef real realT; +#elif TRA_WPT == 2 + typedef real2 realT; +#elif TRA_WPT == 4 + typedef real4 realT; +#elif TRA_WPT == 8 + typedef real8 realT; +#elif TRA_WPT == 16 + typedef real16 realT; +#endif + +// ================================================================================================= + +// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without +// offset. A more general version is available in 'padtranspose.opencl'. +__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +__kernel void TransposeMatrixFast(const int ld, + __global const realT* restrict src, + __global realT* dest) { + + // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different + // way over workgroups, breaking memory-bank dependencies. + const int gid0 = get_group_id(0); + #if TRA_SHUFFLE == 1 + const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + #else + const int gid1 = get_group_id(1); + #endif + + // Local memory to store a tile of the matrix (for coalescing) + __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD]; + + // Loops over the work per thread + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to transpose matrices in various ways, including: +// 1) transposing into a larger matrix by adding padding +// 2) transposing into a smaller matrix by optionally removing padding. This is the general version +// without restrictions, see the 'transpose.opencl' file for a faster but more restricted +// transpose kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Transposes a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the transposed source matrix dimensions. +__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +__kernel void TransposePadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int do_conjugate) { + + // Local memory to store a tile of the matrix (for coalescing) + __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; + + // Loop over the work per thread + #pragma unroll + for (int w_one=0; w_one= id_dest_two); } + else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } + #endif + if (condition) { + + // Stores the transposed value in the destination matrix + if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { + real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } + dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + } + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/routine.cc b/src/routine.cc index ca283b52..dee1f090 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -319,11 +319,11 @@ StatusCode Routine::PadCopyTransposeMatrix(EventPointer event, std::vector::PadCopyTransposeMatrix(EventPointer event, std::vector Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + #include "../../kernels/level3/convert_symmetric.opencl" + #include "../../kernels/level3/convert_triangular.opencl" + #include "../../kernels/level3/convert_hermitian.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 1acba517..e83d105f 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -30,10 +30,11 @@ template Xher2k::Xher2k(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index ea1aa614..9ab50dd2 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -30,10 +30,11 @@ template Xherk::Xherk(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 4f86bac5..49fbe64b 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -33,10 +33,11 @@ template Xsyr2k::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 52cb58c0..9913c7ca 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -33,10 +33,11 @@ template Xsyrk::Xsyrk(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 18cbb1c0..27ecb4ed 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -51,7 +51,7 @@ StatusCode Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared"; + auto kernel_name = (is_upper) ? "TriaUpperToSquared" : "TriaLowerToSquared"; // Determines whether or not the triangular matrix is unit-diagonal auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc deleted file mode 100644 index 09cdecf1..00000000 --- a/src/tuning/copy.cc +++ /dev/null @@ -1,119 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneCopy { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "copy"; } - static std::string KernelName() { return "CopyMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/copy.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); - tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/copy_fast.cc b/src/tuning/copy_fast.cc new file mode 100644 index 00000000..2da707be --- /dev/null +++ b/src/tuning/copy_fast.cc @@ -0,0 +1,120 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. +// +// ================================================================================================= + +#include +#include + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneCopy { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "copy"; } + static std::string KernelName() { return "CopyMatrixFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); + tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/copy_pad.cc b/src/tuning/copy_pad.cc new file mode 100644 index 00000000..ec392471 --- /dev/null +++ b/src/tuning/copy_pad.cc @@ -0,0 +1,128 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. +// +// ================================================================================================= + +#include +#include + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TunePad { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "pad"; } + static std::string KernelName() { return "CopyPadMatrix"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_pad.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); + tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); + tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); + tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentScalar(0); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc deleted file mode 100644 index 075688db..00000000 --- a/src/tuning/pad.cc +++ /dev/null @@ -1,127 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TunePad { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "pad"; } - static std::string KernelName() { return "PadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/pad.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); - tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); - tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(0); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc deleted file mode 100644 index a970f982..00000000 --- a/src/tuning/padtranspose.cc +++ /dev/null @@ -1,131 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TunePadTranspose { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "padtranspose"; } - static std::string KernelName() { return "PadTransposeMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/padtranspose.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64}); - tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(0); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc deleted file mode 100644 index d217a3df..00000000 --- a/src/tuning/transpose.cc +++ /dev/null @@ -1,124 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. -// -// ================================================================================================= - -#include -#include - -#include "internal/utilities.h" -#include "internal/tuning.h" - -namespace clblast { -// ================================================================================================= - -// See comment at top of file for a description of the class -template -class TuneTranspose { - public: - - // The representative kernel and the source code - static std::string KernelFamily() { return "transpose"; } - static std::string KernelName() { return "TransposeMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/transpose.opencl" - ; - } - - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN}; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64}); - tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "TRA_PAD", {0, 1}); - tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); - } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); - } - - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } - - // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } -}; - -// ================================================================================================= -} // namespace clblast - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { - switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; - } - return 0; -} - -// ================================================================================================= diff --git a/src/tuning/transpose_fast.cc b/src/tuning/transpose_fast.cc new file mode 100644 index 00000000..1a5260c1 --- /dev/null +++ b/src/tuning/transpose_fast.cc @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. +// +// ================================================================================================= + +#include +#include + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TuneTranspose { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "transpose"; } + static std::string KernelName() { return "TransposeMatrixFast"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_fast.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64}); + tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "TRA_PAD", {0, 1}); + tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentOutput(b_mat); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/src/tuning/transpose_pad.cc b/src/tuning/transpose_pad.cc new file mode 100644 index 00000000..08b52510 --- /dev/null +++ b/src/tuning/transpose_pad.cc @@ -0,0 +1,132 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. +// +// ================================================================================================= + +#include +#include + +#include "internal/utilities.h" +#include "internal/tuning.h" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template +class TunePadTranspose { + public: + + // The representative kernel and the source code + static std::string KernelFamily() { return "padtranspose"; } + static std::string KernelName() { return "TransposePadMatrix"; } + static std::string GetSources() { + return + #include "../src/kernels/common.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_pad.opencl" + ; + } + + // The list of arguments relevant for this routine + static std::vector GetOptions() { return {kArgM, kArgN}; } + + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + + // Sets the default values for the arguments + static size_t DefaultM() { return 1024; } + static size_t DefaultN() { return 1024; } + static size_t DefaultK() { return 1; } // N/A for this kernel + static double DefaultFraction() { return 1.0; } // N/A for this kernel + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } + static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel + static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel + + // Sets the tuning parameters and their possible values + static void SetParameters(cltune::Tuner &tuner, const size_t id) { + tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64}); + tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); + tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); + } + + // Sets the constraints and local memory size + static void SetConstraints(cltune::Tuner &, const size_t) { } + static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { + auto LocalMemorySize = [args] (std::vector v) { + return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); + }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); + } + + // Sets the base thread configuration + static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } + static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } + static std::vector LocalSize() { return {1, 1}; } + static std::vector LocalSizeRef() { return {8, 8}; } + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector>; + static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } + static TransformVector DivLocal() { return {}; } + static TransformVector MulGlobal() { return {}; } + static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } + + // Sets the kernel's arguments + static void SetArguments(cltune::Tuner &tuner, const Arguments &args, + std::vector &, std::vector &, + std::vector &a_mat, std::vector &b_mat, std::vector &, + std::vector &) { + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentInput(a_mat); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(static_cast(args.m)); + tuner.AddArgumentScalar(static_cast(args.n)); + tuner.AddArgumentScalar(0); + tuner.AddArgumentOutput(b_mat); + tuner.AddArgumentScalar(0); + } + + // Describes how to compute the performance metrics + static size_t GetMetric(const Arguments &args) { + return 2 * args.m * args.n * GetBytes(args.precision); + } + static std::string PerformanceUnit() { return "GB/s"; } +}; + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + switch(clblast::GetPrecision(argc, argv)) { + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= -- cgit v1.2.3