diff options
21 files changed, 563 insertions, 454 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 691ca40e..66547cc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,7 @@ include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS}) # ================================================================================================== # Sets the supported routines and the used kernels. New routines and kernels should be added here. -set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv) +set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl new file mode 100644 index 00000000..53cc161a --- /dev/null +++ b/src/kernels/level3/convert_hermitian.opencl @@ -0,0 +1,106 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains kernels to convert hermitian matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464) + +// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void HermLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the lower-hermitian matrix + real result; + SetToZero(result); + if (id_two < src_dim && id_one < src_dim) { + if (id_two <= id_one) { + result = src[id_two*src_ld + id_one + src_offset]; + if (id_one == id_two) { result.y = ZERO; } + } + else { + result = src[id_one*src_ld + id_two + src_offset]; + COMPLEX_CONJUGATE(result); + } + } + + // Stores the result in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = result; + } + } + } +} + +// Same as above, but now the matrix' data is stored in the upper-triangle +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void HermUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the upper-hermitian matrix + real result; + SetToZero(result); + if (id_two < src_dim && id_one < src_dim) { + if (id_one <= id_two) { + result = src[id_two*src_ld + id_one + src_offset]; + if (id_one == id_two) { result.y = ZERO; } + } + else { + result = src[id_one*src_ld + id_two + src_offset]; + COMPLEX_CONJUGATE(result); + } + } + + // Stores the result in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = result; + } + } + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl new file mode 100644 index 00000000..c6ce93ca --- /dev/null +++ b/src/kernels/level3/convert_symmetric.opencl @@ -0,0 +1,94 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains kernels to convert symmetric matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_SYMM) + +// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void SymmLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the lower-symmetric matrix + real result; + SetToZero(result); + if (id_two < src_dim && id_one < src_dim) { + if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; } + else { result = src[id_one*src_ld + id_two + src_offset]; } + } + + // Stores the result in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = result; + } + } + } +} + +// Same as above, but now the matrix' data is stored in the upper-triangle +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void SymmUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the upper-symmetric matrix + real result; + SetToZero(result); + if (id_two < src_dim && id_one < src_dim) { + if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; } + else { result = src[id_one*src_ld + id_two + src_offset]; } + } + + // Stores the result in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = result; + } + } + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl new file mode 100644 index 00000000..fdd2461a --- /dev/null +++ b/src/kernels/level3/convert_triangular.opencl @@ -0,0 +1,98 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains kernels to convert triangular matrices to/from general matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +#if defined(ROUTINE_TRMM) + +// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void TriaLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the lower-triangular matrix + real result; + SetToZero(result); + if (id_two < src_dim && id_one < src_dim) { + if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; } + if (id_two == id_one && unit_diagonal) { SetToOne(result); } + // Else: result is zero + } + + // Stores the result in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = result; + } + } + } +} + +// Same as above, but now the matrix' data is stored in the upper-triangle +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void TriaUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_dim && id_one < dest_dim) { + + // Loads data from the upper-triangular matrix + real result; + SetToZero(result); + if (id_two < src_dim && id_one < src_dim) { + if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; } + if (id_one == id_two && unit_diagonal) { SetToOne(result); } + // Else: result is zero + } + + // Stores the result in the destination matrix + dest[id_two*dest_ld + id_one + dest_offset] = result; + } + } + } +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/copy.opencl b/src/kernels/level3/copy_fast.opencl index 7dde688b..bfbfacd4 100644 --- a/src/kernels/level3/copy.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -18,23 +18,6 @@ R"( // ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef COPY_DIMX - #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef COPY_DIMY - #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef COPY_WPT - #define COPY_WPT 1 // Work per thread in the first dimension (x) -#endif -#ifndef COPY_VW - #define COPY_VW 1 // Vector width in the second dimension (y) -#endif - -// ================================================================================================= - // Data-widths #if COPY_VW == 1 typedef real realC; @@ -53,9 +36,9 @@ R"( // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) -__kernel void CopyMatrix(const int ld, - __global const realC* restrict src, - __global realC* dest) { +__kernel void CopyMatrixFast(const int ld, + __global const realC* restrict src, + __global realC* dest) { #pragma unroll for (int w_one=0; w_one<COPY_WPT; ++w_one) { const int id_one = get_global_id(0); diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl new file mode 100644 index 00000000..f211af0f --- /dev/null +++ b/src/kernels/level3/copy_pad.opencl @@ -0,0 +1,109 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to copy and pad matrices in various ways, including: +// 1) copying into a larger matrix by adding padding +// 2) copying into a smaller matrix by optionally removing padding. This is the general version +// without restrictions, see the 'copy.opencl' file for a faster but more restricted copy kernel. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Copies a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld +// value and offset can be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int do_conjugate) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + if (id_two < dest_two && id_one < dest_one) { + + // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the + // value to be written to zero. + real value; + SetToZero(value); + if (id_two < src_two && id_one < src_one) { + value = src[id_two*src_ld + id_one + src_offset]; + } + + // Stores the value in the destination matrix + if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } +} + +// ================================================================================================= + +// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but +// writes only the actual data back to the destination matrix. Again, the ld value and offset can +// be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int upper, const int lower, + const int diagonal_imag_zero) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one<PAD_WPTX; ++w_one) { + const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); + #pragma unroll + for (int w_two=0; w_two<PAD_WPTY; ++w_two) { + const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); + + // Masking in case of triangular matrices: updates only the upper or lower part + bool condition = true; + #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + if (upper == 1) { condition = (id_two >= id_one); } + else if (lower == 1) { condition = (id_two <= id_one); } + #endif + if (condition) { + + // Copies the value into the destination matrix. This is always within bounds of the source + // matrix, as we know that the destination matrix is smaller or equal to the source. + if (id_two < dest_two && id_one < dest_one) { + real value = src[id_two*src_ld + id_one + src_offset]; + if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/level3.opencl b/src/kernels/level3/level3.opencl new file mode 100644 index 00000000..bf14ab12 --- /dev/null +++ b/src/kernels/level3/level3.opencl @@ -0,0 +1,81 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains the common functions and parameters specific for level 3 BLAS kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. + +// For the 'fast' copy kernel +#ifndef COPY_DIMX + #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef COPY_DIMY + #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef COPY_WPT + #define COPY_WPT 1 // Work per thread in the first dimension (x) +#endif +#ifndef COPY_VW + #define COPY_VW 1 // Vector width in the second dimension (y) +#endif + +// For the padding/copy kernels and the conversion kernels +#ifndef PAD_DIMX + #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef PAD_DIMY + #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef PAD_WPTX + #define PAD_WPTX 1 // Work per thread in the first dimension (x) +#endif +#ifndef PAD_WPTY + #define PAD_WPTY 1 // Work per thread in the second dimension (y) +#endif + +// For the 'fast' transpose kernel +#ifndef TRA_DIM + #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef TRA_WPT + #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other +#endif +#ifndef TRA_PAD + #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif +#ifndef TRA_SHUFFLE + #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts +#endif + +// For the padding/transpose kernels +#ifndef PADTRA_TILE + #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef PADTRA_WPT + #define PADTRA_WPT 1 // Amount of work per thread +#endif +#ifndef PADTRA_PAD + #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/pad.opencl b/src/kernels/level3/pad.opencl deleted file mode 100644 index eefddce4..00000000 --- a/src/kernels/level3/pad.opencl +++ /dev/null @@ -1,353 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This file contains the common kernels shared among different BLAS routines. This file contains -// kernels to copy and pad matrices in various ways, including: -// 1) copying into a larger matrix by adding padding -// 2) copying into a smaller matrix by removing padding -// 3) from upper/lower triangle into a full matrix -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PAD_DIMX - #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef PAD_DIMY - #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef PAD_WPTX - #define PAD_WPTX 1 // Work per thread in the first dimension (x) -#endif -#ifndef PAD_WPTY - #define PAD_WPTY 1 // Work per thread in the second dimension (y) -#endif - -// ================================================================================================= - -// Copies a matrix from source to destination. The output is padded with zero values in case the -// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld -// value and offset can be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void PadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int do_conjugate) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - if (id_two < dest_two && id_one < dest_one) { - - // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the - // value to be written to zero. - real value; - SetToZero(value); - if (id_two < src_two && id_one < src_one) { - value = src[id_two*src_ld + id_one + src_offset]; - } - - // Stores the value in the destination matrix - if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); } - dest[id_two*dest_ld + id_one + dest_offset] = value; - } - } - } -} - -// ================================================================================================= - -// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but -// writes only the actual data back to the destination matrix. Again, the ld value and offset can -// be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void UnPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int upper, const int lower, - const int diagonal_imag_zero) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - - // Masking in case of triangular matrices: updates only the upper or lower part - bool condition = true; - if (upper == 1) { condition = (id_two >= id_one); } - else if (lower == 1) { condition = (id_two <= id_one); } - if (condition) { - - // Copies the value into the destination matrix. This is always within bounds of the source - // matrix, as we know that the destination matrix is smaller than the source. - if (id_two < dest_two && id_one < dest_one) { - real value = src[id_two*src_ld + id_one + src_offset]; - if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } - dest[id_two*dest_ld + id_one + dest_offset] = value; - } - } - } - } -} - -// ================================================================================================= -#if defined(ROUTINE_SYMM) - -// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is -// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - if (id_two < dest_dim && id_one < dest_dim) { - - // Loads data from the lower-symmetric matrix - real result; - SetToZero(result); - if (id_two < src_dim && id_one < src_dim) { - if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; } - else { result = src[id_one*src_ld + id_two + src_offset]; } - } - - // Stores the result in the destination matrix - dest[id_two*dest_ld + id_one + dest_offset] = result; - } - } - } -} - -// Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - if (id_two < dest_dim && id_one < dest_dim) { - - // Loads data from the upper-symmetric matrix - real result; - SetToZero(result); - if (id_two < src_dim && id_one < src_dim) { - if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; } - else { result = src[id_one*src_ld + id_two + src_offset]; } - } - - // Stores the result in the destination matrix - dest[id_two*dest_ld + id_one + dest_offset] = result; - } - } - } -} - -#endif -// ================================================================================================= -#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464) - -// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is -// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - if (id_two < dest_dim && id_one < dest_dim) { - - // Loads data from the lower-hermitian matrix - real result; - SetToZero(result); - if (id_two < src_dim && id_one < src_dim) { - if (id_two <= id_one) { - result = src[id_two*src_ld + id_one + src_offset]; - if (id_one == id_two) { result.y = ZERO; } - } - else { - result = src[id_one*src_ld + id_two + src_offset]; - COMPLEX_CONJUGATE(result); - } - } - - // Stores the result in the destination matrix - dest[id_two*dest_ld + id_one + dest_offset] = result; - } - } - } -} - -// Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - if (id_two < dest_dim && id_one < dest_dim) { - - // Loads data from the upper-hermitian matrix - real result; - SetToZero(result); - if (id_two < src_dim && id_one < src_dim) { - if (id_one <= id_two) { - result = src[id_two*src_ld + id_one + src_offset]; - if (id_one == id_two) { result.y = ZERO; } - } - else { - result = src[id_one*src_ld + id_two + src_offset]; - COMPLEX_CONJUGATE(result); - } - } - - // Stores the result in the destination matrix - dest[id_two*dest_ld + id_one + dest_offset] = result; - } - } - } -} - -#endif -// ================================================================================================= -#if defined(ROUTINE_TRMM) - -// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is -// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TrmmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - if (id_two < dest_dim && id_one < dest_dim) { - - // Loads data from the lower-triangular matrix - real result; - SetToZero(result); - if (id_two < src_dim && id_one < src_dim) { - if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; } - if (id_two == id_one && unit_diagonal) { SetToOne(result); } - // Else: result is zero - } - - // Stores the result in the destination matrix - dest[id_two*dest_ld + id_one + dest_offset] = result; - } - } - } -} - -// Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TrmmUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one<PAD_WPTX; ++w_one) { - const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0); - #pragma unroll - for (int w_two=0; w_two<PAD_WPTY; ++w_two) { - const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1); - if (id_two < dest_dim && id_one < dest_dim) { - - // Loads data from the upper-triangular matrix - real result; - SetToZero(result); - if (id_two < src_dim && id_one < src_dim) { - if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; } - if (id_one == id_two && unit_diagonal) { SetToOne(result); } - // Else: result is zero - } - - // Stores the result in the destination matrix - dest[id_two*dest_ld + id_one + dest_offset] = result; - } - } - } -} - -#endif -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/kernels/level3/transpose.opencl b/src/kernels/level3/transpose_fast.opencl index d726f7ec..08266461 100644 --- a/src/kernels/level3/transpose.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -8,7 +8,8 @@ // Cedric Nugteren <www.cedricnugteren.nl> // // This file contains the common kernels shared among different BLAS functions. This file contains -// kernels to transpose matrices. +// a kernel to transpose matrices. This is a 'fast' version with restrictions, see the +// 'padtranspose.opencl' file for a general transpose kernel. // // ================================================================================================= @@ -17,22 +18,6 @@ R"( // ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef TRA_DIM - #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef TRA_WPT - #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other -#endif -#ifndef TRA_PAD - #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif -#ifndef TRA_SHUFFLE - #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts -#endif - -// ================================================================================================= // Data-widths #if TRA_WPT == 1 @@ -52,9 +37,9 @@ R"( // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) -__kernel void TransposeMatrix(const int ld, - __global const realT* restrict src, - __global realT* dest) { +__kernel void TransposeMatrixFast(const int ld, + __global const realT* restrict src, + __global realT* dest) { // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different // way over workgroups, breaking memory-bank dependencies. diff --git a/src/kernels/level3/padtranspose.opencl b/src/kernels/level3/transpose_pad.opencl index a6b70f0b..38c23346 100644 --- a/src/kernels/level3/padtranspose.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -10,7 +10,9 @@ // This file contains the common kernels shared among different BLAS functions. This file contains // kernels to transpose matrices in various ways, including: // 1) transposing into a larger matrix by adding padding -// 2) transposing into a smaller matrix by removing padding +// 2) transposing into a smaller matrix by optionally removing padding. This is the general version +// without restrictions, see the 'transpose.opencl' file for a faster but more restricted +// transpose kernel. // // ================================================================================================= @@ -19,23 +21,11 @@ R"( // ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PADTRA_TILE - #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef PADTRA_WPT - #define PADTRA_WPT 1 // Amount of work per thread -#endif -#ifndef PADTRA_PAD - #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif -// ================================================================================================= - -// Same as PadCopyMatrix, but now also does the transpose +// Transposes a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the transposed source matrix dimensions. __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void PadTransposeMatrix(const int src_one, const int src_two, +__kernel void TransposePadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, @@ -93,16 +83,18 @@ __kernel void PadTransposeMatrix(const int src_one, const int src_two, // ================================================================================================= -// Same as UnPadCopyMatrix, but now also does the transpose +// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a +// padded source matrix, but only the actual data is written back to the transposed destination +// matrix. This kernel optionally checks for upper/lower triangular matrices. __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void UnPadTransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int upper, const int lower, - const int diagonal_imag_zero) { +__kernel void TransposeMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int upper, const int lower, + const int diagonal_imag_zero) { // Local memory to store a tile of the matrix (for coalescing) __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; @@ -141,8 +133,10 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two, // Masking in case of triangular matrices: updates only the upper or lower part bool condition = true; - if (upper == 1) { condition = (id_dest_one >= id_dest_two); } - else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } + #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + if (upper == 1) { condition = (id_dest_one >= id_dest_two); } + else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } + #endif if (condition) { // Stores the transposed value in the destination matrix diff --git a/src/routine.cc b/src/routine.cc index ca283b52..dee1f090 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -319,11 +319,11 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev IsMultiple(src_ld, db_["TRA_WPT"]) && IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) && IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) { - kernel_name = "TransposeMatrix"; + kernel_name = "TransposeMatrixFast"; } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix"; + kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix"; } } else { @@ -331,11 +331,11 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev IsMultiple(src_ld, db_["COPY_VW"]) && IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) && IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) { - kernel_name = "CopyMatrix"; + kernel_name = "CopyMatrixFast"; } else { use_fast_kernel = false; - kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix"; + kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix"; } } diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index ab36076c..d08b6038 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -33,10 +33,14 @@ template <typename T> Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" + #include "../../kernels/level3/convert_symmetric.opencl" + #include "../../kernels/level3/convert_triangular.opencl" + #include "../../kernels/level3/convert_hermitian.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 1acba517..e83d105f 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -30,10 +30,11 @@ template <typename T, typename U> Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index ea1aa614..9ab50dd2 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -30,10 +30,11 @@ template <typename T, typename U> Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 4f86bac5..49fbe64b 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -33,10 +33,11 @@ template <typename T> Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 52cb58c0..9913c7ca 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -33,10 +33,11 @@ template <typename T> Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name): Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/level3/copy.opencl" - #include "../../kernels/level3/pad.opencl" - #include "../../kernels/level3/transpose.opencl" - #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/level3.opencl" + #include "../../kernels/level3/copy_fast.opencl" + #include "../../kernels/level3/copy_pad.opencl" + #include "../../kernels/level3/transpose_fast.opencl" + #include "../../kernels/level3/transpose_pad.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" ; diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 18cbb1c0..27ecb4ed 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -51,7 +51,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) || (triangle == Triangle::kLower && layout == Layout::kRowMajor)); - auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared"; + auto kernel_name = (is_upper) ? "TriaUpperToSquared" : "TriaLowerToSquared"; // Determines whether or not the triangular matrix is unit-diagonal auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; diff --git a/src/tuning/copy.cc b/src/tuning/copy_fast.cc index 09cdecf1..2da707be 100644 --- a/src/tuning/copy.cc +++ b/src/tuning/copy_fast.cc @@ -27,11 +27,12 @@ class TuneCopy { // The representative kernel and the source code static std::string KernelFamily() { return "copy"; } - static std::string KernelName() { return "CopyMatrix"; } + static std::string KernelName() { return "CopyMatrixFast"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/copy.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_fast.opencl" ; } diff --git a/src/tuning/pad.cc b/src/tuning/copy_pad.cc index 075688db..ec392471 100644 --- a/src/tuning/pad.cc +++ b/src/tuning/copy_pad.cc @@ -27,11 +27,12 @@ class TunePad { // The representative kernel and the source code static std::string KernelFamily() { return "pad"; } - static std::string KernelName() { return "PadMatrix"; } + static std::string KernelName() { return "CopyPadMatrix"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/pad.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/copy_pad.opencl" ; } diff --git a/src/tuning/transpose.cc b/src/tuning/transpose_fast.cc index d217a3df..1a5260c1 100644 --- a/src/tuning/transpose.cc +++ b/src/tuning/transpose_fast.cc @@ -27,11 +27,12 @@ class TuneTranspose { // The representative kernel and the source code static std::string KernelFamily() { return "transpose"; } - static std::string KernelName() { return "TransposeMatrix"; } + static std::string KernelName() { return "TransposeMatrixFast"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/transpose.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_fast.opencl" ; } diff --git a/src/tuning/padtranspose.cc b/src/tuning/transpose_pad.cc index a970f982..08b52510 100644 --- a/src/tuning/padtranspose.cc +++ b/src/tuning/transpose_pad.cc @@ -27,11 +27,12 @@ class TunePadTranspose { // The representative kernel and the source code static std::string KernelFamily() { return "padtranspose"; } - static std::string KernelName() { return "PadTransposeMatrix"; } + static std::string KernelName() { return "TransposePadMatrix"; } static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/padtranspose.opencl" + #include "../src/kernels/level3/level3.opencl" + #include "../src/kernels/level3/transpose_pad.opencl" ; } |