diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-08-20 12:50:31 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-08-20 12:50:31 +0200 |
commit | 6eca53ee235e8105a2822f725beb23d4ab7a7bd6 (patch) | |
tree | 03c49539feb9b34d149714a49a2f35292cf7f29a /src/kernels/level3 | |
parent | 1ec21421d7dedd715badbc7cf8f555d3b76a022b (diff) | |
parent | 57f1aa76857cf0566e05b43b9b2a98a3a6139c8b (diff) |
Merge branch 'master' of https://github.com/dvasschemacq/CLBlast into dvasschemacq-master
Conflicts:
src/kernels/level1/xaxpy.opencl
src/kernels/level2/xgemv.opencl
src/kernels/level2/xgemv_fast.opencl
src/kernels/level2/xger.opencl
src/kernels/level2/xher.opencl
src/kernels/level2/xher2.opencl
src/kernels/level3/xgemm_part2.opencl
Diffstat (limited to 'src/kernels/level3')
-rw-r--r-- | src/kernels/level3/convert_hermitian.opencl | 28 | ||||
-rw-r--r-- | src/kernels/level3/convert_symmetric.opencl | 28 | ||||
-rw-r--r-- | src/kernels/level3/convert_triangular.opencl | 32 | ||||
-rw-r--r-- | src/kernels/level3/copy_fast.opencl | 10 | ||||
-rw-r--r-- | src/kernels/level3/copy_pad.opencl | 38 | ||||
-rw-r--r-- | src/kernels/level3/transpose_fast.opencl | 10 | ||||
-rw-r--r-- | src/kernels/level3/transpose_pad.opencl | 38 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part2.opencl | 42 |
8 files changed, 113 insertions, 113 deletions
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index 53cc161a..ed2ded98 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -20,13 +20,13 @@ R"( // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void HermLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll @@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void HermUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index c6ce93ca..8ae53b37 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -20,13 +20,13 @@ R"( // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void SymmLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll @@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void SymmUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index fdd2461a..f848dcc1 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -20,14 +20,14 @@ R"( // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TriaLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void TriaLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll @@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TriaUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void TriaUpperToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index dd975bf1..695b9003 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -35,11 +35,11 @@ R"( // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. -__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) -__kernel void CopyMatrixFast(const int ld, - __global const realC* restrict src, - __global realC* dest, - const real_arg arg_alpha) { +__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +void CopyMatrixFast(const int ld, + __global const realC* restrict src, + __global realC* dest, + const real_arg arg_alpha) { const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w_one=0; w_one<COPY_WPT; ++w_one) { diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index d0771c31..29480b25 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -24,15 +24,15 @@ R"( // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void CopyPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyPadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { const real alpha = GetRealArg(arg_alpha); // Loops over the work per thread in both dimensions @@ -65,16 +65,16 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two, // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void CopyMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { const real alpha = GetRealArg(arg_alpha); // Loops over the work per thread in both dimensions diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index ea343533..70156d3a 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -36,11 +36,11 @@ R"( // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. -__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) -__kernel void TransposeMatrixFast(const int ld, - __global const realT* restrict src, - __global realT* dest, - const real_arg arg_alpha) { +__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +void TransposeMatrixFast(const int ld, + __global const realT* restrict src, + __global realT* dest, + const real_arg arg_alpha) { const real alpha = GetRealArg(arg_alpha); // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 2e20d667..ba0b7062 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,15 +24,15 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void TransposePadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int do_conjugate) { +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposePadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { const real alpha = GetRealArg(arg_alpha); // Local memory to store a tile of the matrix (for coalescing) @@ -88,16 +88,16 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two, // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void TransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const real_arg arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposeMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { const real alpha = GetRealArg(arg_alpha); // Local memory to store a tile of the matrix (for coalescing) diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 87e28cb5..a1559b54 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -268,13 +268,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) // Main entry point of the kernel. This is the upper-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmUpper(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmUpper(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -308,13 +308,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK, } // Main entry point of the kernel. This is the lower-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmLower(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmLower(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -352,13 +352,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK, #else // Main entry point of the kernel. This is the regular full version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); |