diff options
Diffstat (limited to 'src/kernels/level3')
-rw-r--r-- | src/kernels/level3/convert_hermitian.opencl | 20 | ||||
-rw-r--r-- | src/kernels/level3/convert_symmetric.opencl | 20 | ||||
-rw-r--r-- | src/kernels/level3/convert_triangular.opencl | 24 | ||||
-rw-r--r-- | src/kernels/level3/copy_fast.opencl | 8 | ||||
-rw-r--r-- | src/kernels/level3/copy_pad.opencl | 34 | ||||
-rw-r--r-- | src/kernels/level3/transpose_fast.opencl | 8 | ||||
-rw-r--r-- | src/kernels/level3/transpose_pad.opencl | 34 | ||||
-rw-r--r-- | src/kernels/level3/xgemm_part2.opencl | 42 |
8 files changed, 95 insertions, 95 deletions
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index 272905eb..ed2ded98 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -22,11 +22,11 @@ R"( // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void HermLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll @@ -61,11 +61,11 @@ void HermLowerToSquared(const int src_dim, // Same as above, but now the matrix' data is stored in the upper-triangle __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void HermUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index ea6f7dbd..8ae53b37 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -22,11 +22,11 @@ R"( // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void SymmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll @@ -55,11 +55,11 @@ void SymmLowerToSquared(const int src_dim, // Same as above, but now the matrix' data is stored in the upper-triangle __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void SymmUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index 858228bb..f848dcc1 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -22,12 +22,12 @@ R"( // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void TriaLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll @@ -57,12 +57,12 @@ void TriaLowerToSquared(const int src_dim, // Same as above, but now the matrix' data is stored in the upper-triangle __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void TriaUpperToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest, - const int unit_diagonal) { + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest, + const int unit_diagonal) { // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index 54f9d987..695b9003 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -37,10 +37,10 @@ R"( // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. __kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) void CopyMatrixFast(const int ld, - __global const realC* restrict src, - __global realC* dest, - const __constant real* restrict arg_alpha) { - const real alpha = arg_alpha[0]; + __global const realC* restrict src, + __global realC* dest, + const real_arg arg_alpha) { + const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w_one=0; w_one<COPY_WPT; ++w_one) { const int id_one = get_global_id(0); diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index 92279ecf..29480b25 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -26,14 +26,14 @@ R"( // value and offset can be different. __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void CopyPadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const __constant real* restrict arg_alpha, - const int do_conjugate) { - const real alpha = arg_alpha[0]; + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { + const real alpha = GetRealArg(arg_alpha); // Loops over the work per thread in both dimensions #pragma unroll @@ -67,15 +67,15 @@ void CopyPadMatrix(const int src_one, const int src_two, // be different. __kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) void CopyMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const __constant real* restrict arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { - const real alpha = arg_alpha[0]; + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { + const real alpha = GetRealArg(arg_alpha); // Loops over the work per thread in both dimensions #pragma unroll diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index a2007408..70156d3a 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -38,10 +38,10 @@ R"( // offset. A more general version is available in 'padtranspose.opencl'. __kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) void TransposeMatrixFast(const int ld, - __global const realT* restrict src, - __global realT* dest, - const __constant real* restrict arg_alpha) { - const real alpha = arg_alpha[0]; + __global const realT* restrict src, + __global realT* dest, + const real_arg arg_alpha) { + const real alpha = GetRealArg(arg_alpha); // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different // way over workgroups, breaking memory-bank dependencies. diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 63cc6e9a..ba0b7062 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -26,14 +26,14 @@ R"( // destination matrix dimensions are larger than the transposed source matrix dimensions. __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) void TransposePadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const __constant real* restrict arg_alpha, - const int do_conjugate) { - const real alpha = arg_alpha[0]; + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int do_conjugate) { + const real alpha = GetRealArg(arg_alpha); // Local memory to store a tile of the matrix (for coalescing) __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; @@ -90,15 +90,15 @@ void TransposePadMatrix(const int src_one, const int src_two, // matrix. This kernel optionally checks for upper/lower triangular matrices. __kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) void TransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const __constant real* restrict arg_alpha, - const int upper, const int lower, - const int diagonal_imag_zero) { - const real alpha = arg_alpha[0]; + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const real_arg arg_alpha, + const int upper, const int lower, + const int diagonal_imag_zero) { + const real alpha = GetRealArg(arg_alpha); // Local memory to store a tile of the matrix (for coalescing) __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 60e38c06..a1559b54 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -270,13 +270,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the upper-triangular version. __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) void XgemmUpper(const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Skip these threads if they do not contain threads contributing to the upper-triangle if (GetGroupID1()*NWG < GetGroupID0()*MWG) { @@ -310,13 +310,13 @@ void XgemmUpper(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the lower-triangular version. __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) void XgemmLower(const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Skip these threads if they do not contain threads contributing to the lower-triangle if (GetGroupID1()*NWG > GetGroupID0()*MWG) { @@ -354,13 +354,13 @@ void XgemmLower(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the regular full version. __kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Allocates workgroup-private memory (local memory) #if SA == 1 |