diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-03-10 21:15:29 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-03-10 21:15:29 +0100 |
commit | de3500ed18ddb39261ffa270f460909571276462 (patch) | |
tree | b515368fcd1e39afb5805f67796b082ccc8066f9 | |
parent | 37228c90988509acef9e8a892a752300b7645210 (diff) | |
parent | 3846f44eaf389ee24a698d4947e5c16bd14c3d0e (diff) |
Merge pull request #141 from CNugteren/axpy_batched
Added the batched version of the AXPY routine
28 files changed, 960 insertions, 182 deletions
@@ -7,13 +7,14 @@ Development version (next release) - Fixed bugs in the half-precision routines HTBMV/HTPMV/HTRMV/HSYR2K/HTRMM - Tests now also exit with an error code when OpenCL errors or compilation errors occur - Tests now also check for the L2 error in case of half-precision -- Added the OverrideParameters function to the API to be able to supply custom tuning parmeters - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) -- Added level-2 routines: +- Added the OverrideParameters function to the API to be able to supply custom tuning parmeters +- Added triangular solver (level-2 & level-3) routines: * STRSV/DTRSV/CTRSV/ZTRSV (experimental, un-optimized) -- Added level-3 routines: * STRSM/DTRSM/CTRSM/ZTRSM (experimental, un-optimized) +- Added batched (non-BLAS) routines: + * SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED (batched version of AXPY) Version 0.10.0 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header diff --git a/CMakeLists.txt b/CMakeLists.txt index bf905bc8..ef6156dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,7 +159,7 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm xtrsm) -set(LEVELX_ROUTINES xomatcopy) +set(LEVELX_ROUTINES xomatcopy xaxpybatched) set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES}) set(PRECISIONS 32 64 3232 6464 16) diff --git a/doc/clblast.md b/doc/clblast.md index 1d7c0df2..120c0c2c 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -2903,6 +2903,72 @@ Requirements for OMATCOPY: +xAXPYBATCHED: Batched version of AXPY +------------- + +As AXPY, but multiple operations are batched together for better performance. + +C++ API: +``` +template <typename T> +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +``` + +C API: +``` +CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, + const float *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, + const double *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, + const cl_float2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, + const cl_double2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, + const cl_half *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) +``` + +Arguments to AXPYBATCHED: + +* `const size_t n`: Integer size argument. This value must be positive. +* `const T *alphas`: Input scalar constants. +* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector. +* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector. +* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0. +* `cl_mem y_buffer`: OpenCL buffer to store the output y vector. +* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector. +* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0. +* `const size_t batch_count`: Number of batches. This value must be positive. +* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on. +* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument. + + + ClearCache: Resets the cache of compiled binaries (auxiliary function) ------------- diff --git a/include/clblast.h b/include/clblast.h index 020f8e79..a1f14471 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -97,6 +97,7 @@ enum class StatusCode { kInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast + kInvalidBatchCount = -2049, // The batch count needs to be positive kInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel kMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel kInvalidLocalMemUsage = -2046, // Not enough local memory available on this device @@ -609,6 +610,15 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event = nullptr); +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template <typename T> +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event = nullptr); + // ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on diff --git a/include/clblast_c.h b/include/clblast_c.h index 12d03f81..4f21ba17 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -96,6 +96,7 @@ typedef enum CLBlastStatusCode_ { CLBlastInsufficientMemoryY = -1007, // Vector Y's OpenCL buffer is too small // Custom additional status codes for CLBlast + CLBlastInvalidBatchCount = -2049, // The batch count needs to be positive CLBlastInvalidOverrideKernel = -2048, // Trying to override parameters for an invalid kernel CLBlastMissingOverrideParameter = -2047, // Missing override parameter(s) for the target kernel CLBlastInvalidLocalMemUsage = -2046, // Not enough local memory available on this device @@ -1327,6 +1328,38 @@ CLBlastStatusCode PUBLIC_API CLBlastHomatcopy(const CLBlastLayout layout, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event); +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +CLBlastStatusCode PUBLIC_API CLBlastSaxpyBatched(const size_t n, + const float *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastDaxpyBatched(const size_t n, + const double *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastCaxpyBatched(const size_t n, + const cl_float2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastZaxpyBatched(const size_t n, + const cl_double2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); +CLBlastStatusCode PUBLIC_API CLBlastHaxpyBatched(const size_t n, + const cl_half *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event); + // ================================================================================================= // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 09c743bb..8dd5fc0c 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,7 +41,7 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [121, 75, 125, 23, 29, 41, 65, 32] +HEADER_LINES = [122, 76, 126, 23, 29, 41, 65, 32] FOOTER_LINES = [25, 138, 27, 38, 6, 6, 9, 2] HEADER_LINES_DOC = 0 FOOTER_LINES_DOC = 63 @@ -101,65 +101,68 @@ bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft # Populates a list of routines ROUTINES = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []), + Routine(False, True, False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, False, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, False, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, False, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, False, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, False, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), + Routine(True, True, False, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, False, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, False, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(True, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, False, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, False, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, False, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, False, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, False, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, False, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, False, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, False, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, False, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(True, True, False, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, False, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, False, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, False, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, False, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, False, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, False, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, False, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, False, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, False, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, False, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(True, True, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []), + Routine(True, True, False, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, False, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, False, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, False, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, False, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, False, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, False, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, False, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(True, True, False, "3", "trsm", T, [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "Solves the equation _A * X = alpha * B_ for the unknown _m_ by _n_ matrix X, in which _A_ is an _n_ by _n_ unit or non-unit triangular matrix and B is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + # Special routines: + Routine(True, True, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + # Batched routines: + Routine(True, True, True, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []), ]] @@ -207,9 +210,11 @@ def main(argv): if i == 5: body += cpp.wrapper_cblas(routine) if i == 6: - body += cpp.clblast_netlib_c_h(routine) + if not routine.batched: + body += cpp.clblast_netlib_c_h(routine) if i == 7: - body += cpp.clblast_netlib_c_cc(routine) + if not routine.batched: + body += cpp.clblast_netlib_c_cc(routine) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) @@ -219,7 +224,7 @@ def main(argv): for routine in ROUTINES[level - 1]: if routine.has_tests: level_string = cpp.LEVEL_NAMES[level - 1] - routine_suffix = "level" + level_string + "/x" + routine.name + ".cpp" + routine_suffix = "level" + level_string + "/x" + routine.lowercase_name() + ".cpp" # Correctness tests filename = library_root + "/test/correctness/routines/" + routine_suffix diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index c14d00a1..91fdf458 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -51,8 +51,10 @@ def clblast_cc(routine): result += routine.routine_header_cpp(12, "") + " {" + NL result += " try {" + NL result += " auto queue_cpp = Queue(*queue);" + NL - result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL - result += " routine.Do" + routine.name.capitalize() + "(" + result += " auto routine = X" + routine.plain_name() + "<" + routine.template.template + ">(queue_cpp, event);" + NL + if routine.batched: + result += " " + (NL + " ").join(routine.batched_transform_to_cpp()) + NL + result += " routine.Do" + routine.capitalized_name() + "(" result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()]) result += ");" + NL result += " return StatusCode::kSuccess;" + NL @@ -63,7 +65,7 @@ def clblast_cc(routine): result += "}" + NL for flavour in routine.flavours: indent2 = " " * (34 + routine.length() + len(flavour.template)) - result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">(" + result += "template StatusCode PUBLIC_API " + routine.capitalized_name() + "<" + flavour.template + ">(" result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL return result @@ -84,9 +86,11 @@ def clblast_c_cc(routine): template = "<" + flavour.template + ">" if routine.no_scalars() else "" indent = " " * (16 + routine.length() + len(template)) result += routine.routine_header_c(flavour, 27, "") + " {" + NL + if routine.batched: + result += " " + (NL + " ").join(routine.batched_transform_to_complex(flavour)) + NL result += " try {" + NL result += " return static_cast<CLBlastStatusCode>(" + NL - result += " clblast::" + routine.name.capitalize() + template + "(" + result += " clblast::" + routine.capitalized_name() + template + "(" result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) result += "," + NL + indent + "queue, event)" + NL result += " );" + NL @@ -290,7 +294,7 @@ def performance_test(routine, level_string): """Generates the body of a performance test for a specific routine""" result = "" result += "#include \"test/performance/client.hpp\"" + NL - result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL + result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL result += "// Shortcuts to the clblast namespace" + NL result += "using float2 = clblast::float2;" + NL result += "using double2 = clblast::double2;" + NL + NL @@ -304,7 +308,7 @@ def performance_test(routine, level_string): found = False for flavour in routine.flavours: if flavour.precision_name == precision: - result += NL + " clblast::RunClient<clblast::TestX" + routine.name + flavour.test_template() + result += NL + " clblast::RunClient<clblast::TestX" + routine.plain_name() + flavour.test_template() result += ">(argc, argv); break;" + NL found = True if not found: @@ -319,7 +323,7 @@ def correctness_test(routine, level_string): """Generates the body of a correctness test for a specific routine""" result = "" result += "#include \"test/correctness/testblas.hpp\"" + NL - result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL + result += "#include \"test/routines/level" + level_string + "/x" + routine.lowercase_name() + ".hpp\"" + NL + NL result += "// Shortcuts to the clblast namespace" + NL result += "using float2 = clblast::float2;" + NL result += "using double2 = clblast::double2;" + NL + NL @@ -328,8 +332,8 @@ def correctness_test(routine, level_string): result += " auto errors = size_t{0};" + NL not_first = "false" for flavour in routine.flavours: - result += " errors += clblast::RunTests<clblast::TestX" + routine.name + flavour.test_template() - result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.name.upper() + "\");" + NL + result += " errors += clblast::RunTests<clblast::TestX" + routine.plain_name() + flavour.test_template() + result += ">(argc, argv, " + not_first + ", \"" + flavour.name + routine.upper_name() + "\");" + NL not_first = "true" result += " if (errors > 0) { return 1; } else { return 0; }" + NL result += "}" + NL diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 98874174..cfdbf748 100644 --- a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -30,17 +30,17 @@ class DataType: self.beta_cl = scalars[3] self.buffer_type = buffer_type - def use_alpha(self): + def use_alpha(self, postfix=""): """Outputs the name of the data-type (alpha/beta), possibly transforming into the right type""" if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: - return self.alpha_cpp + "{alpha.s[0], alpha.s[1]}" - return "alpha" + return self.alpha_cpp + "{alpha" + postfix + ".s[0], alpha" + postfix + ".s[1]}" + return "alpha" + postfix - def use_beta(self): + def use_beta(self, postfix=""): """As above, but for beta instead of alpha""" if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: - return self.beta_cpp + "{beta.s[0], beta.s[1]}" - return "beta" + return self.beta_cpp + "{beta" + postfix + ".s[0], beta" + postfix + ".s[1]}" + return "beta" + postfix def use_alpha_opencl(self): """As above, but the transformation is in the opposite direction""" diff --git a/scripts/generator/generator/doc.py b/scripts/generator/generator/doc.py index c77ec1a0..9c73ffbc 100644 --- a/scripts/generator/generator/doc.py +++ b/scripts/generator/generator/doc.py @@ -20,7 +20,7 @@ def generate(routine): result = "" # Routine header - result += "x" + routine.name.upper() + ": " + routine.description + NL + result += "x" + routine.upper_name() + ": " + routine.description + NL result += "-------------" + NL + NL result += routine.details + NL + NL @@ -36,7 +36,7 @@ def generate(routine): result += "```" + NL + NL # Routine arguments - result += "Arguments to " + routine.name.upper() + ":" + NL + NL + result += "Arguments to " + routine.upper_name() + ":" + NL + NL for argument in routine.arguments_doc(): result += "* " + argument + NL result += "* `cl_command_queue* queue`: " @@ -47,7 +47,7 @@ def generate(routine): # Routine requirements if len(routine.requirements_doc()) > 0: - result += "Requirements for " + routine.name.upper() + ":" + NL + NL + result += "Requirements for " + routine.upper_name() + ":" + NL + NL for requirement in routine.requirements_doc(): result += "* " + requirement + NL result += NL diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 6fcce23b..59b2ed73 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -12,11 +12,12 @@ import generator.convert as convert class Routine: """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" - def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, + def __init__(self, implemented, has_tests, batched, level, name, template, flavours, sizes, options, inputs, outputs, buffer_sizes, scalars, scratch, description, details, requirements): self.implemented = implemented self.has_tests = has_tests + self.batched = batched self.level = level self.name = name self.template = template @@ -32,6 +33,69 @@ class Routine: self.details = details self.requirements = requirements + def lowercase_name(self): + postfix = "batched" if self.batched else "" + return self.name + postfix + + def plain_name(self): + postfix = "Batched" if self.batched else "" + return self.name + postfix + + def capitalized_name(self): + postfix = "Batched" if self.batched else "" + return self.name.capitalize() + postfix + + def upper_name(self): + postfix = "BATCHED" if self.batched else "" + return self.name.upper() + postfix + + def b_star(self): + return "*" if self.batched else "" + + def b_s(self): + return "s" if self.batched else "" + + def batch_count_def(self): + return ["const size_t batch_count"] if self.batched else [] + + def batch_count_list(self): + return ["batch_count"] if self.batched else [] + + def batch_count_type(self): + return ["const size_t"] if self.batched else [] + + def batch_count_doc(self): + return ["`const size_t batch_count`: Number of batches. This value must be positive."] if self.batched else [] + + def batched_transform_to_cpp(self): + result = [] + for scalar in self.scalars: + result.append("auto " + scalar + "s_cpp = std::vector<T>();") + for buffer_name in self.inputs + self.outputs: + result.append("auto " + buffer_name + "_offsets_cpp = std::vector<size_t>();") + result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {") + for scalar in self.scalars: + result.append(" " + scalar + "s_cpp.push_back(" + scalar + "s[batch]);") + for buffer_name in self.inputs + self.outputs: + result.append(" " + buffer_name + "_offsets_cpp.push_back(" + buffer_name + "_offsets[batch]);") + result.append("}") + return result + + def batched_transform_to_complex(self, flavour): + result = [] + for scalar in self.scalars: + result.append("auto " + scalar + "s_cpp = std::vector<" + flavour.buffer_type + ">();") + result.append("for (auto batch = size_t{0}; batch < batch_count; ++batch) {") + for scalar in self.scalars: + content = scalar + if scalar == "alpha": + content = flavour.use_alpha(postfix="s[batch]") + elif scalar == "beta": + content = flavour.use_beta(postfix="s[batch]") + result.append(" " + scalar + "s_cpp.push_back(" + content + ");") + result.append("}") + return result + @staticmethod def scalar_buffers_first(): """List of scalar buffers""" @@ -127,7 +191,7 @@ class Routine: def length(self): """Retrieves the number of characters in the routine's name""" - return len(self.name) + return len(self.capitalized_name()) def no_scalars(self): """Determines whether or not this routine has scalar arguments (alpha/beta)""" @@ -135,13 +199,13 @@ class Routine: def short_names(self): """Returns the upper-case names of these routines (all flavours)""" - return "/".join([f.name + self.name.upper() for f in self.flavours]) + return "/".join([f.name + self.upper_name() for f in self.flavours]) def short_names_tested(self): """As above, but excludes some""" - names = [f.name + self.name.upper() for f in self.flavours] - if "H" + self.name.upper() in names: - names.remove("H" + self.name.upper()) + names = [f.name + self.upper_name() for f in self.flavours] + if "H" + self.upper_name() in names: + names.remove("H" + self.upper_name()) return "/".join(names) def buffers_first(self): @@ -159,7 +223,7 @@ class Routine: """Retrieves a variable name for a specific input/output vector/matrix (e.g. 'x')""" if name in self.inputs or name in self.outputs: a = [name + "_buffer"] - b = [name + "_offset"] + b = [name + "_offset" + self.b_s()] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -187,7 +251,7 @@ class Routine: prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: a = [prefix + "cl_mem " + name + "_buffer"] - b = ["const size_t " + name + "_offset"] + b = ["const size_t " + self.b_star() + name + "_offset" + self.b_s()] c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + b + c)] return [] @@ -228,7 +292,7 @@ class Routine: if name in self.inputs or name in self.outputs: buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] - b = [name + "_offset"] + b = [name + "_offsets_cpp"] if self.batched else [name + "_offset"] c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -270,7 +334,7 @@ class Routine: prefix = "const " if (name in self.inputs) else "" if (name in self.inputs) or (name in self.outputs): a = [prefix + "cl_mem"] - b = ["const size_t"] + b = ["const size_t" + self.b_star()] c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else [] return [", ".join(a + b + c)] return [] @@ -283,18 +347,19 @@ class Routine: math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] - b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."] + b = ["`const size_t " + self.b_star() + name + "_offset" + self.b_s() + "`: The offset" + self.b_s() + " in elements from the start of the " + inout + " " + math_name + "."] + c = [] if name not in self.buffers_without_ld_inc(): c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " + inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."] - else: - c = [] return a + b + c return [] def scalar(self, name): """Retrieves the name of a scalar (alpha/beta)""" if name in self.scalars: + if self.batched: + return [name + "s_cpp"] return [name] return [] @@ -314,8 +379,12 @@ class Routine: """Retrieves the use of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": + if self.batched: + return ["alphas_cpp.data()"] return [flavour.use_alpha()] elif name == "beta": + if self.batched: + return ["betas_cpp.data()"] return [flavour.use_beta()] return [name] return [] @@ -342,16 +411,16 @@ class Routine: """Retrieves the definition of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": - return ["const " + flavour.alpha_cl + " " + name] - return ["const " + flavour.beta_cl + " " + name] + return ["const " + flavour.alpha_cl + " " + self.b_star() + name + self.b_s()] + return ["const " + flavour.beta_cl + " " + self.b_star() + name + self.b_s()] return [] def scalar_def_plain(self, name, flavour): """As above, but without 'cl_' prefix""" if name in self.scalars: if name == "alpha": - return ["const " + flavour.alpha_cpp + " " + name] - return ["const " + flavour.beta_cpp + " " + name] + return ["const " + flavour.alpha_cpp + " " + self.b_star() + name + self.b_s()] + return ["const " + flavour.beta_cpp + " " + self.b_star() + name + self.b_s()] return [] def scalar_def_void(self, name, flavour): @@ -368,16 +437,16 @@ class Routine: """Retrieves the type of a scalar (alpha/beta)""" if name in self.scalars: if name == "alpha": - return ["const " + flavour.alpha_cpp] - return ["const " + flavour.beta_cpp] + return ["const " + flavour.alpha_cpp + self.b_star()] + return ["const " + flavour.beta_cpp + self.b_star()] return [] def scalar_doc(self, name): """Retrieves the documentation of a scalar""" if name in self.scalars: if name == "alpha": - return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."] - return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] + return ["`const " + self.template.alpha_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."] + return ["`const " + self.template.beta_cpp + " " + self.b_star() + name + self.b_s() + "`: Input scalar constant" + self.b_s() + "."] return [] def scalar_create_cpp(self, flavour): @@ -507,7 +576,8 @@ class Routine: self.scalar("beta") + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + list(chain(*[self.scalar(s) for s in self.other_scalars()])) + + self.batch_count_list()) def arguments_cast(self, flavour, indent): """As above, but with CLBlast casts""" @@ -518,7 +588,8 @@ class Routine: self.scalar_use("beta", flavour) + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()])) + + self.batch_count_list()) def arguments_netlib(self, flavour, indent): """As above, but for the Netlib CBLAS API""" @@ -561,7 +632,8 @@ class Routine: self.scalar_def("beta", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) + + self.batch_count_def()) def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" @@ -574,6 +646,7 @@ class Routine: list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) if self.name in self.routines_scalar_no_return(): result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + result += self.batch_count_def() return result def arguments_def_c(self, flavour): @@ -585,7 +658,8 @@ class Routine: self.scalar_def("beta", flavour) + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])) + + self.batch_count_def()) def arguments_def_wrapper_clblas(self, flavour): """As above, but clBLAS wrapper plain data-types""" @@ -618,7 +692,8 @@ class Routine: self.scalar_type("beta", flavour) + list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()]))) + list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()])) + + self.batch_count_type()) def arguments_doc(self): """Retrieves a combination of all the argument types""" @@ -630,7 +705,8 @@ class Routine: self.scalar_doc("beta") + list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) + - list(chain(*[self.scalar_doc(s) for s in self.other_scalars()]))) + list(chain(*[self.scalar_doc(s) for s in self.other_scalars()])) + + self.batch_count_doc()) def requirements_doc(self): """Retrieves a list of routine requirements for documentation""" @@ -640,7 +716,7 @@ class Routine: """Retrieves the C++ templated definition for a routine""" indent = " " * (spaces + self.length()) result = "template <" + self.template.name + ">\n" - result += "StatusCode " + self.name.capitalize() + "(" + result += "StatusCode " + self.capitalized_name() + "(" result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" return result @@ -649,7 +725,7 @@ class Routine: """As above, but now without variable names""" indent = " " * (spaces + self.length()) result = "template <" + self.template.name + ">\n" - result += "StatusCode " + self.name.capitalize() + "(" + result += "StatusCode " + self.capitalized_name() + "(" result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) result += ",\n" + indent + "cl_command_queue*, cl_event*)" return result @@ -657,7 +733,7 @@ class Routine: def routine_header_c(self, flavour, spaces, extra_qualifier): """As above, but now for C""" indent = " " * (spaces + self.length()) - result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "(" + result = "CLBlastStatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.plain_name() + "(" result += (",\n" + indent).join([a for a in self.arguments_def_c(flavour)]) result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" return result @@ -677,6 +753,8 @@ class Routine: if self.name in self.routines_scalar_no_return(): routine_name += "_sub" indent += " " + if self.batched: + routine_name += "batched" result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result diff --git a/src/clblast.cpp b/src/clblast.cpp index a63d766c..d3db8edf 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -71,6 +71,7 @@ // Level-x includes (non-BLAS) #include "routines/levelx/xomatcopy.hpp" +#include "routines/levelx/xaxpybatched.hpp" namespace clblast { @@ -2172,6 +2173,64 @@ template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); + +// Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED +template <typename T> +StatusCode AxpyBatched(const size_t n, + const T *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + try { + auto queue_cpp = Queue(*queue); + auto routine = XaxpyBatched<T>(queue_cpp, event); + auto alphas_cpp = std::vector<T>(); + auto x_offsets_cpp = std::vector<size_t>(); + auto y_offsets_cpp = std::vector<size_t>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + x_offsets_cpp.push_back(x_offsets[batch]); + y_offsets_cpp.push_back(y_offsets[batch]); + } + routine.DoAxpyBatched(n, + alphas_cpp, + Buffer<T>(x_buffer), x_offsets_cpp, x_inc, + Buffer<T>(y_buffer), y_offsets_cpp, y_inc, + batch_count); + return StatusCode::kSuccess; + } catch (...) { return DispatchException(); } +} +template StatusCode PUBLIC_API AxpyBatched<float>(const size_t, + const float*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<double>(const size_t, + const double*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<float2>(const size_t, + const float2*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<double2>(const size_t, + const double2*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API AxpyBatched<half>(const size_t, + const half*, + const cl_mem, const size_t*, const size_t, + cl_mem, const size_t*, const size_t, + const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // Clears the cache of stored binaries diff --git a/src/clblast_c.cpp b/src/clblast_c.cpp index 6018bcfa..b09f8c54 100644 --- a/src/clblast_c.cpp +++ b/src/clblast_c.cpp @@ -3447,6 +3447,113 @@ CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTran } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } } +// AXPY +CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, + const float *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<float>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, + const double *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<double>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, + const cl_float2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<float2>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(float2{alphas[batch].s[0], alphas[batch].s[1]}); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, + const cl_double2 *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<double2>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(double2{alphas[batch].s[0], alphas[batch].s[1]}); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} +CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, + const cl_half *alphas, + const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, + cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, + const size_t batch_count, + cl_command_queue* queue, cl_event* event) { + auto alphas_cpp = std::vector<half>(); + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + alphas_cpp.push_back(alphas[batch]); + } + try { + return static_cast<CLBlastStatusCode>( + clblast::AxpyBatched(n, + alphas_cpp.data(), + x_buffer, x_offsets, x_inc, + y_buffer, y_offsets, y_inc, + batch_count, + queue, event) + ); + } catch (...) { return static_cast<CLBlastStatusCode>(clblast::DispatchExceptionForC()); } +} + // ================================================================================================= // Clears the cache of stored binaries diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 41af28da..29f81cf8 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -600,9 +600,6 @@ class Buffer { // Copies from host to device: writing the device buffer a-synchronously void WriteAsync(const Queue &queue, const size_t size, const T* host, const size_t offset = 0) { - if (access_ == BufferAccess::kReadOnly) { - throw LogicError("Buffer: writing to a read-only buffer"); - } if (GetSize() < (offset+size)*sizeof(T)) { throw LogicError("Buffer: target device buffer is too small"); } diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index ece8476e..f44bbce0 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -9,7 +9,7 @@ // // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't -// support vector data-types. +// support vector data-types. The general version has a batched implementation as well. // // This kernel uses the level-1 BLAS common tuning parameters. // @@ -36,8 +36,6 @@ void Xaxpy(const int n, const real_arg arg_alpha, } } -// ================================================================================================= - // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) @@ -57,6 +55,24 @@ void XaxpyFast(const int n, const real_arg arg_alpha, // ================================================================================================= +// Full version of the kernel with offsets and strided accesses: batched version +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyBatched(const int n, const __constant real_arg* arg_alphas, + const __global real* restrict xgm, const __constant int* x_offsets, const int x_inc, + __global real* ygm, const __constant int* y_offsets, const int y_inc) { + const int batch = get_group_id(1); + const real alpha = GetRealArg(arg_alphas[batch]); + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id<n; id += get_global_size(0)) { + real xvalue = xgm[id*x_inc + x_offsets[batch]]; + MultiplyAdd(ygm[id*y_inc + y_offsets[batch]], alpha, xvalue); + } +} + +// ================================================================================================= + // End of the C++11 raw string literal )" diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp new file mode 100644 index 00000000..6a4269be --- /dev/null +++ b/src/routines/levelx/xaxpybatched.cpp @@ -0,0 +1,95 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the XaxpyBatched class (see the header for information about the class). +// +// ================================================================================================= + +#include "routines/levelx/xaxpybatched.hpp" + +#include <string> +#include <vector> + +namespace clblast { +// ================================================================================================= + +// Constructor: forwards to base class constructor +template <typename T> +XaxpyBatched<T>::XaxpyBatched(Queue &queue, EventPointer event, const std::string &name): + Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>(), {}, { + #include "../../kernels/level1/level1.opencl" + #include "../../kernels/level1/xaxpy.opencl" + }) { +} + +// ================================================================================================= + +// The main routine +template <typename T> +void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas, + const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc, + const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc, + const size_t batch_count) { + + // Tests for a valid batch count + if ((batch_count < 1) || (alphas.size() != batch_count) || + (x_offsets.size() != batch_count) || (y_offsets.size() != batch_count)) { + throw BLASError(StatusCode::kInvalidBatchCount); + } + + // Makes sure all dimensions are larger than zero + if (n == 0) { throw BLASError(StatusCode::kInvalidDimension); } + + // Tests the vectors for validity + for (auto batch = size_t{0}; batch < batch_count; ++batch) { + TestVectorX(n, x_buffer, x_offsets[batch], x_inc); + TestVectorY(n, y_buffer, y_offsets[batch], y_inc); + } + + // Upload the arguments to the device + std::vector<int> x_offsets_int(x_offsets.begin(), x_offsets.end()); + std::vector<int> y_offsets_int(y_offsets.begin(), y_offsets.end()); + auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + x_offsets_device.Write(queue_, batch_count, x_offsets_int); + y_offsets_device.Write(queue_, batch_count, y_offsets_int); + alphas_device.Write(queue_, batch_count, alphas); + + // Retrieves the Xaxpy kernel from the compiled binary + auto kernel = Kernel(program_, "XaxpyBatched"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast<int>(n)); + kernel.SetArgument(1, alphas_device()); + kernel.SetArgument(2, x_buffer()); + kernel.SetArgument(3, x_offsets_device()); + kernel.SetArgument(4, static_cast<int>(x_inc)); + kernel.SetArgument(5, y_buffer()); + kernel.SetArgument(6, y_offsets_device()); + kernel.SetArgument(7, static_cast<int>(y_inc)); + + // Launches the kernel + auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]); + auto global = std::vector<size_t>{n_ceiled/db_["WPT"], batch_count}; + auto local = std::vector<size_t>{db_["WGS"], 1}; + RunKernel(kernel, queue_, device_, global, local, event_); +} + +// ================================================================================================= + +// Compiles the templated class +template class XaxpyBatched<half>; +template class XaxpyBatched<float>; +template class XaxpyBatched<double>; +template class XaxpyBatched<float2>; +template class XaxpyBatched<double2>; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/levelx/xaxpybatched.hpp b/src/routines/levelx/xaxpybatched.hpp new file mode 100644 index 00000000..513792ea --- /dev/null +++ b/src/routines/levelx/xaxpybatched.hpp @@ -0,0 +1,43 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements the XaxpyBatched routine. This is a non-blas batched version of AXPY. +// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_XAXPYBATCHED_H_ +#define CLBLAST_ROUTINES_XAXPYBATCHED_H_ + +#include <vector> + +#include "routine.hpp" + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class XaxpyBatched: public Routine { + public: + + // Constructor + XaxpyBatched(Queue &queue, EventPointer event, const std::string &name = "AXPYBATCHED"); + + // Templated-precision implementation of the routine + void DoAxpyBatched(const size_t n, const std::vector<T> &alphas, + const Buffer<T> &x_buffer, const std::vector<size_t> &x_offsets, const size_t x_inc, + const Buffer<T> &y_buffer, const std::vector<size_t> &y_offsets, const size_t y_inc, + const size_t batch_count); +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_ROUTINES_XAXPYBATCHED_H_ +#endif diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 1dd76894..7060fc9f 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -17,6 +17,7 @@ #include <vector> #include <string> +#include <random> #include <cltune.h> @@ -77,12 +78,14 @@ void Tuner(int argc, char* argv[]) { auto b_mat = std::vector<T>(C::GetSizeB(args)); auto c_mat = std::vector<T>(C::GetSizeC(args)); auto temp = std::vector<T>(C::GetSizeTemp(args)); - PopulateVector(x_vec, kSeed); - PopulateVector(y_vec, kSeed); - PopulateVector(a_mat, kSeed); - PopulateVector(b_mat, kSeed); - PopulateVector(c_mat, kSeed); - PopulateVector(temp, kSeed); + std::mt19937 mt(kSeed); + std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(x_vec, mt, dist); + PopulateVector(y_vec, mt, dist); + PopulateVector(a_mat, mt, dist); + PopulateVector(b_mat, mt, dist); + PopulateVector(c_mat, mt, dist); + PopulateVector(temp, mt, dist); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index d68cc1a6..3d091b64 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -67,8 +67,8 @@ template <> double2 Constant(const double val) { return {val, 0.0}; } template <typename T> T SmallConstant() { return static_cast<T>(1e-4); } template float SmallConstant<float>(); template double SmallConstant<double>(); -template <> half SmallConstant() { return FloatToHalf(1e-4); } -template <> float2 SmallConstant() { return {1e-4, 0.0f}; } +template <> half SmallConstant() { return FloatToHalf(1e-4f); } +template <> float2 SmallConstant() { return {1e-4f, 0.0f}; } template <> double2 SmallConstant() { return {1e-4, 0.0}; } // Returns the absolute value of a scalar (modulus in case of a complex number) @@ -326,42 +326,29 @@ unsigned int GetRandomSeed() { // Create a random number generator and populates a vector with samples from a random distribution template <typename T> -void PopulateVector(std::vector<T> &vector, const unsigned int seed) { - auto lower_limit = static_cast<T>(kTestDataLowerLimit); - auto upper_limit = static_cast<T>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<T> dist(lower_limit, upper_limit); - for (auto &element: vector) { element = dist(mt); } +void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { + for (auto &element: vector) { element = static_cast<T>(dist(mt)); } } -template void PopulateVector<float>(std::vector<float>&, const unsigned int); -template void PopulateVector<double>(std::vector<double>&, const unsigned int); +template void PopulateVector<float>(std::vector<float>&, std::mt19937&, std::uniform_real_distribution<double>&); +template void PopulateVector<double>(std::vector<double>&, std::mt19937&, std::uniform_real_distribution<double>&); // Specialized versions of the above for complex data-types template <> -void PopulateVector(std::vector<float2> &vector, const unsigned int seed) { - auto lower_limit = static_cast<float>(kTestDataLowerLimit); - auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<float> dist(lower_limit, upper_limit); - for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } +void PopulateVector(std::vector<float2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { + for (auto &element: vector) { + element.real(static_cast<float>(dist(mt))); + element.imag(static_cast<float>(dist(mt))); + } } template <> -void PopulateVector(std::vector<double2> &vector, const unsigned int seed) { - auto lower_limit = static_cast<double>(kTestDataLowerLimit); - auto upper_limit = static_cast<double>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<double> dist(lower_limit, upper_limit); +void PopulateVector(std::vector<double2> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); } } // Specialized versions of the above for half-precision template <> -void PopulateVector(std::vector<half> &vector, const unsigned int seed) { - const auto lower_limit = static_cast<float>(kTestDataLowerLimit); - const auto upper_limit = static_cast<float>(kTestDataUpperLimit); - std::mt19937 mt(seed); - std::uniform_real_distribution<float> dist(lower_limit, upper_limit); - for (auto &element: vector) { element = FloatToHalf(dist(mt)); } +void PopulateVector(std::vector<half> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist) { + for (auto &element: vector) { element = FloatToHalf(static_cast<float>(dist(mt))); } } // ================================================================================================= diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp index 3c9be6a2..b3db8c22 100644 --- a/src/utilities/utilities.hpp +++ b/src/utilities/utilities.hpp @@ -20,6 +20,7 @@ #include <string> #include <functional> #include <complex> +#include <random> #include "clpp11.hpp" #include "clblast.h" @@ -72,6 +73,7 @@ constexpr auto kArgAsumOffset = "offasum"; constexpr auto kArgImaxOffset = "offimax"; constexpr auto kArgAlpha = "alpha"; constexpr auto kArgBeta = "beta"; +constexpr auto kArgBatchCount = "batch_num"; // The tuner-specific arguments in string form constexpr auto kArgFraction = "fraction"; @@ -155,6 +157,16 @@ struct Arguments { size_t imax_offset = 0; T alpha = ConstantOne<T>(); T beta = ConstantOne<T>(); + // Batch-specific arguments + size_t batch_count = 1; + std::vector<size_t> x_offsets = {0}; + std::vector<size_t> y_offsets = {0}; + std::vector<size_t> a_offsets = {0}; + std::vector<size_t> b_offsets = {0}; + std::vector<size_t> c_offsets = {0}; + std::vector<T> alphas = {ConstantOne<T>()}; + std::vector<T> betas = {ConstantOne<T>()}; + // Sizes size_t x_size = 1; size_t y_size = 1; size_t a_size = 1; @@ -234,7 +246,7 @@ constexpr auto kTestDataUpperLimit = 2.0; // Populates a vector with random data template <typename T> -void PopulateVector(std::vector<T> &vector, const unsigned int seed); +void PopulateVector(std::vector<T> &vector, std::mt19937 &mt, std::uniform_real_distribution<double> &dist); // ================================================================================================= diff --git a/test/correctness/misc/override_parameters.cpp b/test/correctness/misc/override_parameters.cpp index a4cecf0d..e6eebef7 100644 --- a/test/correctness/misc/override_parameters.cpp +++ b/test/correctness/misc/override_parameters.cpp @@ -11,11 +11,14 @@ // // ================================================================================================= +#include <string> +#include <vector> +#include <unordered_map> +#include <random> + #include "utilities/utilities.hpp" #include "test/routines/level3/xgemm.hpp" -#include <unordered_map> - namespace clblast { // ================================================================================================= @@ -71,9 +74,11 @@ size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::st auto host_a = std::vector<T>(args.m * args.k); auto host_b = std::vector<T>(args.n * args.k); auto host_c = std::vector<T>(args.m * args.n); - PopulateVector(host_a, kSeed); - PopulateVector(host_b, kSeed); - PopulateVector(host_c, kSeed); + std::mt19937 mt(kSeed); + std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(host_a, mt, dist); + PopulateVector(host_b, mt, dist); + PopulateVector(host_c, mt, dist); // Copy the matrices to the device auto device_a = Buffer<T>(context, host_a.size()); diff --git a/test/correctness/routines/levelx/xaxpybatched.cpp b/test/correctness/routines/levelx/xaxpybatched.cpp new file mode 100644 index 00000000..a106440f --- /dev/null +++ b/test/correctness/routines/levelx/xaxpybatched.cpp @@ -0,0 +1,30 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// ================================================================================================= + +#include "test/correctness/testblas.hpp" +#include "test/routines/levelx/xaxpybatched.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + auto errors = size_t{0}; + errors += clblast::RunTests<clblast::TestXaxpyBatched<float>, float, float>(argc, argv, false, "SAXPYBATCHED"); + errors += clblast::RunTests<clblast::TestXaxpyBatched<double>, double, double>(argc, argv, true, "DAXPYBATCHED"); + errors += clblast::RunTests<clblast::TestXaxpyBatched<float2>, float2, float2>(argc, argv, true, "CAXPYBATCHED"); + errors += clblast::RunTests<clblast::TestXaxpyBatched<double2>, double2, double2>(argc, argv, true, "ZAXPYBATCHED"); + errors += clblast::RunTests<clblast::TestXaxpyBatched<half>, half, half>(argc, argv, true, "HAXPYBATCHED"); + if (errors > 0) { return 1; } else { return 0; } +} + +// ================================================================================================= diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp index d959ce18..56376d0b 100644 --- a/test/correctness/testblas.cpp +++ b/test/correctness/testblas.cpp @@ -13,7 +13,9 @@ #include <algorithm> #include <iostream> +#include <random> +#include "utilities/utilities.hpp" #include "test/correctness/testblas.hpp" namespace clblast { @@ -25,6 +27,7 @@ template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncr template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 }; template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 }; template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBandSizes = { 4, 19 }; +template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kBatchCounts = { 1, 3 }; // Test settings for the invalid tests template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kInvalidIncrements = { 0, 1 }; @@ -79,22 +82,25 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si const auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end()); const auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end()); const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end()); + const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end()); // Creates test input data - x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - y_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset); - a_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - b_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - c_source_.resize(std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); - ap_source_.resize(std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); - scalar_source_.resize(std::max(max_mat, max_matvec) + max_offset); - PopulateVector(x_source_, kSeed); - PopulateVector(y_source_, kSeed); - PopulateVector(a_source_, kSeed); - PopulateVector(b_source_, kSeed); - PopulateVector(c_source_, kSeed); - PopulateVector(ap_source_, kSeed); - PopulateVector(scalar_source_, kSeed); + x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset); + y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset); + a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset); + ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset); + scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset); + std::mt19937 mt(kSeed); + std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(x_source_, mt, dist); + PopulateVector(y_source_, mt, dist); + PopulateVector(a_source_, mt, dist); + PopulateVector(b_source_, mt, dist); + PopulateVector(c_source_, mt, dist); + PopulateVector(ap_source_, mt, dist); + PopulateVector(scalar_source_, mt, dist); } // =============================================================================================== @@ -190,15 +196,15 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st auto result2 = get_result_(args, buffers2, queue_); // Computes the L2 error - const auto kErrorMarginL2 = getL2ErrorMargin<T>(); auto l2error = 0.0; + const auto kErrorMarginL2 = getL2ErrorMargin<T>(); for (auto id1=size_t{0}; id1<get_id1_(args); ++id1) { for (auto id2=size_t{0}; id2<get_id2_(args); ++id2) { auto index = get_index_(args, id1, id2); l2error += SquaredDifference(result1[index], result2[index]); } } - l2error /= (get_id1_(args) * get_id2_(args)); + l2error /= static_cast<double>(get_id1_(args) * get_id2_(args)); // Checks for differences in the output auto errors = size_t{0}; @@ -219,8 +225,10 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st } } } + + // Report the results if (verbose_ && errors > 0) { - fprintf(stdout, "\n Combined L2 error: %.2e\n ", l2error); + fprintf(stdout, "\n Combined average L2 error: %.2e\n ", l2error); } // Tests the error count (should be zero) diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp index ee795aad..42e8aef7 100644 --- a/test/correctness/testblas.hpp +++ b/test/correctness/testblas.hpp @@ -56,6 +56,7 @@ class TestBlas: public Tester<T,U> { static const std::vector<size_t> kMatrixDims; static const std::vector<size_t> kMatrixVectorDims; static const std::vector<size_t> kBandSizes; + static const std::vector<size_t> kBatchCounts; const std::vector<size_t> kOffsets; const std::vector<U> kAlphaValues; const std::vector<U> kBetaValues; @@ -183,6 +184,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na auto imax_offsets = std::vector<size_t>{args.imax_offset}; auto alphas = std::vector<U>{args.alpha}; auto betas = std::vector<U>{args.beta}; + auto batch_counts = std::vector<size_t>{args.batch_count}; auto x_sizes = std::vector<size_t>{args.x_size}; auto y_sizes = std::vector<size_t>{args.y_size}; auto a_sizes = std::vector<size_t>{args.a_size}; @@ -226,6 +228,7 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na if (option == kArgImaxOffset) { imax_offsets = tester.kOffsets; } if (option == kArgAlpha) { alphas = tester.kAlphaValues; } if (option == kArgBeta) { betas = tester.kBetaValues; } + if (option == kArgBatchCount) { batch_counts = tester.kBatchCounts; } if (option == kArgXOffset) { x_sizes = tester.kVecSizes; } if (option == kArgYOffset) { y_sizes = tester.kVecSizes; } @@ -268,8 +271,10 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na for (auto &imax_offset: imax_offsets) { r_args.imax_offset = imax_offset; for (auto &alpha: alphas) { r_args.alpha = alpha; for (auto &beta: betas) { r_args.beta = beta; - C::SetSizes(r_args); - regular_test_vector.push_back(r_args); + for (auto &batch_count: batch_counts) { r_args.batch_count = batch_count; + C::SetSizes(r_args); + regular_test_vector.push_back(r_args); + } } } } diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index cbfc5bb2..40784fdb 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -367,6 +367,7 @@ std::string Tester<T,U>::GetOptionsString(const Arguments<U> &args) { if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; } if (o == kArgAlpha) { result += kArgAlpha + equals + ToString(args.alpha) + " "; } if (o == kArgBeta) { result += kArgBeta + equals + ToString(args.beta) + " "; } + if (o == kArgBatchCount){result += kArgBatchCount + equals + ToString(args.batch_count) + " "; } } return result; } diff --git a/test/performance/client.cpp b/test/performance/client.cpp index 2c45b35e..bd48b047 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -11,13 +11,15 @@ // // ================================================================================================= -#include "test/performance/client.hpp" - #include <string> #include <vector> #include <utility> #include <algorithm> #include <chrono> +#include <random> + +#include "utilities/utilities.hpp" +#include "test/performance/client.hpp" namespace clblast { // ================================================================================================= @@ -89,6 +91,9 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t le if (o == kArgAsumOffset) { args.asum_offset = GetArgument(command_line_args, help, kArgAsumOffset, size_t{0}); } if (o == kArgImaxOffset) { args.imax_offset = GetArgument(command_line_args, help, kArgImaxOffset, size_t{0}); } + // Batch arguments + if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, size_t{1}); } + // Scalar values if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<U>()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar<U>()); } @@ -179,13 +184,15 @@ void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) std::vector<T> c_source(args.c_size); std::vector<T> ap_source(args.ap_size); std::vector<T> scalar_source(args.scalar_size); - PopulateVector(x_source, kSeed); - PopulateVector(y_source, kSeed); - PopulateVector(a_source, kSeed); - PopulateVector(b_source, kSeed); - PopulateVector(c_source, kSeed); - PopulateVector(ap_source, kSeed); - PopulateVector(scalar_source, kSeed); + std::mt19937 mt(kSeed); + std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); + PopulateVector(x_source, mt, dist); + PopulateVector(y_source, mt, dist); + PopulateVector(a_source, mt, dist); + PopulateVector(b_source, mt, dist); + PopulateVector(c_source, mt, dist); + PopulateVector(ap_source, mt, dist); + PopulateVector(scalar_source, mt, dist); // Creates the matrices on the device auto x_vec = Buffer<T>(context, args.x_size); @@ -335,6 +342,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args, else if (o == kArgNrm2Offset){integers.push_back(args.nrm2_offset); } else if (o == kArgAsumOffset){integers.push_back(args.asum_offset); } else if (o == kArgImaxOffset){integers.push_back(args.imax_offset); } + else if (o == kArgBatchCount){integers.push_back(args.batch_count); } } auto strings = std::vector<std::string>{}; for (auto &o: options_) { diff --git a/test/performance/routines/levelx/xaxpybatched.cpp b/test/performance/routines/levelx/xaxpybatched.cpp new file mode 100644 index 00000000..6d3bcb51 --- /dev/null +++ b/test/performance/routines/levelx/xaxpybatched.cpp @@ -0,0 +1,37 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// ================================================================================================= + +#include "test/performance/client.hpp" +#include "test/routines/levelx/xaxpybatched.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); + switch(clblast::GetPrecision(command_line_args, clblast::Precision::kSingle)) { + case clblast::Precision::kHalf: + clblast::RunClient<clblast::TestXaxpyBatched<half>, half, half>(argc, argv); break; + case clblast::Precision::kSingle: + clblast::RunClient<clblast::TestXaxpyBatched<float>, float, float>(argc, argv); break; + case clblast::Precision::kDouble: + clblast::RunClient<clblast::TestXaxpyBatched<double>, double, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: + clblast::RunClient<clblast::TestXaxpyBatched<float2>, float2, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: + clblast::RunClient<clblast::TestXaxpyBatched<double2>, double2, double2>(argc, argv); break; + } + return 0; +} + +// ================================================================================================= diff --git a/test/routines/levelx/xaxpybatched.hpp b/test/routines/levelx/xaxpybatched.hpp new file mode 100644 index 00000000..ee15ff92 --- /dev/null +++ b/test/routines/levelx/xaxpybatched.hpp @@ -0,0 +1,168 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file implements a class with static methods to describe the XaxpyBatched routine. Examples of +// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These +// static methods are used by the correctness tester and the performance tester. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ +#define CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ + +#include <vector> +#include <string> + +#include "utilities/utilities.hpp" + +#ifdef CLBLAST_REF_CLBLAS + #include "test/wrapper_clblas.hpp" +#endif +#ifdef CLBLAST_REF_CBLAS + #include "test/wrapper_cblas.hpp" +#endif + +namespace clblast { +// ================================================================================================= + +// See comment at top of file for a description of the class +template <typename T> +class TestXaxpyBatched { + public: + + // Although it is a non-BLAS routine, it can still be tested against level-1 routines in a loop + static size_t BLASLevel() { return 1; } + + // The list of arguments relevant for this routine + static std::vector<std::string> GetOptions() { + return {kArgN, + kArgXInc, kArgYInc, + kArgBatchCount, kArgAlpha}; + } + + // Helper for the sizes per batch + static size_t PerBatchSizeX(const Arguments<T> &args) { return args.n * args.x_inc; } + static size_t PerBatchSizeY(const Arguments<T> &args) { return args.n * args.y_inc; } + + // Describes how to obtain the sizes of the buffers + static size_t GetSizeX(const Arguments<T> &args) { + return PerBatchSizeX(args) * args.batch_count + args.x_offset; + } + static size_t GetSizeY(const Arguments<T> &args) { + return PerBatchSizeY(args) * args.batch_count + args.y_offset; + } + + // Describes how to set the sizes of all the buffers + static void SetSizes(Arguments<T> &args) { + args.x_size = GetSizeX(args); + args.y_size = GetSizeY(args); + + // Also sets the batch-related variables + args.x_offsets = std::vector<size_t>(args.batch_count); + args.y_offsets = std::vector<size_t>(args.batch_count); + args.alphas = std::vector<T>(args.batch_count); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + args.x_offsets[batch] = batch * PerBatchSizeX(args) + args.x_offset; + args.y_offsets[batch] = batch * PerBatchSizeY(args) + args.y_offset; + args.alphas[batch] = args.alpha + Constant<T>(batch); + } + } + + // Describes what the default values of the leading dimensions of the matrices are + static size_t DefaultLDA(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDB(const Arguments<T> &) { return 1; } // N/A for this routine + static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine + + // Describes which transpose options are relevant for this routine + using Transposes = std::vector<Transpose>; + static Transposes GetATransposes(const Transposes &) { return {}; } // N/A for this routine + static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine + + // Describes how to prepare the input data + static void PrepareData(const Arguments<T>&, Queue&, const int, std::vector<T>&, + std::vector<T>&, std::vector<T>&, std::vector<T>&, std::vector<T>&, + std::vector<T>&, std::vector<T>&) {} // N/A for this routine + + // Describes how to run the CLBlast routine + static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + auto event = cl_event{}; + auto status = AxpyBatched(args.n, args.alphas.data(), + buffers.x_vec(), args.x_offsets.data(), args.x_inc, + buffers.y_vec(), args.y_offsets.data(), args.y_inc, + args.batch_count, + &queue_plain, &event); + if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } + return status; + } + + // Describes how to run the clBLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CLBLAS + static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + auto queue_plain = queue(); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + auto event = cl_event{}; + auto status = clblasXaxpy(args.n, args.alphas[batch], + buffers.x_vec, args.x_offsets[batch], args.x_inc, + buffers.y_vec, args.y_offsets[batch], args.y_inc, + 1, &queue_plain, 0, nullptr, &event); + clWaitForEvents(1, &event); + if (static_cast<StatusCode>(status) != StatusCode::kSuccess) { + return static_cast<StatusCode>(status); + } + } + return StatusCode::kSuccess; + } + #endif + + // Describes how to run the CPU BLAS routine (for correctness/performance comparison) + #ifdef CLBLAST_REF_CBLAS + static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> x_vec_cpu(args.x_size, static_cast<T>(0)); + std::vector<T> y_vec_cpu(args.y_size, static_cast<T>(0)); + buffers.x_vec.Read(queue, args.x_size, x_vec_cpu); + buffers.y_vec.Read(queue, args.y_size, y_vec_cpu); + for (auto batch = size_t{0}; batch < args.batch_count; ++batch) { + cblasXaxpy(args.n, args.alphas[batch], + x_vec_cpu, args.x_offsets[batch], args.x_inc, + y_vec_cpu, args.y_offsets[batch], args.y_inc); + } + buffers.y_vec.Write(queue, args.y_size, y_vec_cpu); + return StatusCode::kSuccess; + } + #endif + + // Describes how to download the results of the computation + static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) { + std::vector<T> result(args.y_size, static_cast<T>(0)); + buffers.y_vec.Read(queue, args.y_size, result); + return result; + } + + // Describes how to compute the indices of the result buffer + static size_t ResultID1(const Arguments<T> &args) { return args.n; } + static size_t ResultID2(const Arguments<T> &args) { return args.batch_count; } + static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) { + return (id1 * args.y_inc) + args.y_offsets[id2]; + } + + // Describes how to compute performance metrics + static size_t GetFlops(const Arguments<T> &args) { + return args.batch_count * (2 * args.n); + } + static size_t GetBytes(const Arguments<T> &args) { + return args.batch_count * (3 * args.n) * sizeof(T); + } +}; + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_ROUTINES_XAXPYBATCHED_H_ +#endif diff --git a/test/routines/levelx/xinvert.hpp b/test/routines/levelx/xinvert.hpp index 05bea9aa..b470dbf3 100644 --- a/test/routines/levelx/xinvert.hpp +++ b/test/routines/levelx/xinvert.hpp @@ -19,7 +19,7 @@ #include <vector> #include <string> -#include "routines/levelx/xinvert.hpp" +#include "utilities/utilities.hpp" namespace clblast { // ================================================================================================= |