diff options
-rw-r--r-- | CHANGELOG | 4 | ||||
-rw-r--r-- | doc/clblast.md | 41 | ||||
-rw-r--r-- | include/clblast.h | 20 | ||||
-rw-r--r-- | include/clblast_c.h | 61 | ||||
-rw-r--r-- | scripts/generator/generator.py | 35 | ||||
-rw-r--r-- | src/clblast.cc | 61 | ||||
-rw-r--r-- | src/clblast_c.cc | 102 | ||||
-rw-r--r-- | test/wrapper_cblas.h | 2 | ||||
-rw-r--r-- | test/wrapper_clblas.h | 2 |
9 files changed, 278 insertions, 50 deletions
@@ -1,6 +1,8 @@ Development version (next release) -- Added support for half-precision floating-point (fp16) routines +- Added support for half-precision floating-point (fp16) in the library +- Added half-precision routines: + * Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN Version 0.7.1 - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs diff --git a/doc/clblast.md b/doc/clblast.md index 4b36789c..77654d2b 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -34,6 +34,10 @@ StatusCode CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SWAP: @@ -82,6 +86,10 @@ StatusCode CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHscal(const size_t n, + const cl_half alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SCAL: @@ -128,6 +136,10 @@ StatusCode CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to COPY: @@ -230,6 +242,11 @@ StatusCode CLBlastDdot(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to DOT: @@ -376,6 +393,10 @@ StatusCode CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to NRM2: @@ -425,6 +446,10 @@ StatusCode CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to ASUM: @@ -474,6 +499,10 @@ StatusCode CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to SUM: @@ -523,6 +552,10 @@ StatusCode CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiHamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to AMAX: @@ -572,6 +605,10 @@ StatusCode CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiHmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to MAX: @@ -621,6 +658,10 @@ StatusCode CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastiHmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) ``` Arguments to MIN: diff --git a/include/clblast.h b/include/clblast.h index 74ed6ab2..28c984c0 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -121,21 +121,21 @@ StatusCode Rotm(const size_t n, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template <typename T> StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template <typename T> StatusCode Scal(const size_t n, const T alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template <typename T> StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -150,7 +150,7 @@ StatusCode Axpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Dot product of two vectors: SDOT/DDOT +// Dot product of two vectors: SDOT/DDOT/HDOT template <typename T> StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -174,42 +174,42 @@ StatusCode Dotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template <typename T> StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template <typename T> StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template <typename T> StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template <typename T> StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template <typename T> StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template <typename T> StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, diff --git a/include/clblast_c.h b/include/clblast_c.h index e36eb68a..4dd76eb6 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -148,7 +148,7 @@ StatusCode PUBLIC_API CLBlastDrotm(const size_t n, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP StatusCode PUBLIC_API CLBlastSswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, @@ -165,8 +165,12 @@ StatusCode PUBLIC_API CLBlastZswap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL StatusCode PUBLIC_API CLBlastSscal(const size_t n, const float alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -183,8 +187,12 @@ StatusCode PUBLIC_API CLBlastZscal(const size_t n, const cl_double2 alpha, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHscal(const size_t n, + const cl_half alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY StatusCode PUBLIC_API CLBlastScopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, @@ -201,6 +209,10 @@ StatusCode PUBLIC_API CLBlastZcopy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY StatusCode PUBLIC_API CLBlastSaxpy(const size_t n, @@ -229,7 +241,7 @@ StatusCode PUBLIC_API CLBlastHaxpy(const size_t n, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Dot product of two vectors: SDOT/DDOT +// Dot product of two vectors: SDOT/DDOT/HDOT StatusCode PUBLIC_API CLBlastSdot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -240,6 +252,11 @@ StatusCode PUBLIC_API CLBlastDdot(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event); // Dot product of two complex vectors: CDOTU/ZDOTU StatusCode PUBLIC_API CLBlastCdotu(const size_t n, @@ -265,7 +282,7 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event); -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 StatusCode PUBLIC_API CLBlastSnrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -282,8 +299,12 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM StatusCode PUBLIC_API CLBlastSasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -300,8 +321,12 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM StatusCode PUBLIC_API CLBlastSsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -318,8 +343,12 @@ StatusCode PUBLIC_API CLBlastDzsum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX StatusCode PUBLIC_API CLBlastiSamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -336,8 +365,12 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiHamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX StatusCode PUBLIC_API CLBlastiSmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -354,8 +387,12 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiHmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN StatusCode PUBLIC_API CLBlastiSmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -372,6 +409,10 @@ StatusCode PUBLIC_API CLBlastiZmin(const size_t n, cl_mem imin_buffer, const size_t imin_offset, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastiHmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index bc8fa783..927c64f0 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -42,6 +42,7 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6 # Special cases Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output +iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output @@ -61,23 +62,23 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # Populates a list of routines routines = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector Routine(True, True, "2a", "gemv", T, [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []), diff --git a/src/clblast.cc b/src/clblast.cc index c18dc0a9..098ff7f3 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -160,7 +160,7 @@ template StatusCode PUBLIC_API Rotm<double>(const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP template <typename T> StatusCode Swap(const size_t n, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -190,8 +190,12 @@ template StatusCode PUBLIC_API Swap<double2>(const size_t, cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Swap<half>(const size_t, + cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL template <typename T> StatusCode Scal(const size_t n, const T alpha, @@ -221,8 +225,12 @@ template StatusCode PUBLIC_API Scal<double2>(const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Scal<half>(const size_t, + const half, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY template <typename T> StatusCode Copy(const size_t n, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, @@ -252,6 +260,10 @@ template StatusCode PUBLIC_API Copy<double2>(const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Copy<half>(const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY template <typename T> @@ -295,7 +307,7 @@ template StatusCode PUBLIC_API Axpy<half>(const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Dot product of two vectors: SDOT/DDOT +// Dot product of two vectors: SDOT/DDOT/HDOT template <typename T> StatusCode Dot(const size_t n, cl_mem dot_buffer, const size_t dot_offset, @@ -321,6 +333,11 @@ template StatusCode PUBLIC_API Dot<double>(const size_t, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Dot<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // Dot product of two complex vectors: CDOTU/ZDOTU template <typename T> @@ -376,7 +393,7 @@ template StatusCode PUBLIC_API Dotc<double2>(const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2 +// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 template <typename T> StatusCode Nrm2(const size_t n, cl_mem nrm2_buffer, const size_t nrm2_offset, @@ -406,8 +423,12 @@ template StatusCode PUBLIC_API Nrm2<double2>(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Nrm2<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM template <typename T> StatusCode Asum(const size_t n, cl_mem asum_buffer, const size_t asum_offset, @@ -437,8 +458,12 @@ template StatusCode PUBLIC_API Asum<double2>(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Asum<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM template <typename T> StatusCode Sum(const size_t n, cl_mem sum_buffer, const size_t sum_offset, @@ -468,8 +493,12 @@ template StatusCode PUBLIC_API Sum<double2>(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Sum<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template <typename T> StatusCode Amax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, @@ -499,8 +528,12 @@ template StatusCode PUBLIC_API Amax<double2>(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Amax<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX template <typename T> StatusCode Max(const size_t n, cl_mem imax_buffer, const size_t imax_offset, @@ -530,8 +563,12 @@ template StatusCode PUBLIC_API Max<double2>(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Max<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN template <typename T> StatusCode Min(const size_t n, cl_mem imin_buffer, const size_t imin_offset, @@ -561,6 +598,10 @@ template StatusCode PUBLIC_API Min<double2>(const size_t, cl_mem, const size_t, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Min<half>(const size_t, + cl_mem, const size_t, + const cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 7642a1e4..fd37462f 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -178,6 +178,16 @@ StatusCode CLBlastZswap(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHswap(const size_t n, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Swap<half>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} // SCAL StatusCode CLBlastSscal(const size_t n, @@ -220,6 +230,16 @@ StatusCode CLBlastZscal(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHscal(const size_t n, + const cl_half alpha, + cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Scal(n, + alpha, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} // COPY StatusCode CLBlastScopy(const size_t n, @@ -262,6 +282,16 @@ StatusCode CLBlastZcopy(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHcopy(const size_t n, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Copy<half>(n, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} // AXPY StatusCode CLBlastSaxpy(const size_t n, @@ -350,6 +380,18 @@ StatusCode CLBlastDdot(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHdot(const size_t n, + cl_mem dot_buffer, const size_t dot_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Dot<half>(n, + dot_buffer, dot_offset, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + queue, event); + return static_cast<StatusCode>(status); +} // DOTU StatusCode CLBlastCdotu(const size_t n, @@ -444,6 +486,16 @@ StatusCode CLBlastDznrm2(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHnrm2(const size_t n, + cl_mem nrm2_buffer, const size_t nrm2_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Nrm2<half>(n, + nrm2_buffer, nrm2_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} // ASUM StatusCode CLBlastSasum(const size_t n, @@ -486,6 +538,16 @@ StatusCode CLBlastDzasum(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHasum(const size_t n, + cl_mem asum_buffer, const size_t asum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Asum<half>(n, + asum_buffer, asum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} // SUM StatusCode CLBlastSsum(const size_t n, @@ -528,6 +590,16 @@ StatusCode CLBlastDzsum(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHsum(const size_t n, + cl_mem sum_buffer, const size_t sum_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Sum<half>(n, + sum_buffer, sum_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} // AMAX StatusCode CLBlastiSamax(const size_t n, @@ -570,6 +642,16 @@ StatusCode CLBlastiZamax(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastiHamax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Amax<half>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} // MAX StatusCode CLBlastiSmax(const size_t n, @@ -612,6 +694,16 @@ StatusCode CLBlastiZmax(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastiHmax(const size_t n, + cl_mem imax_buffer, const size_t imax_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Max<half>(n, + imax_buffer, imax_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} // MIN StatusCode CLBlastiSmin(const size_t n, @@ -654,6 +746,16 @@ StatusCode CLBlastiZmin(const size_t n, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastiHmin(const size_t n, + cl_mem imin_buffer, const size_t imin_offset, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Min<half>(n, + imin_buffer, imin_offset, + x_buffer, x_offset, x_inc, + queue, event); + return static_cast<StatusCode>(status); +} // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index 529acfbf..3182fdfc 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -374,7 +374,7 @@ void cblasXasum(const size_t n, reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc))); } -// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void cblasXamax(const size_t n, std::vector<float>& imax_buffer, const size_t imax_offset, const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) { diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 23c55373..b9410cae 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -633,7 +633,7 @@ clblasStatus clblasXasum<double2>(const size_t n, num_queues, queues, num_wait_events, wait_events, events); } -// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX +// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template <typename T> clblasStatus clblasXamax(const size_t n, cl_mem imax_buffer, const size_t imax_offset, |