From 59183b7d79b70d918562d5048e521633d425ca1c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 19:21:49 +0200 Subject: Sets the proper sizes for the buffers for the Netlib CBLAS API --- scripts/generator/generator.py | 127 +++++++---- src/clblast_blas.cpp | 500 ++++++++++++++++++++--------------------- 2 files changed, 331 insertions(+), 296 deletions(-) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 4ba97ff8..99edf355 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -59,6 +59,41 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas cld_m = "The value of `c_ld` must be at least `m`." cld_n = "The value of `c_ld` must be at least `n`." + +# Helper functions to compute vector and matrix sizes +def size_helper(condition, size_one, size_two, multiplier): + length = "(" + condition + ")" + " ? " + size_one + " * " + multiplier + " : " + size_two + " * " + multiplier + return length + + +def layout_transpose_condition(prefix): + return "(layout == Layout::kColMajor && " + prefix + "_transpose != Transpose::kNo) || " +\ + "(layout == Layout::kRowMajor && " + prefix + "_transpose == Transpose::kNo)" + + +# Different possibilities for the vector and matrix sizes +xn = "n * x_inc" +xm = "m * x_inc" +yn = "n * y_inc" +ym = "m * y_inc" +an = "n * a_ld" +apn = "((n*(n+1)) / 2)" +cn = "n * c_ld" +xmn = size_helper("a_transpose != Transpose::kNo", "m", "n", "x_inc") +ynm = size_helper("a_transpose != Transpose::kNo", "n", "m", "y_inc") +amn = size_helper("layout == Layout::kRowMajor", "m", "n", "a_ld") +amns = size_helper("side == Side::kLeft", "m", "n", "a_ld") +amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld") +ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld") +ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld") +bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld") +bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld") +bmn = size_helper("layout == Layout::kRowMajor", "m", "n", "b_ld") +bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld") +cmn = size_helper("layout == Layout::kRowMajor", "m", "n", "c_ld") +ammn = size_helper("layout == Layout::kRowMajor", "m", "((side == Side::kLeft) ? m : n)", "a_ld") +bmnn = size_helper("layout == Layout::kRowMajor", "((side == Side::kLeft) ? m : n)", "n", "b_ld") + # ================================================================================================== # Populates a list of routines @@ -66,63 +101,63 @@ ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"],"", "Apply givens plane rotation", "", []), Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 286b1ba8..b5451049 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1682,11 +1682,11 @@ void cblas_chemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1718,11 +1718,11 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1756,11 +1756,11 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1792,11 +1792,11 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1830,11 +1830,11 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1866,11 +1866,11 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1904,11 +1904,11 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1940,11 +1940,11 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1978,11 +1978,11 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2014,11 +2014,11 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2052,11 +2052,11 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2088,11 +2088,11 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2121,9 +2121,9 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2148,9 +2148,9 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2175,9 +2175,9 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2202,9 +2202,9 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2231,9 +2231,9 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2258,9 +2258,9 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2285,9 +2285,9 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2312,9 +2312,9 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2341,9 +2341,9 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2368,9 +2368,9 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2395,9 +2395,9 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2422,9 +2422,9 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2451,9 +2451,9 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2478,9 +2478,9 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2505,9 +2505,9 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2532,9 +2532,9 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2561,9 +2561,9 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2588,9 +2588,9 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2615,9 +2615,9 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2642,9 +2642,9 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2671,9 +2671,9 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2698,9 +2698,9 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2725,9 +2725,9 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2752,9 +2752,9 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2784,11 +2784,11 @@ void cblas_sger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2816,11 +2816,11 @@ void cblas_dger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2850,11 +2850,11 @@ void cblas_cgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2882,11 +2882,11 @@ void cblas_zgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2916,11 +2916,11 @@ void cblas_cgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2948,11 +2948,11 @@ void cblas_zgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2981,9 +2981,9 @@ void cblas_cher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3009,9 +3009,9 @@ void cblas_zher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3039,9 +3039,9 @@ void cblas_chpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3067,9 +3067,9 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3098,11 +3098,11 @@ void cblas_cher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3131,11 +3131,11 @@ void cblas_zher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3166,11 +3166,11 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3199,11 +3199,11 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3233,9 +3233,9 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3261,9 +3261,9 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3291,9 +3291,9 @@ void cblas_sspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3319,9 +3319,9 @@ void cblas_dspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3350,11 +3350,11 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3383,11 +3383,11 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3418,11 +3418,11 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3451,11 +3451,11 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); -- cgit v1.2.3