summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xscripts/generator/generator.py127
-rw-r--r--src/clblast_blas.cpp500
2 files changed, 331 insertions, 296 deletions
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 4ba97ff8..99edf355 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -59,6 +59,41 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas
cld_m = "The value of `c_ld` must be at least `m`."
cld_n = "The value of `c_ld` must be at least `n`."
+
+# Helper functions to compute vector and matrix sizes
+def size_helper(condition, size_one, size_two, multiplier):
+ length = "(" + condition + ")" + " ? " + size_one + " * " + multiplier + " : " + size_two + " * " + multiplier
+ return length
+
+
+def layout_transpose_condition(prefix):
+ return "(layout == Layout::kColMajor && " + prefix + "_transpose != Transpose::kNo) || " +\
+ "(layout == Layout::kRowMajor && " + prefix + "_transpose == Transpose::kNo)"
+
+
+# Different possibilities for the vector and matrix sizes
+xn = "n * x_inc"
+xm = "m * x_inc"
+yn = "n * y_inc"
+ym = "m * y_inc"
+an = "n * a_ld"
+apn = "((n*(n+1)) / 2)"
+cn = "n * c_ld"
+xmn = size_helper("a_transpose != Transpose::kNo", "m", "n", "x_inc")
+ynm = size_helper("a_transpose != Transpose::kNo", "n", "m", "y_inc")
+amn = size_helper("layout == Layout::kRowMajor", "m", "n", "a_ld")
+amns = size_helper("side == Side::kLeft", "m", "n", "a_ld")
+amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld")
+ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld")
+ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld")
+bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld")
+bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld")
+bmn = size_helper("layout == Layout::kRowMajor", "m", "n", "b_ld")
+bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld")
+cmn = size_helper("layout == Layout::kRowMajor", "m", "n", "c_ld")
+ammn = size_helper("layout == Layout::kRowMajor", "m", "((side == Side::kLeft) ? m : n)", "a_ld")
+bmnn = size_helper("layout == Layout::kRowMajor", "((side == Side::kLeft) ? m : n)", "n", "b_ld")
+
# ==================================================================================================
# Populates a list of routines
@@ -66,63 +101,63 @@ ROUTINES = [
[ # Level 1: vector-vector
Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []),
Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []),
- Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
+ Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []),
- Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
- Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
- Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
- Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
+ Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
+ Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
+ Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
+ Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
- Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
- Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
- Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
- Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
- Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
- Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+ Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidean norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
+ Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
+ Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
+ Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+ Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+ Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
- Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
- Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
- Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
- Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
- Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
- Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
- Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
- Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
- Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []),
- Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []),
- Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
- Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []),
+ Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
+ Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
+ Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
+ Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
+ Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
+ Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
+ Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
+ Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
+ Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.", []),
+ Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []),
+ Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
+ Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
- Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
- Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
- Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
- Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]),
- Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
- Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
- Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
- Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
- Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
+ Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
+ Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
+ Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermitian matrix to be updated, and alpha is a scalar value.", [ald_n]),
+ Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermitian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
+ Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]),
+ Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+ Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
+ Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
],
[ # Level 3: matrix-matrix
- Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
- Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
- Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
- Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
- Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
- Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
- Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
- Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
- Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []),
+ Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
+ Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
+ Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
+ Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]),
+ Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]),
+ Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+ Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+ Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
+ Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []),
],
[ # Level X: extra routines (not part of BLAS)
- Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
+ Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-of-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
]]
diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp
index 286b1ba8..b5451049 100644
--- a/src/clblast_blas.cpp
+++ b/src/clblast_blas.cpp
@@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1682,11 +1682,11 @@ void cblas_chemv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1718,11 +1718,11 @@ void cblas_zhemv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1756,11 +1756,11 @@ void cblas_chbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1792,11 +1792,11 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1830,11 +1830,11 @@ void cblas_chpmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1866,11 +1866,11 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1904,11 +1904,11 @@ void cblas_ssymv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -1940,11 +1940,11 @@ void cblas_dsymv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -1978,11 +1978,11 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -2014,11 +2014,11 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -2052,11 +2052,11 @@ void cblas_sspmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -2088,11 +2088,11 @@ void cblas_dspmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -2121,9 +2121,9 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2148,9 +2148,9 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2175,9 +2175,9 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2202,9 +2202,9 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2231,9 +2231,9 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2258,9 +2258,9 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2285,9 +2285,9 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2312,9 +2312,9 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2341,9 +2341,9 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2368,9 +2368,9 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2395,9 +2395,9 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2422,9 +2422,9 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2451,9 +2451,9 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2478,9 +2478,9 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2505,9 +2505,9 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2532,9 +2532,9 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2561,9 +2561,9 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2588,9 +2588,9 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2615,9 +2615,9 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2642,9 +2642,9 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2671,9 +2671,9 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2698,9 +2698,9 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2725,9 +2725,9 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2752,9 +2752,9 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2784,11 +2784,11 @@ void cblas_sger(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
@@ -2816,11 +2816,11 @@ void cblas_dger(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
@@ -2850,11 +2850,11 @@ void cblas_cgeru(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -2882,11 +2882,11 @@ void cblas_zgeru(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -2916,11 +2916,11 @@ void cblas_cgerc(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -2948,11 +2948,11 @@ void cblas_zgerc(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -2981,9 +2981,9 @@ void cblas_cher(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
@@ -3009,9 +3009,9 @@ void cblas_zher(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
@@ -3039,9 +3039,9 @@ void cblas_chpr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
@@ -3067,9 +3067,9 @@ void cblas_zhpr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
@@ -3098,11 +3098,11 @@ void cblas_cher2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -3131,11 +3131,11 @@ void cblas_zher2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -3166,11 +3166,11 @@ void cblas_chpr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -3199,11 +3199,11 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -3233,9 +3233,9 @@ void cblas_ssyr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
@@ -3261,9 +3261,9 @@ void cblas_dsyr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
@@ -3291,9 +3291,9 @@ void cblas_sspr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<float*>(ap));
@@ -3319,9 +3319,9 @@ void cblas_dspr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<double*>(ap));
@@ -3350,11 +3350,11 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
@@ -3383,11 +3383,11 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
@@ -3418,11 +3418,11 @@ void cblas_sspr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
@@ -3451,11 +3451,11 @@ void cblas_dspr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
@@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
@@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
@@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
@@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
@@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
@@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
@@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
@@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
@@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
@@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
@@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
@@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
@@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
@@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
@@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
@@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
@@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
@@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
@@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
@@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
@@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
@@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
@@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
@@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));