summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2016-10-25 19:21:49 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2016-10-25 19:21:49 +0200
commit 59183b7d79b70d918562d5048e521633d425ca1c (patch)
tree 351537cb60845a779fb94a3f005d27234f569386 /src
parent f96fd372bc3087938572ebc55bd1d8e1b7e6f18a (diff)
Sets the proper sizes for the buffers for the Netlib CBLAS API
Diffstat (limited to 'src')
-rw-r--r-- src/clblast_blas.cpp 500
1 file changed, 250 insertions, 250 deletions
diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp
index 286b1ba8..b5451049 100644
--- a/src/clblast_blas.cpp
+++ b/src/clblast_blas.cpp
@@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1682,11 +1682,11 @@ void cblas_chemv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1718,11 +1718,11 @@ void cblas_zhemv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1756,11 +1756,11 @@ void cblas_chbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1792,11 +1792,11 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1830,11 +1830,11 @@ void cblas_chpmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
@@ -1866,11 +1866,11 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
@@ -1904,11 +1904,11 @@ void cblas_ssymv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -1940,11 +1940,11 @@ void cblas_dsymv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -1978,11 +1978,11 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -2014,11 +2014,11 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -2052,11 +2052,11 @@ void cblas_sspmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
@@ -2088,11 +2088,11 @@ void cblas_dspmv(const Layout layout, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
@@ -2121,9 +2121,9 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2148,9 +2148,9 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2175,9 +2175,9 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2202,9 +2202,9 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2231,9 +2231,9 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2258,9 +2258,9 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2285,9 +2285,9 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2312,9 +2312,9 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2341,9 +2341,9 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2368,9 +2368,9 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2395,9 +2395,9 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2422,9 +2422,9 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2451,9 +2451,9 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2478,9 +2478,9 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2505,9 +2505,9 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2532,9 +2532,9 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2561,9 +2561,9 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2588,9 +2588,9 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2615,9 +2615,9 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2642,9 +2642,9 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2671,9 +2671,9 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x));
@@ -2698,9 +2698,9 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x));
@@ -2725,9 +2725,9 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x));
@@ -2752,9 +2752,9 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a
auto device = get_device();
auto context = Context(device);
auto queue = Queue(context, device);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap));
x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x));
@@ -2784,11 +2784,11 @@ void cblas_sger(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
@@ -2816,11 +2816,11 @@ void cblas_dger(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
@@ -2850,11 +2850,11 @@ void cblas_cgeru(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -2882,11 +2882,11 @@ void cblas_zgeru(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -2916,11 +2916,11 @@ void cblas_cgerc(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -2948,11 +2948,11 @@ void cblas_zgerc(const Layout layout,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = m * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -2981,9 +2981,9 @@ void cblas_cher(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a));
@@ -3009,9 +3009,9 @@ void cblas_zher(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a));
@@ -3039,9 +3039,9 @@ void cblas_chpr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap));
@@ -3067,9 +3067,9 @@ void cblas_zhpr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap));
@@ -3098,11 +3098,11 @@ void cblas_cher2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -3131,11 +3131,11 @@ void cblas_zher2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -3166,11 +3166,11 @@ void cblas_chpr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float2>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y));
@@ -3199,11 +3199,11 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double2>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double2>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double2>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y));
@@ -3233,9 +3233,9 @@ void cblas_ssyr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a));
@@ -3261,9 +3261,9 @@ void cblas_dsyr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a));
@@ -3291,9 +3291,9 @@ void cblas_sspr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<float*>(ap));
@@ -3319,9 +3319,9 @@ void cblas_dspr(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
ap_buffer.Write(queue, ap_size, reinterpret_cast<double*>(ap));
@@ -3350,11 +3350,11 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
@@ -3383,11 +3383,11 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
- const auto a_size = n;
+ const auto a_size = n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
@@ -3418,11 +3418,11 @@ void cblas_sspr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<float>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<float>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<float>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y));
@@ -3451,11 +3451,11 @@ void cblas_dspr2(const Layout layout, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto x_size = n;
+ const auto x_size = n * x_inc;
auto x_buffer = Buffer<double>(context, x_size);
- const auto y_size = n;
+ const auto y_size = n * y_inc;
auto y_buffer = Buffer<double>(context, y_size);
- const auto ap_size = n;
+ const auto ap_size = ((n*(n+1)) / 2);
auto ap_buffer = Buffer<double>(context, ap_size);
x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x));
y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y));
@@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
@@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
@@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
@@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
@@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle,
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c));
@@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c));
@@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
@@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
@@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c));
@@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c));
@@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b));
@@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b));
@@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]};
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<float2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b));
@@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
const auto beta_cpp = beta;
- const auto a_size = n;
+ const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
- const auto c_size = n;
+ const auto c_size = n * c_ld;
auto c_buffer = Buffer<double2>(context, c_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b));
@@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
@@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
@@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
@@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
@@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
@@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
@@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
@@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));
@@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<float>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b));
@@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = alpha;
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<double>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b));
@@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<float2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<float2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b));
@@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose,
auto context = Context(device);
auto queue = Queue(context, device);
const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]};
- const auto a_size = n;
+ const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld;
auto a_buffer = Buffer<double2>(context, a_size);
- const auto b_size = n;
+ const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld;
auto b_buffer = Buffer<double2>(context, b_size);
a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a));
b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b));