diff options
Diffstat (limited to 'src/clblast_blas.cpp')
-rw-r--r-- | src/clblast_blas.cpp | 500 |
1 files changed, 250 insertions, 250 deletions
diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 286b1ba8..b5451049 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<float>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); @@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<double>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); @@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<float2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); @@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<double2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); @@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<float>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); @@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<double>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); @@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<float2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); @@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer<double2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); @@ -1682,11 +1682,11 @@ void cblas_chemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); @@ -1718,11 +1718,11 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); @@ -1756,11 +1756,11 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); @@ -1792,11 +1792,11 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double2>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); @@ -1830,11 +1830,11 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float2>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float2>(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); @@ -1866,11 +1866,11 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double2>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double2>(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); @@ -1904,11 +1904,11 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); @@ -1940,11 +1940,11 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); @@ -1978,11 +1978,11 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); @@ -2014,11 +2014,11 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double>(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); @@ -2052,11 +2052,11 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float>(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); @@ -2088,11 +2088,11 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double>(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); @@ -2121,9 +2121,9 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x)); @@ -2148,9 +2148,9 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x)); @@ -2175,9 +2175,9 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x)); @@ -2202,9 +2202,9 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x)); @@ -2231,9 +2231,9 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x)); @@ -2258,9 +2258,9 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x)); @@ -2285,9 +2285,9 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x)); @@ -2312,9 +2312,9 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x)); @@ -2341,9 +2341,9 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x)); @@ -2368,9 +2368,9 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x)); @@ -2395,9 +2395,9 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float2>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x)); @@ -2422,9 +2422,9 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double2>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x)); @@ -2451,9 +2451,9 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x)); @@ -2478,9 +2478,9 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x)); @@ -2505,9 +2505,9 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x)); @@ -2532,9 +2532,9 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x)); @@ -2561,9 +2561,9 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x)); @@ -2588,9 +2588,9 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x)); @@ -2615,9 +2615,9 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x)); @@ -2642,9 +2642,9 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x)); @@ -2671,9 +2671,9 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const float*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<float*>(x)); @@ -2698,9 +2698,9 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const double*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<double*>(x)); @@ -2725,9 +2725,9 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float2>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const float2*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<float2*>(x)); @@ -2752,9 +2752,9 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double2>(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast<const double2*>(ap)); x_buffer.Write(queue, x_size, reinterpret_cast<double2*>(x)); @@ -2784,11 +2784,11 @@ void cblas_sger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float>(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y)); @@ -2816,11 +2816,11 @@ void cblas_dger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double>(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y)); @@ -2850,11 +2850,11 @@ void cblas_cgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float2>(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y)); @@ -2882,11 +2882,11 @@ void cblas_zgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double2>(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y)); @@ -2916,11 +2916,11 @@ void cblas_cgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float2>(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y)); @@ -2948,11 +2948,11 @@ void cblas_zgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double2>(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y)); @@ -2981,9 +2981,9 @@ void cblas_cher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); a_buffer.Write(queue, a_size, reinterpret_cast<float2*>(a)); @@ -3009,9 +3009,9 @@ void cblas_zher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); a_buffer.Write(queue, a_size, reinterpret_cast<double2*>(a)); @@ -3039,9 +3039,9 @@ void cblas_chpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float2>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast<float2*>(ap)); @@ -3067,9 +3067,9 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double2>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast<double2*>(ap)); @@ -3098,11 +3098,11 @@ void cblas_cher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float2>(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y)); @@ -3131,11 +3131,11 @@ void cblas_zher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double2>(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y)); @@ -3166,11 +3166,11 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float2>(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float2>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const float2*>(y)); @@ -3199,11 +3199,11 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double2>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double2>(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double2>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double2*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const double2*>(y)); @@ -3233,9 +3233,9 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); a_buffer.Write(queue, a_size, reinterpret_cast<float*>(a)); @@ -3261,9 +3261,9 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); a_buffer.Write(queue, a_size, reinterpret_cast<double*>(a)); @@ -3291,9 +3291,9 @@ void cblas_sspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast<float*>(ap)); @@ -3319,9 +3319,9 @@ void cblas_dspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast<double*>(ap)); @@ -3350,11 +3350,11 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float>(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<float>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y)); @@ -3383,11 +3383,11 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double>(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer<double>(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y)); @@ -3418,11 +3418,11 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<float>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<float>(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<float>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const float*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const float*>(y)); @@ -3451,11 +3451,11 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer<double>(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer<double>(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer<double>(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast<const double*>(x)); y_buffer.Write(queue, y_size, reinterpret_cast<const double*>(y)); @@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer<float>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<float>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b)); @@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer<double>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<double>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b)); @@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer<float2>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<float2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b)); @@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer<double2>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<double2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b)); @@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer<float>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<float>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b)); @@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer<double>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<double>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b)); @@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer<float2>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<float2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b)); @@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer<double2>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<double2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b)); @@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer<float2>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<float2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b)); @@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer<double2>(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer<double2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b)); @@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<float>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); c_buffer.Write(queue, c_size, reinterpret_cast<float*>(c)); @@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<double>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); c_buffer.Write(queue, c_size, reinterpret_cast<double*>(c)); @@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<float2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c)); @@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<double2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c)); @@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<float2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); c_buffer.Write(queue, c_size, reinterpret_cast<float2*>(c)); @@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<double2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); c_buffer.Write(queue, c_size, reinterpret_cast<double2*>(c)); @@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer<float>(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<float>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float*>(b)); @@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer<double>(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<double>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double*>(b)); @@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer<float2>(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<float2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b)); @@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer<double2>(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<double2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b)); @@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer<float2>(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<float2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const float2*>(b)); @@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer<double2>(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer<double2>(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<const double2*>(b)); @@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<float>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b)); @@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<double>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b)); @@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<float2>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b)); @@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<double2>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b)); @@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<float>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b)); @@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<double>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b)); @@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<float2>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b)); @@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer<double2>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b)); @@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer<float>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<float*>(b)); @@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer<double>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<double*>(b)); @@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<float2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer<float2>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const float2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<float2*>(b)); @@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer<double2>(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer<double2>(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast<const double2*>(a)); b_buffer.Write(queue, b_size, reinterpret_cast<double2*>(b)); |