From a17b714c3e2fee6e8c30bc2506eb284d1ee3ce31 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 5 Oct 2016 00:09:39 +0200 Subject: Added first version of Netlib BLAS API header --- scripts/generator/generator.py | 12 +++++-- scripts/generator/generator/cpp.py | 40 ++++++++++++++++++++++ scripts/generator/generator/datatype.py | 5 +++ scripts/generator/generator/routine.py | 59 +++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 2 deletions(-) mode change 100644 => 100755 scripts/generator/generator.py (limited to 'scripts') diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py old mode 100644 new mode 100755 index d82b13a6..68ae9cbe --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -12,6 +12,8 @@ # clblast.cpp # clblast_c.h # clblast_c.cpp +# clblast_blas.h +# clblast_blas.cpp # wrapper_clblas.h # wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in @@ -30,8 +32,8 @@ from generator.routine import Routine from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU -HEADER_LINES = [96, 73, 97, 22, 29, 41] -FOOTER_LINES = [17, 75, 19, 14, 6, 6] +HEADER_LINES = [96, 73, 97, 22, 29, 41, 43, 1] +FOOTER_LINES = [17, 75, 19, 14, 6, 6, 10, 1] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." 
@@ -132,6 +134,8 @@ def main(argv): library_root + "/src/clblast_c.cpp", library_root + "/test/wrapper_clblas.hpp", library_root + "/test/wrapper_cblas.hpp", + library_root + "/include/clblast_blas.h", + library_root + "/src/clblast_blas.cpp", ] # Checks whether the command-line arguments are valid; exists otherwise @@ -168,6 +172,10 @@ def main(argv): body += cpp.wrapper_clblas(routine) if i == 5: body += cpp.wrapper_cblas(routine) + if i == 6: + body += cpp.clblast_blas_h(routine) + if i == 7: + body += cpp.clblast_blas_cc(routine) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 427eb180..83ddbcb2 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -90,6 +90,46 @@ def clblast_c_cc(routine): return result +def clblast_blas_h(routine): + """The Netlib CBLAS API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + for flavour in routine.flavours: + result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL + return result + + +def clblast_blas_cc(routine): + """The Netlib CBLAS API implementation (.cpp)""" + result = NL + "// " + routine.name.upper() + NL + for flavour in routine.flavours: + template = "<" + flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (26 + routine.length() + len(template)) + result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL + + # Initialize OpenCL + result += " auto platform = Platform(size_t{0});" + NL + result += " auto device = Device(platform, size_t{0});" + NL + result += " auto context = Context(device);" + NL + result += " auto queue = Queue(context, device);" + NL + + # Copy data structures to the device + for name in routine.inputs + routine.outputs: + result += " " + routine.create_buffer(name, flavour.template, "0") + NL + for name in routine.inputs + routine.outputs: + 
result += " " + routine.write_buffer(name, "0") + NL + + # The function call + result += " auto status = clblast::" + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) + result += "," + NL + indent + "queue, event);" + NL + + # Copy back and clean-up + for name in routine.outputs: + result += " " + routine.read_buffer(name, "0") + NL + result += " return;" + NL + "}" + NL + return result + + def wrapper_clblas(routine): """The wrapper to the reference clBLAS routines (for performance/correctness testing)""" result = "" diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 9a6c6c02..29acc744 100644 --- a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -65,6 +65,11 @@ class DataType: return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) + def is_non_standard(self, scalar): + """Current scalar is of a non-standard type""" + return ((scalar == "alpha" and self.alpha_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2]) or + (scalar == "beta" and self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2])) + # Regular data-types H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16) diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index a4e682c2..4870b861 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -65,6 +65,21 @@ class Routine: """Distinguish between vectors and matrices""" return ["a", "b", "c", "ap"] + @staticmethod + def create_buffer(name, template, size): + """Creates a new CLCudaAPI buffer""" + return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + size + ");" + + @staticmethod + def write_buffer(name, size): + """Writes to a CLCudaAPI buffer""" + return name + "_buffer.Write(queue, " + size 
+ ", " + name + ");" + + @staticmethod + def read_buffer(name, size): + """Reads from a CLCudaAPI buffer""" + return name + "_buffer.Read(queue, " + size + ", " + name + ");" + def non_index_inputs(self): """Lists of input/output buffers not index (integer)""" buffers = self.inputs[:] # make a copy @@ -163,6 +178,16 @@ class Routine: return [", ".join(a + b + c)] return [] + def buffer_def_pointer(self, name, flavour): + """As above but as plain C pointer""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + data_type = "void" if flavour.is_non_standard(name) else flavour.buffer_type + a = [prefix + data_type + "* " + name + ""] + c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + c)] + return [] + def buffer_clcudaapi(self, name): """As above but with CLCudaAPI buffers""" if name in self.inputs or name in self.outputs: @@ -288,6 +313,16 @@ class Routine: return ["const " + flavour.beta_cpp + " " + name] return [] + def scalar_def_void(self, name, flavour): + """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer in case of non-standard types""" + if name in self.scalars: + if name == "alpha": + data_type = "void*" if flavour.is_non_standard(name) else flavour.alpha_cpp + return ["const " + data_type + " " + name] + data_type = "void*" if flavour.is_non_standard(name) else flavour.beta_cpp + return ["const " + data_type + " " + name] + return [] + def scalar_type(self, name, flavour): """Retrieves the type of a scalar (alpha/beta)""" if name in self.scalars: @@ -316,6 +351,12 @@ class Routine: return [", ".join(["const size_t " + s for s in self.sizes])] return [] + def sizes_def_netlib(self): + """Retrieves the definition of the sizes (m,n,k) for the CBLAS API""" + if self.sizes: + return [", ".join(["const int " + s for s in self.sizes])] + return [] + def sizes_type(self): """Retrieves the types of the sizes 
(m,n,k)""" if self.sizes: @@ -453,6 +494,17 @@ class Routine: list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + def arguments_def_netlib(self, flavour): + """As above, but for the Netlib CBLAS API""" + return (self.options_def() + self.sizes_def_netlib() + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_void("alpha", flavour) + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_void("beta", flavour) + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + def arguments_def_wrapper_clblas(self, flavour): """As above, but clBLAS wrapper plain data-types""" return (self.options_def_wrapper_clblas() + self.sizes_def() + @@ -528,6 +580,13 @@ class Routine: result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" return result + def routine_header_netlib(self, flavour, spaces, extra_qualifier): + """As above, but now for the original Netlib CBLAS API""" + indent = " " * (spaces + self.length()) + result = "void" + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" + result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" + return result + def routine_header_wrapper_clblas(self, flavour, def_only, spaces): """As above, but now for the clBLAS wrapper""" template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" -- cgit v1.2.3 From 8d5747aa54b88812ef4060328e3befdb13f3f45a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 5 Oct 2016 08:23:54 +0200 Subject: Made non-standard types void-pointers in the Netlib BLAS interface --- include/clblast_blas.h | 538 
++++++++++++++++---------------- scripts/generator/generator/datatype.py | 7 +- scripts/generator/generator/routine.py | 6 +- 3 files changed, 275 insertions(+), 276 deletions(-) (limited to 'scripts') diff --git a/include/clblast_blas.h b/include/clblast_blas.h index 41b03446..a5d0cc9c 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -98,14 +98,14 @@ void PUBLIC_API cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc); void PUBLIC_API cblas_cswap(const int n, - float2* x, const int x_inc, - float2* y, const int y_inc); + void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_zswap(const int n, - double2* x, const int x_inc, - double2* y, const int y_inc); + void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_hswap(const int n, - half* x, const int x_inc, - half* y, const int y_inc); + void* x, const int x_inc, + void* y, const int y_inc); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL void PUBLIC_API cblas_sscal(const int n, @@ -116,13 +116,13 @@ void PUBLIC_API cblas_dscal(const int n, double* x, const int x_inc); void PUBLIC_API cblas_cscal(const int n, const void* alpha, - float2* x, const int x_inc); + void* x, const int x_inc); void PUBLIC_API cblas_zscal(const int n, const void* alpha, - double2* x, const int x_inc); + void* x, const int x_inc); void PUBLIC_API cblas_hscal(const int n, const void* alpha, - half* x, const int x_inc); + void* x, const int x_inc); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY void PUBLIC_API cblas_scopy(const int n, @@ -132,14 +132,14 @@ void PUBLIC_API cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc); void PUBLIC_API cblas_ccopy(const int n, - const float2* x, const int x_inc, - float2* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_zcopy(const int n, - const double2* x, const int x_inc, - double2* y, const int y_inc); + const void* x, 
const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_hcopy(const int n, - const half* x, const int x_inc, - half* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY void PUBLIC_API cblas_saxpy(const int n, @@ -152,16 +152,16 @@ void PUBLIC_API cblas_daxpy(const int n, double* y, const int y_inc); void PUBLIC_API cblas_caxpy(const int n, const void* alpha, - const float2* x, const int x_inc, - float2* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_zaxpy(const int n, const void* alpha, - const double2* x, const int x_inc, - double2* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); void PUBLIC_API cblas_haxpy(const int n, const void* alpha, - const half* x, const int x_inc, - half* y, const int y_inc); + const void* x, const int x_inc, + void* y, const int y_inc); // Dot product of two vectors: SDOT/DDOT/HDOT void PUBLIC_API cblas_sdot(const int n, @@ -173,29 +173,29 @@ void PUBLIC_API cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc); void PUBLIC_API cblas_hdot(const int n, - half* dot, - const half* x, const int x_inc, - const half* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU void PUBLIC_API cblas_cdotu(const int n, - float2* dot, - const float2* x, const int x_inc, - const float2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); void PUBLIC_API cblas_zdotu(const int n, - double2* dot, - const double2* x, const int x_inc, - const double2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC void PUBLIC_API cblas_cdotc(const int n, - float2* 
dot, - const float2* x, const int x_inc, - const float2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); void PUBLIC_API cblas_zdotc(const int n, - double2* dot, - const double2* x, const int x_inc, - const double2* y, const int y_inc); + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 void PUBLIC_API cblas_snrm2(const int n, @@ -205,14 +205,14 @@ void PUBLIC_API cblas_dnrm2(const int n, double* nrm2, const double* x, const int x_inc); void PUBLIC_API cblas_scnrm2(const int n, - float2* nrm2, - const float2* x, const int x_inc); + void* nrm2, + const void* x, const int x_inc); void PUBLIC_API cblas_dznrm2(const int n, - double2* nrm2, - const double2* x, const int x_inc); + void* nrm2, + const void* x, const int x_inc); void PUBLIC_API cblas_hnrm2(const int n, - half* nrm2, - const half* x, const int x_inc); + void* nrm2, + const void* x, const int x_inc); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM void PUBLIC_API cblas_sasum(const int n, @@ -222,14 +222,14 @@ void PUBLIC_API cblas_dasum(const int n, double* asum, const double* x, const int x_inc); void PUBLIC_API cblas_scasum(const int n, - float2* asum, - const float2* x, const int x_inc); + void* asum, + const void* x, const int x_inc); void PUBLIC_API cblas_dzasum(const int n, - double2* asum, - const double2* x, const int x_inc); + void* asum, + const void* x, const int x_inc); void PUBLIC_API cblas_hasum(const int n, - half* asum, - const half* x, const int x_inc); + void* asum, + const void* x, const int x_inc); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM void PUBLIC_API cblas_ssum(const int n, @@ -239,14 +239,14 @@ void PUBLIC_API cblas_dsum(const int n, double* sum, const double* x, const int x_inc); void PUBLIC_API cblas_scsum(const int n, - float2* sum, - const float2* x, const int x_inc); + void* 
sum, + const void* x, const int x_inc); void PUBLIC_API cblas_dzsum(const int n, - double2* sum, - const double2* x, const int x_inc); + void* sum, + const void* x, const int x_inc); void PUBLIC_API cblas_hsum(const int n, - half* sum, - const half* x, const int x_inc); + void* sum, + const void* x, const int x_inc); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void PUBLIC_API cblas_isamax(const int n, @@ -256,14 +256,14 @@ void PUBLIC_API cblas_idamax(const int n, double* imax, const double* x, const int x_inc); void PUBLIC_API cblas_icamax(const int n, - float2* imax, - const float2* x, const int x_inc); + void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_izamax(const int n, - double2* imax, - const double2* x, const int x_inc); + void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_ihamax(const int n, - half* imax, - const half* x, const int x_inc); + void* imax, + const void* x, const int x_inc); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX void PUBLIC_API cblas_ismax(const int n, @@ -273,14 +273,14 @@ void PUBLIC_API cblas_idmax(const int n, double* imax, const double* x, const int x_inc); void PUBLIC_API cblas_icmax(const int n, - float2* imax, - const float2* x, const int x_inc); + void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_izmax(const int n, - double2* imax, - const double2* x, const int x_inc); + void* imax, + const void* x, const int x_inc); void PUBLIC_API cblas_ihmax(const int n, - half* imax, - const half* x, const int x_inc); + void* imax, + const void* x, const int x_inc); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN void PUBLIC_API cblas_ismin(const int n, @@ -290,14 +290,14 @@ void PUBLIC_API cblas_idmin(const int n, double* imin, const double* x, const int x_inc); void PUBLIC_API cblas_icmin(const int n, - float2* imin, - const float2* x, const int x_inc); + void* 
imin, + const void* x, const int x_inc); void PUBLIC_API cblas_izmin(const int n, - double2* imin, - const double2* x, const int x_inc); + void* imin, + const void* x, const int x_inc); void PUBLIC_API cblas_ihmin(const int n, - half* imin, - const half* x, const int x_inc); + void* imin, + const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -321,24 +321,24 @@ void PUBLIC_API cblas_dgemv(const Layout layout, const Transpose a_transpose, void PUBLIC_API cblas_cgemv(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - const double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_hgemv(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, @@ -358,72 +358,72 @@ void PUBLIC_API cblas_dgbmv(const Layout layout, const Transpose a_transpose, void PUBLIC_API cblas_cgbmv(const Layout layout, const Transpose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, - const 
float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, - const double2* a, const int a_ld, - const double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_hgbmv(const Layout layout, const Transpose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zhemv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* a, const int a_ld, - const double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); // Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV void PUBLIC_API cblas_chbmv(const Layout layout, const Triangle triangle, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, - const float2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - float2* y, const 
int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zhbmv(const Layout layout, const Triangle triangle, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV void PUBLIC_API cblas_chpmv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* ap, - const float2* x, const int x_inc, + const void* ap, + const void* x, const int x_inc, const void* beta, - float2* y, const int y_inc); + void* y, const int y_inc); void PUBLIC_API cblas_zhpmv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* ap, - const double2* x, const int x_inc, + const void* ap, + const void* x, const int x_inc, const void* beta, - double2* y, const int y_inc); + void* y, const int y_inc); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV void PUBLIC_API cblas_ssymv(const Layout layout, const Triangle triangle, @@ -443,10 +443,10 @@ void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsymv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, + const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, @@ -466,10 +466,10 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsbmv(const Layout layout, const Triangle triangle, const int n, const int k, const void* alpha, - const half* a, const int a_ld, - const half* x, const int x_inc, 
+ const void* a, const int a_ld, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, @@ -489,10 +489,10 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hspmv(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* ap, - const half* x, const int x_inc, + const void* ap, + const void* x, const int x_inc, const void* beta, - half* y, const int y_inc); + void* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -505,16 +505,16 @@ void PUBLIC_API cblas_dtrmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const float2* a, const int a_ld, - float2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* a, const int a_ld, - double2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_htrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const half* a, const int a_ld, - half* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV void PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const 
Diagonal diagonal, @@ -527,16 +527,16 @@ void PUBLIC_API cblas_dtbmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const float2* a, const int a_ld, - float2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const double2* a, const int a_ld, - double2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_htbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const half* a, const int a_ld, - half* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -549,16 +549,16 @@ void PUBLIC_API cblas_dtpmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const float2* ap, - float2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* ap, - double2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); void PUBLIC_API cblas_htpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const half* ap, - half* x, const int 
x_inc); + const void* ap, + void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -571,12 +571,12 @@ void PUBLIC_API cblas_dtrsv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const float2* a, const int a_ld, - float2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* a, const int a_ld, - double2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV void PUBLIC_API cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -589,12 +589,12 @@ void PUBLIC_API cblas_dtbsv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const float2* a, const int a_ld, - float2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); void PUBLIC_API cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, const int k, - const double2* a, const int a_ld, - double2* x, const int x_inc); + const void* a, const int a_ld, + void* x, const int x_inc); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV void PUBLIC_API cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, 
const Diagonal diagonal, @@ -607,12 +607,12 @@ void PUBLIC_API cblas_dtpsv(const Layout layout, const Triangle triangle, const double* x, const int x_inc); void PUBLIC_API cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const float2* ap, - float2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); void PUBLIC_API cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int n, - const double2* ap, - double2* x, const int x_inc); + const void* ap, + void* x, const int x_inc); // General rank-1 matrix update: SGER/DGER/HGER void PUBLIC_API cblas_sger(const Layout layout, @@ -630,89 +630,89 @@ void PUBLIC_API cblas_dger(const Layout layout, void PUBLIC_API cblas_hger(const Layout layout, const int m, const int n, const void* alpha, - const half* x, const int x_inc, - const half* y, const int y_inc, - half* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU void PUBLIC_API cblas_cgeru(const Layout layout, const int m, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zgeru(const Layout layout, const int m, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // General rank-1 complex conjugated matrix update: CGERC/ZGERC void PUBLIC_API cblas_cgerc(const Layout layout, const int m, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* a, const int a_ld); + const 
void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zgerc(const Layout layout, const int m, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // Hermitian rank-1 matrix update: CHER/ZHER void PUBLIC_API cblas_cher(const Layout layout, const Triangle triangle, const int n, - const float alpha, - const float2* x, const int x_inc, - float2* a, const int a_ld); + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zher(const Layout layout, const Triangle triangle, const int n, - const double alpha, - const double2* x, const int x_inc, - double2* a, const int a_ld); + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld); // Hermitian packed rank-1 matrix update: CHPR/ZHPR void PUBLIC_API cblas_chpr(const Layout layout, const Triangle triangle, const int n, - const float alpha, - const float2* x, const int x_inc, - float2* ap); + const void* alpha, + const void* x, const int x_inc, + void* ap); void PUBLIC_API cblas_zhpr(const Layout layout, const Triangle triangle, const int n, - const double alpha, - const double2* x, const int x_inc, - double2* ap); + const void* alpha, + const void* x, const int x_inc, + void* ap); // Hermitian rank-2 matrix update: CHER2/ZHER2 void PUBLIC_API cblas_cher2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* a, const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); void PUBLIC_API cblas_zher2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* a, 
const int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 void PUBLIC_API cblas_chpr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const float2* x, const int x_inc, - const float2* y, const int y_inc, - float2* ap); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); void PUBLIC_API cblas_zhpr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const double2* x, const int x_inc, - const double2* y, const int y_inc, - double2* ap); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR void PUBLIC_API cblas_ssyr(const Layout layout, const Triangle triangle, @@ -728,8 +728,8 @@ void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsyr(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* x, const int x_inc, - half* a, const int a_ld); + const void* x, const int x_inc, + void* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle, @@ -745,8 +745,8 @@ void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hspr(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* x, const int x_inc, - half* ap); + const void* x, const int x_inc, + void* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 void PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, @@ -764,9 +764,9 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hsyr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* x, const int x_inc, - const half* y, const int y_inc, - half* a, const 
int a_ld); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, @@ -784,9 +784,9 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, void PUBLIC_API cblas_hspr2(const Layout layout, const Triangle triangle, const int n, const void* alpha, - const half* x, const int x_inc, - const half* y, const int y_inc, - half* ap); + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -810,24 +810,24 @@ void PUBLIC_API cblas_dgemm(const Layout layout, const Transpose a_transpose, co void PUBLIC_API cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const int m, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const int m, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const int m, const int n, const int k, const void* alpha, - const half* a, const int a_ld, - const half* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Symmetric 
matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, @@ -847,40 +847,40 @@ void PUBLIC_API cblas_dsymm(const Layout layout, const Side side, const Triangle void PUBLIC_API cblas_csymm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hsymm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - const half* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - 
double2* c, const int c_ld); + void* c, const int c_ld); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK void PUBLIC_API cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, @@ -898,35 +898,35 @@ void PUBLIC_API cblas_dsyrk(const Layout layout, const Triangle triangle, const void PUBLIC_API cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, + const void* a, const int a_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, + const void* a, const int a_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, const void* alpha, - const half* a, const int a_ld, + const void* a, const int a_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, - const float alpha, - const float2* a, const int a_ld, - const float beta, - float2* c, const int c_ld); + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); void PUBLIC_API cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const int n, const int k, - const double alpha, - const double2* a, const int a_ld, - const double beta, - double2* c, const int c_ld); + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); // Rank-2K update of a symmetric matrix: 
SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K void PUBLIC_API cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, @@ -946,40 +946,40 @@ void PUBLIC_API cblas_dsyr2k(const Layout layout, const Triangle triangle, const void PUBLIC_API cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - float2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - double2* c, const int c_ld); + void* c, const int c_ld); void PUBLIC_API cblas_hsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const half* a, const int a_ld, - const half* b, const int b_ld, + const void* a, const int a_ld, + const void* b, const int b_ld, const void* beta, - half* c, const int c_ld); + void* c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K void PUBLIC_API cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const float2* a, const int a_ld, - const float2* b, const int b_ld, - const float beta, - float2* c, const int c_ld); + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); void PUBLIC_API cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const int n, const int k, const void* alpha, - const double2* a, const int a_ld, - const double2* b, const int b_ld, - const 
double beta, - double2* c, const int c_ld); + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM void PUBLIC_API cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -995,18 +995,18 @@ void PUBLIC_API cblas_dtrmm(const Layout layout, const Side side, const Triangle void PUBLIC_API cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - float2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - double2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_htrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - half* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1022,18 +1022,18 @@ void PUBLIC_API cblas_dtrsm(const Layout layout, const Side side, const Triangle void PUBLIC_API cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - float2* b, const int 
b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - double2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_htrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - half* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -1053,18 +1053,18 @@ void PUBLIC_API cblas_domatcopy(const Layout layout, const Transpose a_transpose void PUBLIC_API cblas_comatcopy(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const float2* a, const int a_ld, - float2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const double2* a, const int a_ld, - double2* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); void PUBLIC_API cblas_homatcopy(const Layout layout, const Transpose a_transpose, const int m, const int n, const void* alpha, - const half* a, const int a_ld, - half* b, const int b_ld); + const void* a, const int a_ld, + void* b, const int b_ld); half* b, const size_t b_offset, const size_t b_ld); // ================================================================================================= diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 29acc744..01f32dd8 100644 --- 
a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -65,10 +65,9 @@ class DataType: return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) - def is_non_standard(self, scalar): - """Current scalar is of a non-standard type""" - return ((scalar == "alpha" and self.alpha_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2]) or - (scalar == "beta" and self.beta_cpp in [D_HALF, D_FLOAT2, D_DOUBLE2])) + def is_non_standard(self): + """Current type is of a non-standard type""" + return self.buffer_type in [D_HALF, D_FLOAT2, D_DOUBLE2] # Regular data-types diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 4870b861..126d64ce 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -182,7 +182,7 @@ class Routine: """As above but as plain C pointer""" prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: - data_type = "void" if flavour.is_non_standard(name) else flavour.buffer_type + data_type = "void" if flavour.is_non_standard() else flavour.buffer_type a = [prefix + data_type + "* " + name + ""] c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + c)] @@ -317,9 +317,9 @@ class Routine: """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer in case of non-standard types""" if name in self.scalars: if name == "alpha": - data_type = "void*" if flavour.is_non_standard(name) else flavour.alpha_cpp + data_type = "void*" if flavour.is_non_standard() else flavour.alpha_cpp return ["const " + data_type + " " + name] - data_type = "void*" if flavour.is_non_standard(name) else flavour.beta_cpp + data_type = "void*" if flavour.is_non_standard() else flavour.beta_cpp return ["const " + data_type + " " + name] return [] -- cgit v1.2.3 From 
f96fd372bc3087938572ebc55bd1d8e1b7e6f18a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 14:28:52 +0200 Subject: Added initial version of a Netlib CBLAS implementation. TODO: Set correct buffer sizes --- CMakeLists.txt | 2 + include/clblast_blas.h | 158 +- include/clblast_c.h | 5 - scripts/generator/generator.py | 108 +- scripts/generator/generator/cpp.py | 64 +- scripts/generator/generator/datatype.py | 16 + scripts/generator/generator/routine.py | 59 +- src/clblast_blas.cpp | 4651 +++++++++++++++++++++++++++++++ 8 files changed, 4817 insertions(+), 246 deletions(-) create mode 100644 src/clblast_blas.cpp (limited to 'scripts') diff --git a/CMakeLists.txt b/CMakeLists.txt index f5edbd75..d2034617 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ set(PRECISIONS 32 64 3232 6464 16) # Gathers all source-files set(SOURCES + src/clblast_blas.cpp src/database/database.cpp src/routines/common.cpp src/utilities/clblast_exceptions.cpp @@ -213,6 +214,7 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) +install(FILES include/clblast_blas.h DESTINATION include) # Installs the config for find_package in dependent projects install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) diff --git a/include/clblast_blas.h b/include/clblast_blas.h index a5d0cc9c..b4db4192 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -18,8 +18,8 @@ // Exports library functions under Windows when building a DLL. 
See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#ifdef _WIN32 - #ifdef COMPILING_DLL +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) #define PUBLIC_API __declspec(dllexport) #else #define PUBLIC_API __declspec(dllimport) @@ -42,6 +42,7 @@ typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle; typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal; typedef enum Side_ { kLeft = 141, kRight = 142 } Side; + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -103,9 +104,6 @@ void PUBLIC_API cblas_cswap(const int n, void PUBLIC_API cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_hswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc); // Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL void PUBLIC_API cblas_sscal(const int n, @@ -120,9 +118,6 @@ void PUBLIC_API cblas_cscal(const int n, void PUBLIC_API cblas_zscal(const int n, const void* alpha, void* x, const int x_inc); -void PUBLIC_API cblas_hscal(const int n, - const void* alpha, - void* x, const int x_inc); // Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY void PUBLIC_API cblas_scopy(const int n, @@ -137,9 +132,6 @@ void PUBLIC_API cblas_ccopy(const int n, void PUBLIC_API cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_hcopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc); // Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY void PUBLIC_API cblas_saxpy(const int n, @@ -158,10 +150,6 @@ void PUBLIC_API cblas_zaxpy(const int n, const void* alpha, const void* x, const int x_inc, void* y, const int y_inc); -void PUBLIC_API cblas_haxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - 
void* y, const int y_inc); // Dot product of two vectors: SDOT/DDOT/HDOT void PUBLIC_API cblas_sdot(const int n, @@ -172,10 +160,6 @@ void PUBLIC_API cblas_ddot(const int n, double* dot, const double* x, const int x_inc, const double* y, const int y_inc); -void PUBLIC_API cblas_hdot(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU void PUBLIC_API cblas_cdotu(const int n, @@ -210,9 +194,6 @@ void PUBLIC_API cblas_scnrm2(const int n, void PUBLIC_API cblas_dznrm2(const int n, void* nrm2, const void* x, const int x_inc); -void PUBLIC_API cblas_hnrm2(const int n, - void* nrm2, - const void* x, const int x_inc); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM void PUBLIC_API cblas_sasum(const int n, @@ -227,9 +208,6 @@ void PUBLIC_API cblas_scasum(const int n, void PUBLIC_API cblas_dzasum(const int n, void* asum, const void* x, const int x_inc); -void PUBLIC_API cblas_hasum(const int n, - void* asum, - const void* x, const int x_inc); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM void PUBLIC_API cblas_ssum(const int n, @@ -244,9 +222,6 @@ void PUBLIC_API cblas_scsum(const int n, void PUBLIC_API cblas_dzsum(const int n, void* sum, const void* x, const int x_inc); -void PUBLIC_API cblas_hsum(const int n, - void* sum, - const void* x, const int x_inc); // Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX void PUBLIC_API cblas_isamax(const int n, @@ -261,9 +236,6 @@ void PUBLIC_API cblas_icamax(const int n, void PUBLIC_API cblas_izamax(const int n, void* imax, const void* x, const int x_inc); -void PUBLIC_API cblas_ihamax(const int n, - void* imax, - const void* x, const int x_inc); // Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX void PUBLIC_API cblas_ismax(const int n, @@ -278,9 +250,6 @@ void PUBLIC_API cblas_icmax(const int n, void PUBLIC_API 
cblas_izmax(const int n, void* imax, const void* x, const int x_inc); -void PUBLIC_API cblas_ihmax(const int n, - void* imax, - const void* x, const int x_inc); // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN void PUBLIC_API cblas_ismin(const int n, @@ -295,9 +264,6 @@ void PUBLIC_API cblas_icmin(const int n, void PUBLIC_API cblas_izmin(const int n, void* imin, const void* x, const int x_inc); -void PUBLIC_API cblas_ihmin(const int n, - void* imin, - const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines @@ -332,13 +298,6 @@ void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_hgemv(const Layout layout, const Transpose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, @@ -369,13 +328,6 @@ void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_hgbmv(const Layout layout, const Transpose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, @@ -440,13 +392,6 @@ void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int 
y_inc); -void PUBLIC_API cblas_hsymv(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, @@ -463,13 +408,6 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_hsbmv(const Layout layout, const Triangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, @@ -486,13 +424,6 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_hspmv(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -511,10 +442,6 @@ void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const const int n, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_htrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV void 
PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -533,10 +460,6 @@ void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_htbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -555,10 +478,6 @@ void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const const int n, const void* ap, void* x, const int x_inc); -void PUBLIC_API cblas_htpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -627,12 +546,6 @@ void PUBLIC_API cblas_dger(const Layout layout, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hger(const Layout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU void PUBLIC_API cblas_cgeru(const Layout layout, @@ -725,11 +638,6 @@ void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, const double alpha, const double* x, const int x_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hsyr(const Layout layout, const Triangle triangle, - const 
int n, - const void* alpha, - const void* x, const int x_inc, - void* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle, @@ -742,11 +650,6 @@ void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, const double alpha, const double* x, const int x_inc, double* ap); -void PUBLIC_API cblas_hspr(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - void* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 void PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, @@ -761,12 +664,6 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* a, const int a_ld); -void PUBLIC_API cblas_hsyr2(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, @@ -781,12 +678,6 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, const double* x, const int x_inc, const double* y, const int y_inc, double* ap); -void PUBLIC_API cblas_hspr2(const Layout layout, const Triangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines @@ -821,13 +712,6 @@ void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, co const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, - const 
int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, @@ -858,13 +742,6 @@ void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsymm(const Layout layout, const Side side, const Triangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, @@ -907,12 +784,6 @@ void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const const void* a, const int a_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, @@ -957,13 +828,6 @@ void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_hsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K void PUBLIC_API 
cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, @@ -1002,11 +866,6 @@ void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_htrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, @@ -1029,11 +888,6 @@ void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_htrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); // ================================================================================================= // Extra non-BLAS routines (level-X) @@ -1060,12 +914,6 @@ void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_homatcopy(const Layout layout, const Transpose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - half* b, const size_t b_offset, const size_t b_ld); // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index 81f093cd..72f50d83 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -117,11 +117,6 
@@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; -// Precision scoped enum (values in bits) -typedef enum CLBlastPrecision_ { CLBlastPrecisionHalf = 16, CLBlastPrecisionSingle = 32, - CLBlastPrecisionDouble = 64, CLBlastPrecisionComplexSingle = 3232, - CLBlastPrecisionComplexDouble = 6464 } CLBlastPrecision; - // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 220b314d..4ba97ff8 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,8 +41,8 @@ FILES = [ "/include/clblast_blas.h", "/src/clblast_blas.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 43, 1] -FOOTER_LINES = [17, 80, 19, 18, 6, 6, 10, 1] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 44, 32] +FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -64,65 +64,65 @@ cld_n = "The value of `c_ld` must be at least `n`." 
# Populates a list of routines ROUTINES = [ [ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ 
elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", 
"symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], 
["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], 
["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - 
Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", 
"Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + 
Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], 
["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - 
Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. 
Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian 
matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. 
The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 61730fdb..23a2207c 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -99,7 +99,8 @@ def clblast_blas_h(routine): """The Netlib CBLAS API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL for flavour in routine.flavours: - result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL + if flavour.precision_name in ["S", "D", "C", "Z"]: + result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL return result @@ -107,31 +108,44 @@ def clblast_blas_cc(routine): """The Netlib CBLAS API implementation (.cpp)""" result = NL + "// " + routine.name.upper() + NL for flavour in routine.flavours: - template = "<" + flavour.template + ">" if routine.no_scalars() else "" - indent = " " * (26 + routine.length() + len(template)) - result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL - - # Initialize OpenCL - result += " auto platform = Platform(size_t{0});" + NL - result += " auto device = Device(platform, size_t{0});" + NL - result += " auto context = Context(device);" + NL - result += " auto queue = Queue(context, device);" + NL - - # Copy data structures to the device - for name in routine.inputs + routine.outputs: - result += " " + routine.create_buffer(name, flavour.template, "0") + NL - for name in routine.inputs + routine.outputs: - result += " " + routine.write_buffer(name, "0") + NL - - # The function call - result += " auto status = clblast::" + routine.name.capitalize() + template + "(" - result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) - result += "," + NL + indent + "queue, event);" + NL - # Copy back and clean-up - for name in routine.outputs: - result += " " + routine.read_buffer(name, 
"0") + NL - result += " return;" + NL + "}" + NL + # There is a version available in CBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + template = "<" + flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (12 + routine.length() + len(template)) + result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL + + # Initialize OpenCL + result += " auto device = get_device();" + NL + result += " auto context = Context(device);" + NL + result += " auto queue = Queue(context, device);" + NL + + # Set alpha and beta + result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour)) + + # Copy data structures to the device + for i, name in enumerate(routine.inputs + routine.outputs): + result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL + result += " " + routine.create_buffer(name, flavour.buffer_type) + NL + for name in routine.inputs + routine.outputs: + prefix = "" if name in routine.outputs else "const " + result += " " + routine.write_buffer(name, prefix + flavour.buffer_type) + NL + + # The function call + result += " auto queue_cl = queue();" + NL + result += " auto s = " + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)]) + result += "," + NL + indent + "&queue_cl);" + NL + + # Error handling + result += " if (s != StatusCode::kSuccess) {" + NL + result += " throw std::runtime_error(\"CLBlast returned with error code \" + ToString(s));" + NL + result += " }" + NL + + # Copy back and clean-up + for name in routine.outputs: + result += " " + routine.read_buffer(name, flavour.buffer_type) + NL + result += "}" + NL return result diff --git a/scripts/generator/generator/datatype.py b/scripts/generator/generator/datatype.py index 01f32dd8..98874174 100644 --- a/scripts/generator/generator/datatype.py +++ b/scripts/generator/generator/datatype.py @@ -54,6 +54,22 @@ class DataType: return self.beta_cl + "{{beta.real(), 
beta.imag()}}" return "beta" + def use_alpha_clblast(self): + """Transforms a Netlib CBLAS parameter to CLBlast style""" + if self.alpha_cpp == D_FLOAT2: + return self.alpha_cpp + "{reinterpret_cast<const float*>(alpha)[0], reinterpret_cast<const float*>(alpha)[1]}" + elif self.alpha_cpp == D_DOUBLE2: + return self.alpha_cpp + "{reinterpret_cast<const double*>(alpha)[0], reinterpret_cast<const double*>(alpha)[1]}" + return "alpha" + + def use_beta_clblast(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp == D_FLOAT2: + return self.beta_cpp + "{reinterpret_cast<const float*>(beta)[0], reinterpret_cast<const float*>(beta)[1]}" + elif self.beta_cpp == D_DOUBLE2: + return self.beta_cpp + "{reinterpret_cast<const double*>(beta)[0], reinterpret_cast<const double*>(beta)[1]}" + return "beta" + def test_template(self): """Returns the template as used in the correctness/performance tests""" if self.buffer_type != self.beta_cpp: diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 795fc532..b988c91a 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -13,7 +13,8 @@ import generator.convert as convert class Routine: """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, - inputs, outputs, scalars, scratch, description, details, requirements): + inputs, outputs, buffer_sizes, scalars, scratch, + description, details, requirements): self.implemented = implemented self.has_tests = has_tests self.level = level @@ -24,6 +25,7 @@ class Routine: self.options = options self.inputs = inputs self.outputs = outputs + self.buffer_sizes = buffer_sizes self.scalars = scalars self.scratch = scratch # Scratch buffer (e.g.
for xDOT) self.description = description @@ -66,19 +68,26 @@ class Routine: return ["a", "b", "c", "ap"] @staticmethod - def create_buffer(name, template, size): + def set_size(name, size): + """Sets the size of a buffer""" + return "const auto " + name + "_size = " + size + ";" + + @staticmethod + def create_buffer(name, template): """Creates a new CLCudaAPI buffer""" - return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + size + ");" + return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + name + "_size);" @staticmethod - def write_buffer(name, size): + def write_buffer(name, template): """Writes to a CLCudaAPI buffer""" - return name + "_buffer.Write(queue, " + size + ", " + name + ");" + data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");" @staticmethod - def read_buffer(name, size): + def read_buffer(name, template): """Reads from a CLCudaAPI buffer""" - return name + "_buffer.Read(queue, " + size + ", " + name + ");" + data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + return name + "_buffer.Read(queue, " + name + "_size, " + data_structure + ");" def non_index_inputs(self): """Lists of input/output buffers not index (integer)""" @@ -148,6 +157,15 @@ class Routine: return [", ".join(a + b + c)] return [] + def buffer_zero_offset(self, name): + """As above, but with an offset value of zero""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer()"] + b = ["0"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + def buffer_def(self, name): """As above but with data-types""" prefix = "const " if name in self.inputs else "" @@ -263,6 +281,12 @@ class Routine: return [name] return [] + def scalar_cpp(self, name): + """As above, but with _cpp as a suffix""" + if name in self.scalars: + return [name + 
"_cpp"] + return [] + def scalar_half_to_float(self, name): """As above, but converts from float to half""" if name in self.scalars: @@ -339,6 +363,16 @@ class Routine: return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] return [] + def scalar_create_cpp(self, flavour): + """Creates a C++ version of a scalar based on a void*""" + result = [] + for name in self.scalars: + if name == "alpha": + result.append("const auto alpha_cpp = " + flavour.use_alpha_clblast() + ";") + elif name == "beta": + result.append("const auto beta_cpp = " + flavour.use_beta_clblast() + ";") + return result + def sizes_list(self): """Retrieves a list of comma-separated sizes (m, n, k)""" if self.sizes: @@ -469,6 +503,17 @@ class Routine: list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + def arguments_netlib(self, flavour, indent): + """As above, but for the Netlib CBLAS API""" + return (self.options_cast(indent) + self.sizes_list() + + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_first()])) + + self.scalar_cpp("alpha") + + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_first()])) + + self.scalar_cpp("beta") + + list(chain(*[self.buffer_zero_offset(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_zero_offset(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + def arguments_wrapper_clblas(self, flavour): """As above, but for the clBLAS wrapper""" return (self.options_list() + self.sizes_list() + diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp new file mode 100644 index 00000000..286b1ba8 --- /dev/null +++ b/src/clblast_blas.cpp @@ -0,0 +1,4651 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. 
The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS implementations to the CLBlast BLAS routines, performing buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. +// +// ================================================================================================= + +#include + +#include "clblast_blas.h" +#include "clblast.h" +#include "utilities/utilities.hpp" + +namespace clblast { + +// ================================================================================================= + +// Helper function to get a default OpenCL platform and device +Device get_device() { + auto platform_id = ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); + auto device_id = ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); + auto platform = Platform(platform_id); + return Device(platform, device_id); +} + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// ROTG +void cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sa_size = 1; + auto sa_buffer = Buffer(context, sa_size); + const auto sb_size = 1; + auto sb_buffer = Buffer(context, sb_size); + const auto sc_size = 1; + auto sc_buffer = Buffer(context, sc_size); + const auto ss_size = 1; + auto ss_buffer = Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, 
sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} +void cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sa_size = 1; + auto sa_buffer = Buffer(context, sa_size); + const auto sb_size = 1; + auto sb_buffer = Buffer(context, sb_size); + const auto sc_size = 1; + auto sc_buffer = Buffer(context, sc_size); + const auto ss_size = 1; + auto ss_buffer = Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} + +// ROTMG +void cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float* sy1, + float* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const 
auto sy1_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + const auto sd1_size = 1; + auto sd1_buffer = Buffer(context, sd1_size); + const auto sd2_size = 1; + auto sd2_buffer = Buffer(context, sd2_size); + const auto sx1_size = 1; + auto sx1_buffer = Buffer(context, sx1_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double* sy1, + double* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto sy1_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + const auto sd1_size = 1; + auto sd1_buffer = Buffer(context, sd1_size); + const auto sd2_size = 1; + auto sd2_buffer = Buffer(context, sd2_size); + const auto sx1_size = 1; + auto sx1_buffer = Buffer(context, sx1_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, 
sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// ROT +void cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Rot(n, + x_buffer(), 0, 
x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// ROTM +void cblas_srotm(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + float* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto sparam_size = 1; + auto sparam_buffer = Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = Rotm(n, + x_buffer(), 
0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// SWAP +void cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cswap(const int n, + void* x, const int x_inc, + void* y, const 
int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SCAL +void cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with 
error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_cscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_zscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + 
x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// COPY +void cblas_scopy(const int n, + const float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dcopy(const int n, + const double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_ccopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = 
Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zcopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// AXPY +void cblas_saxpy(const int n, + const float alpha, + const float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + 
const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_caxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zaxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + 
auto s = Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// DOT +void cblas_sdot(const int n, + float* dot, + const float* x, const int x_inc, + const float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_ddot(const int n, + double* dot, + const double* x, const int x_inc, + const double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error 
code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTU +void cblas_cdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_zdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTC +void cblas_cdotc(const int n, + void* dot, + const void* x, const int x_inc, + 
const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_zdotc(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto dot_size = 1; + auto dot_buffer = Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// NRM2 +void cblas_snrm2(const int n, + float* nrm2, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const 
auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dnrm2(const int n, + double* nrm2, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_scnrm2(const int n, + void* nrm2, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dznrm2(const int n, + void* nrm2, + const 
void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto nrm2_size = 1; + auto nrm2_buffer = Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} + +// ASUM +void cblas_sasum(const int n, + float* asum, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_dasum(const int n, + double* asum, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != 
StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_scasum(const int n, + void* asum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_dzasum(const int n, + void* asum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto asum_size = 1; + auto asum_buffer = Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} + +// SUM +void cblas_ssum(const int n, + float* sum, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, 
x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_dsum(const int n, + double* sum, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_scsum(const int n, + void* sum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_dzsum(const int n, + void* sum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); 
+ const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto sum_size = 1; + auto sum_buffer = Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} + +// AMAX +void cblas_isamax(const int n, + float* imax, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_idamax(const int n, + double* imax, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, 
reinterpret_cast(imax)); +} +void cblas_icamax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_izamax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} + +// MAX +void cblas_ismax(const int n, + float* imax, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Max(n, 
+ imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_idmax(const int n, + double* imax, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_icmax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_izmax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imax_size = 1; + auto 
imax_buffer = Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} + +// MIN +void cblas_ismin(const int n, + float* imin, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imin_size = 1; + auto imin_buffer = Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_idmin(const int n, + double* imin, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imin_size = 1; + auto imin_buffer = Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_icmin(const int n, + void* imin, + const void* x, const int 
x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imin_size = 1; + auto imin_buffer = Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_izmin(const int n, + void* imin, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto imin_size = 1; + auto imin_buffer = Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// GEMV +void cblas_sgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const 
auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cgemv(const Layout layout, 
const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zgemv(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + 
x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// GBMV +void cblas_sgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto 
context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + 
&queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zgbmv(const Layout layout, const Transpose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HEMV +void cblas_chemv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const 
auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhemv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HBMV +void 
cblas_chbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, 
reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HPMV +void cblas_chpmv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhpmv(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = 
Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SYMV +void cblas_ssymv(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dsymv(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SBMV +void cblas_ssbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dsbmv(const Layout layout, const Triangle triangle, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SPMV +void cblas_sspmv(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* ap, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto ap_size = n; + auto 
ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dspmv(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* ap, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// TRMV +void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* a, const 
int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + 
const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TBMV +void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = 
Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPMV +void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + 
static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TRSV +void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void 
cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* a, 
const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TBSV +void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = 
Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto a_size = n; + auto a_buffer = Buffer(context, 
a_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPSV +void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + 
x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + 
static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// GER +void cblas_sger(const Layout layout, + const int m, const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dger(const Layout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto 
queue_cl = queue(); + auto s = Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERU +void cblas_cgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgeru(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = 
Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERC +void cblas_cgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgerc(const Layout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HER +void cblas_cher(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + 
const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR +void cblas_chpr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr(static_cast(layout), + 
static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// HER2 +void cblas_cher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); 
+ x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR2 +void cblas_chpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr2(const Layout layout, const Triangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR +void cblas_ssyr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto 
x_buffer = Buffer(context, x_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR +void cblas_sspr(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); 
+ auto s = Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR2 +void cblas_ssyr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR2 +void cblas_sspr2(const Layout layout, const Triangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); + const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr2(const Layout layout, const Triangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = Buffer(context, x_size); + const auto y_size = n; + auto y_buffer = Buffer(context, y_size); 
+ const auto ap_size = n; + auto ap_buffer = Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// GEMM +void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error 
code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + 
const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYMM +void 
cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = 
Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); 
+ const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HEMM +void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + 
b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYRK +void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, 
a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HERK +void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const 
int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYR2K +void cblas_ssyr2k(const Layout layout, const Triangle triangle, 
const Transpose ab_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), 
+ static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = 
Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HER2K +void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + 
b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = beta; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + const auto c_size = n; + auto c_buffer = Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// TRMM +void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto 
b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = 
float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// TRSM +void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int 
n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ctrsm(const Layout layout, const Side side, const 
Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { 
+ throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// OMATCOPY +void cblas_somatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_domatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + 
a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_comatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = Context(device); + auto queue = Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = n; + auto a_buffer = Buffer(context, a_size); + const auto b_size = n; + auto b_buffer = Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); 
+ if (s != StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// ================================================================================================= +} // namespace clblast -- cgit v1.2.3 From 59183b7d79b70d918562d5048e521633d425ca1c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 19:21:49 +0200 Subject: Sets the proper sizes for the buffers for the Netlib CBLAS API --- scripts/generator/generator.py | 127 +++++++---- src/clblast_blas.cpp | 500 ++++++++++++++++++++--------------------- 2 files changed, 331 insertions(+), 296 deletions(-) (limited to 'scripts') diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 4ba97ff8..99edf355 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -59,6 +59,41 @@ bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at leas cld_m = "The value of `c_ld` must be at least `m`." cld_n = "The value of `c_ld` must be at least `n`." + +# Helper functions to compute vector and matrix sizes +def size_helper(condition, size_one, size_two, multiplier): + length = "(" + condition + ")" + " ? 
" + size_one + " * " + multiplier + " : " + size_two + " * " + multiplier + return length + + +def layout_transpose_condition(prefix): + return "(layout == Layout::kColMajor && " + prefix + "_transpose != Transpose::kNo) || " +\ + "(layout == Layout::kRowMajor && " + prefix + "_transpose == Transpose::kNo)" + + +# Different possibilities for the vector and matrix sizes +xn = "n * x_inc" +xm = "m * x_inc" +yn = "n * y_inc" +ym = "m * y_inc" +an = "n * a_ld" +apn = "((n*(n+1)) / 2)" +cn = "n * c_ld" +xmn = size_helper("a_transpose != Transpose::kNo", "m", "n", "x_inc") +ynm = size_helper("a_transpose != Transpose::kNo", "n", "m", "y_inc") +amn = size_helper("layout == Layout::kRowMajor", "m", "n", "a_ld") +amns = size_helper("side == Side::kLeft", "m", "n", "a_ld") +amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld") +ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld") +ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld") +bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld") +bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld") +bmn = size_helper("layout == Layout::kRowMajor", "m", "n", "b_ld") +bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld") +cmn = size_helper("layout == Layout::kRowMajor", "m", "n", "c_ld") +ammn = size_helper("layout == Layout::kRowMajor", "m", "((side == Side::kLeft) ? m : n)", "a_ld") +bmnn = size_helper("layout == Layout::kRowMajor", "((side == Side::kLeft) ? 
m : n)", "n", "b_ld") + # ================================================================================================== # Populates a list of routines @@ -66,63 +101,63 @@ ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"],"", "Apply givens plane rotation", "", []), Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], 
[], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. 
This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. 
This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. 
The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["n","n","n"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and 
represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], ["n","n"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], ["n","n"], [], "", "Solves a packed triangular system of equations", "", []), + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. 
The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], [an,xn,yn], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], [apn,xn,yn], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", 
[]), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [an,xn], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [apn,xn], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation 
as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["n","n"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], 
["n"], ["layout","triangle"], ["x"], ["ap"], ["n","n"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["n","n","n"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["n","n","n"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], [xm,yn,amn], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", 
"hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], [xn,an], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], [xn,apn], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], [xn,yn,an], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], [xn,yn,apn], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), 
], [ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["n","n"], 
["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["n","n","n"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Solves a triangular system of equations", "", []), + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], [amk,bkn,cmn], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general 
rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], [ammn,bmnn,cmn], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], [ank,cn], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a symmetric matrix", 
"Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], [ankab,bnkab,cn],["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], [amns,bmn], ["alpha"], "", "Solves a triangular system of equations", "", []), ], [ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["n","n"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. 
The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 286b1ba8..b5451049 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? 
m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1682,11 +1682,11 @@ void cblas_chemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1718,11 +1718,11 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1756,11 +1756,11 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = 
n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1792,11 +1792,11 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1830,11 +1830,11 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1866,11 +1866,11 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = n; + const auto 
ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1904,11 +1904,11 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1940,11 +1940,11 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1978,11 +1978,11 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = 
Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2014,11 +2014,11 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2052,11 +2052,11 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2088,11 +2088,11 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2121,9 +2121,9 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const 
Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2148,9 +2148,9 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2175,9 +2175,9 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2202,9 +2202,9 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2231,9 +2231,9 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, 
const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2258,9 +2258,9 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2285,9 +2285,9 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2312,9 +2312,9 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2341,9 +2341,9 @@ void cblas_stpmv(const Layout layout, const Triangle 
triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2368,9 +2368,9 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2395,9 +2395,9 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2422,9 +2422,9 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2451,9 
+2451,9 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2478,9 +2478,9 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2505,9 +2505,9 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2532,9 +2532,9 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ 
-2561,9 +2561,9 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2588,9 +2588,9 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2615,9 +2615,9 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2642,9 +2642,9 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); @@ -2671,9 +2671,9 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2698,9 +2698,9 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2725,9 +2725,9 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2752,9 +2752,9 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a auto device = get_device(); auto context = Context(device); auto queue = Queue(context, device); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, 
ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2784,11 +2784,11 @@ void cblas_sger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2816,11 +2816,11 @@ void cblas_dger(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2850,11 +2850,11 @@ void cblas_cgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2882,11 +2882,11 @@ void cblas_zgeru(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2916,11 +2916,11 @@ void cblas_cgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2948,11 +2948,11 @@ void cblas_zgerc(const Layout layout, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = m * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2981,9 +2981,9 @@ void cblas_cher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3009,9 +3009,9 @@ void cblas_zher(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3039,9 +3039,9 @@ void cblas_chpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3067,9 +3067,9 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = 
Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3098,11 +3098,11 @@ void cblas_cher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3131,11 +3131,11 @@ void cblas_zher2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3166,11 +3166,11 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3199,11 +3199,11 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3233,9 +3233,9 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3261,9 +3261,9 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3291,9 +3291,9 @@ void cblas_sspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size 
= n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3319,9 +3319,9 @@ void cblas_dspr(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3350,11 +3350,11 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3383,11 +3383,11 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto a_size = n; + const auto a_size = n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3418,11 +3418,11 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = 
Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3451,11 +3451,11 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = n; + const auto y_size = n * y_inc; auto y_buffer = Buffer(context, y_size); - const auto ap_size = n; + const auto ap_size = ((n*(n+1)) / 2); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = (layout == Layout::kRowMajor) ? m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = n; + const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n; + const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = (layout == Layout::kRowMajor) ? m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = n; + const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = n; + const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); -- cgit v1.2.3 From 926aca53a0de9250a9f7d42026fb54995668dc5b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 19:45:57 +0200 Subject: Made the Netlib CBLAS API use the same enums with prefixes as the regular C API of CLBlast --- include/clblast_blas.h | 215 +++++----- scripts/generator/generator.py | 22 +- scripts/generator/generator/cpp.py | 1 + scripts/generator/generator/routine.py | 2 +- src/clblast_blas.cpp | 720 ++++++++++++++++----------------- 5 files changed, 482 insertions(+), 478 deletions(-) (limited to 'scripts') diff --git a/include/clblast_blas.h b/include/clblast_blas.h index b4db4192..927f84cd 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -36,12 +36,15 @@ extern "C" { // ================================================================================================= // Matrix layout and transpose types -typedef enum Layout_ { kRowMajor = 101, kColMajor = 102 } Layout; -typedef enum Transpose_ { kNo = 111, kYes = 112, kConjugate = 113 } Transpose; -typedef enum Triangle_ { kUpper = 121, kLower = 122 } Triangle; -typedef enum Diagonal_ { kNonUnit = 131, kUnit = 132 } Diagonal; -typedef enum Side_ { kLeft = 141, kRight = 142 } Side; - +typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, + CLBlastLayoutColMajor = 102 } CLBlastLayout; +typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, + CLBlastTransposeConjugate = 113 } CLBlastTranspose; +typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, + CLBlastTriangleLower = 122 } CLBlastTriangle; +typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, + CLBlastDiagonalUnit = 132 } CLBlastDiagonal; +typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; // 
================================================================================================= // BLAS level-1 (vector-vector) routines @@ -270,28 +273,28 @@ void PUBLIC_API cblas_izmin(const int n, // ================================================================================================= // General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV -void PUBLIC_API cblas_sgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_cgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -300,28 +303,28 @@ void PUBLIC_API cblas_zgemv(const Layout layout, const Transpose a_transpose, void* y, const int y_inc); // General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV -void PUBLIC_API cblas_sgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, 
const int n, const int kl, const int ku, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const double alpha, const double* a, const int a_ld, const double* x, const int x_inc, const double beta, double* y, const int y_inc); -void PUBLIC_API cblas_cgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, @@ -330,14 +333,14 @@ void PUBLIC_API cblas_zgbmv(const Layout layout, const Transpose a_transpose, void* y, const int y_inc); // Hermitian matrix-vector multiplication: CHEMV/ZHEMV -void PUBLIC_API cblas_chemv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zhemv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, @@ -346,14 +349,14 @@ void PUBLIC_API cblas_zhemv(const Layout layout, const Triangle triangle, void* y, const int y_inc); // 
Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -void PUBLIC_API cblas_chbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zhbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -362,14 +365,14 @@ void PUBLIC_API cblas_zhbmv(const Layout layout, const Triangle triangle, void* y, const int y_inc); // Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -void PUBLIC_API cblas_chpmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, const void* x, const int x_inc, const void* beta, void* y, const int y_inc); -void PUBLIC_API cblas_zhpmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, @@ -378,14 +381,14 @@ void PUBLIC_API cblas_zhpmv(const Layout layout, const Triangle triangle, void* y, const int y_inc); // Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV -void PUBLIC_API cblas_ssymv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const 
double* a, const int a_ld, @@ -394,14 +397,14 @@ void PUBLIC_API cblas_dsymv(const Layout layout, const Triangle triangle, double* y, const int y_inc); // Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV -void PUBLIC_API cblas_ssbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -410,14 +413,14 @@ void PUBLIC_API cblas_dsbmv(const Layout layout, const Triangle triangle, double* y, const int y_inc); // Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV -void PUBLIC_API cblas_sspmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* ap, const float* x, const int x_inc, const float beta, float* y, const int y_inc); -void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* ap, @@ -426,121 +429,121 @@ void PUBLIC_API cblas_dspmv(const Layout layout, const Triangle triangle, double* y, const int y_inc); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV -void PUBLIC_API cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, 
const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV -void PUBLIC_API cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctbmv(const Layout layout, 
const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV -void PUBLIC_API cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc); -void PUBLIC_API cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc); -void PUBLIC_API cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); -void PUBLIC_API cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const 
CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -void PUBLIC_API cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -void PUBLIC_API cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const 
CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc); -void PUBLIC_API cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc); -void PUBLIC_API cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); -void PUBLIC_API cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -void PUBLIC_API cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc); -void PUBLIC_API cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc); -void 
PUBLIC_API cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); -void PUBLIC_API cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc); // General rank-1 matrix update: SGER/DGER/HGER -void PUBLIC_API cblas_sger(const Layout layout, +void PUBLIC_API cblas_sger(const CLBlastLayout layout, const int m, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld); -void PUBLIC_API cblas_dger(const Layout layout, +void PUBLIC_API cblas_dger(const CLBlastLayout layout, const int m, const int n, const double alpha, const double* x, const int x_inc, @@ -548,13 +551,13 @@ void PUBLIC_API cblas_dger(const Layout layout, double* a, const int a_ld); // General rank-1 complex matrix update: CGERU/ZGERU -void PUBLIC_API cblas_cgeru(const Layout layout, +void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zgeru(const Layout layout, +void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -562,13 +565,13 @@ void PUBLIC_API cblas_zgeru(const Layout layout, void* a, const int a_ld); // General rank-1 complex conjugated matrix update: CGERC/ZGERC -void PUBLIC_API cblas_cgerc(const Layout layout, +void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, 
const int m, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zgerc(const Layout layout, +void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -576,37 +579,37 @@ void PUBLIC_API cblas_zgerc(const Layout layout, void* a, const int a_ld); // Hermitian rank-1 matrix update: CHER/ZHER -void PUBLIC_API cblas_cher(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zher(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* a, const int a_ld); // Hermitian packed rank-1 matrix update: CHPR/ZHPR -void PUBLIC_API cblas_chpr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* ap); -void PUBLIC_API cblas_zhpr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, void* ap); // Hermitian rank-2 matrix update: CHER2/ZHER2 -void PUBLIC_API cblas_cher2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* a, const int a_ld); -void PUBLIC_API cblas_zher2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int 
n, const void* alpha, const void* x, const int x_inc, @@ -614,13 +617,13 @@ void PUBLIC_API cblas_zher2(const Layout layout, const Triangle triangle, void* a, const int a_ld); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -void PUBLIC_API cblas_chpr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, const void* y, const int y_inc, void* ap); -void PUBLIC_API cblas_zhpr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -628,37 +631,37 @@ void PUBLIC_API cblas_zhpr2(const Layout layout, const Triangle triangle, void* ap); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR -void PUBLIC_API cblas_ssyr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* a, const int a_ld); -void PUBLIC_API cblas_dsyr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* a, const int a_ld); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR -void PUBLIC_API cblas_sspr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, float* ap); -void PUBLIC_API cblas_dspr(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, double* ap); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 -void 
PUBLIC_API cblas_ssyr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* a, const int a_ld); -void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -666,13 +669,13 @@ void PUBLIC_API cblas_dsyr2(const Layout layout, const Triangle triangle, double* a, const int a_ld); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 -void PUBLIC_API cblas_sspr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, const float* y, const int y_inc, float* ap); -void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, +void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -684,28 +687,28 @@ void PUBLIC_API cblas_dspr2(const Layout layout, const Triangle triangle, // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -void PUBLIC_API cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API 
cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -714,28 +717,28 @@ void PUBLIC_API cblas_zgemm(const Layout layout, const Transpose a_transpose, co void* c, const int c_ld); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM -void PUBLIC_API cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_csymm(const Layout layout, const Side 
side, const Triangle triangle, +void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -744,14 +747,14 @@ void PUBLIC_API cblas_zsymm(const Layout layout, const Side side, const Triangle void* c, const int c_ld); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -void PUBLIC_API cblas_chemm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, +void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -760,25 +763,25 @@ void PUBLIC_API cblas_zhemm(const Layout layout, const Side side, const Triangle void* c, const int c_ld); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK -void PUBLIC_API cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dsyrk(const Layout layout, 
const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -786,13 +789,13 @@ void PUBLIC_API cblas_zsyrk(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Rank-K update of a hermitian matrix: CHERK/ZHERK -void PUBLIC_API cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -800,28 +803,28 @@ void PUBLIC_API cblas_zherk(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K 
-void PUBLIC_API cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, const float* b, const int b_ld, const float beta, float* c, const int c_ld); -void PUBLIC_API cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, const double* b, const int b_ld, const double beta, double* c, const int c_ld); -void PUBLIC_API cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -830,14 +833,14 @@ void PUBLIC_API cblas_zsyr2k(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -void PUBLIC_API cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, 
const void* beta, void* c, const int c_ld); -void PUBLIC_API cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -846,44 +849,44 @@ void PUBLIC_API cblas_zher2k(const Layout layout, const Triangle triangle, const void* c, const int c_ld); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM -void PUBLIC_API cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); -void PUBLIC_API cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); -void PUBLIC_API cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, 
+void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM -void PUBLIC_API cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); -void PUBLIC_API cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); -void PUBLIC_API cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const 
int n, const void* alpha, const void* a, const int a_ld, @@ -894,22 +897,22 @@ void PUBLIC_API cblas_ztrsm(const Layout layout, const Side side, const Triangle // ================================================================================================= // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY -void PUBLIC_API cblas_somatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, float* b, const int b_ld); -void PUBLIC_API cblas_domatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, double* b, const int b_ld); -void PUBLIC_API cblas_comatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, void* b, const int b_ld); -void PUBLIC_API cblas_zomatcopy(const Layout layout, const Transpose a_transpose, +void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 99edf355..a9169872 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,7 +41,7 @@ FILES = [ "/include/clblast_blas.h", "/src/clblast_blas.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 44, 32] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3] # Different possibilities for requirements @@ -67,8 +67,8 @@ def size_helper(condition, size_one, size_two, 
multiplier): def layout_transpose_condition(prefix): - return "(layout == Layout::kColMajor && " + prefix + "_transpose != Transpose::kNo) || " +\ - "(layout == Layout::kRowMajor && " + prefix + "_transpose == Transpose::kNo)" + return "(layout == CLBlastLayoutColMajor && " + prefix + "_transpose != CLBlastTransposeNo) || " +\ + "(layout == CLBlastLayoutRowMajor && " + prefix + "_transpose == CLBlastTransposeNo)" # Different possibilities for the vector and matrix sizes @@ -79,20 +79,20 @@ ym = "m * y_inc" an = "n * a_ld" apn = "((n*(n+1)) / 2)" cn = "n * c_ld" -xmn = size_helper("a_transpose != Transpose::kNo", "m", "n", "x_inc") -ynm = size_helper("a_transpose != Transpose::kNo", "n", "m", "y_inc") -amn = size_helper("layout == Layout::kRowMajor", "m", "n", "a_ld") -amns = size_helper("side == Side::kLeft", "m", "n", "a_ld") +xmn = size_helper("a_transpose != CLBlastTransposeNo", "m", "n", "x_inc") +ynm = size_helper("a_transpose != CLBlastTransposeNo", "n", "m", "y_inc") +amn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "a_ld") +amns = size_helper("side == CLBlastSideLeft", "m", "n", "a_ld") amk = size_helper(layout_transpose_condition("a"), "m", "k", "a_ld") ank = size_helper(layout_transpose_condition("a"), "n", "k", "a_ld") ankab = size_helper(layout_transpose_condition("ab"), "n", "k", "a_ld") bkn = size_helper(layout_transpose_condition("b"), "k", "n", "b_ld") bnkab = size_helper(layout_transpose_condition("ab"), "n", "k", "b_ld") -bmn = size_helper("layout == Layout::kRowMajor", "m", "n", "b_ld") +bmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "b_ld") bnma = size_helper(layout_transpose_condition("a"), "n", "m", "b_ld") -cmn = size_helper("layout == Layout::kRowMajor", "m", "n", "c_ld") -ammn = size_helper("layout == Layout::kRowMajor", "m", "((side == Side::kLeft) ? m : n)", "a_ld") -bmnn = size_helper("layout == Layout::kRowMajor", "((side == Side::kLeft) ? 
m : n)", "n", "b_ld") +cmn = size_helper("layout == CLBlastLayoutRowMajor", "m", "n", "c_ld") +ammn = size_helper("layout == CLBlastLayoutRowMajor", "m", "((side == CLBlastSideLeft) ? m : n)", "a_ld") +bmnn = size_helper("layout == CLBlastLayoutRowMajor", "((side == CLBlastSideLeft) ? m : n)", "n", "b_ld") # ================================================================================================== diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 23a2207c..eafbea30 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -126,6 +126,7 @@ def clblast_blas_cc(routine): # Copy data structures to the device for i, name in enumerate(routine.inputs + routine.outputs): result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL + for i, name in enumerate(routine.inputs + routine.outputs): result += " " + routine.create_buffer(name, flavour.buffer_type) + NL for name in routine.inputs + routine.outputs: prefix = "" if name in routine.outputs else "const " diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index b988c91a..c35f5b4c 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -548,7 +548,7 @@ class Routine: def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" - return (self.options_def() + self.sizes_def_netlib() + + return (self.options_def_c() + self.sizes_def_netlib() + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index b5451049..974dc21a 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -44,12 +44,12 @@ void cblas_srotg(float* sa, auto context = Context(device); auto queue = Queue(context, device); const auto 
sa_size = 1; - auto sa_buffer = Buffer(context, sa_size); const auto sb_size = 1; - auto sb_buffer = Buffer(context, sb_size); const auto sc_size = 1; - auto sc_buffer = Buffer(context, sc_size); const auto ss_size = 1; + auto sa_buffer = Buffer(context, sa_size); + auto sb_buffer = Buffer(context, sb_size); + auto sc_buffer = Buffer(context, sc_size); auto ss_buffer = Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); @@ -77,12 +77,12 @@ void cblas_drotg(double* sa, auto context = Context(device); auto queue = Queue(context, device); const auto sa_size = 1; - auto sa_buffer = Buffer(context, sa_size); const auto sb_size = 1; - auto sb_buffer = Buffer(context, sb_size); const auto sc_size = 1; - auto sc_buffer = Buffer(context, sc_size); const auto ss_size = 1; + auto sa_buffer = Buffer(context, sa_size); + auto sb_buffer = Buffer(context, sb_size); + auto sc_buffer = Buffer(context, sc_size); auto ss_buffer = Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); @@ -113,14 +113,14 @@ void cblas_srotmg(float* sd1, auto context = Context(device); auto queue = Queue(context, device); const auto sy1_size = 1; - auto sy1_buffer = Buffer(context, sy1_size); const auto sd1_size = 1; - auto sd1_buffer = Buffer(context, sd1_size); const auto sd2_size = 1; - auto sd2_buffer = Buffer(context, sd2_size); const auto sx1_size = 1; - auto sx1_buffer = Buffer(context, sx1_size); const auto sparam_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + auto sd1_buffer = Buffer(context, sd1_size); + auto sd2_buffer = Buffer(context, sd2_size); + auto sx1_buffer = Buffer(context, sx1_size); auto sparam_buffer = Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); @@ -151,14 +151,14 @@ void cblas_drotmg(double* sd1, auto 
context = Context(device); auto queue = Queue(context, device); const auto sy1_size = 1; - auto sy1_buffer = Buffer(context, sy1_size); const auto sd1_size = 1; - auto sd1_buffer = Buffer(context, sd1_size); const auto sd2_size = 1; - auto sd2_buffer = Buffer(context, sd2_size); const auto sx1_size = 1; - auto sx1_buffer = Buffer(context, sx1_size); const auto sparam_size = 1; + auto sy1_buffer = Buffer(context, sy1_size); + auto sd1_buffer = Buffer(context, sd1_size); + auto sd2_buffer = Buffer(context, sd2_size); + auto sx1_buffer = Buffer(context, sx1_size); auto sparam_buffer = Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); @@ -191,8 +191,8 @@ void cblas_srot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -218,8 +218,8 @@ void cblas_drot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -246,10 +246,10 @@ void cblas_srotm(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto sparam_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto sparam_buffer = Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -275,10 +275,10 @@ void cblas_drotm(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto sparam_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto sparam_buffer = Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -305,8 +305,8 @@ void cblas_sswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -328,8 +328,8 @@ void cblas_dswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -351,8 +351,8 @@ void cblas_cswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -374,8 +374,8 @@ void cblas_zswap(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = 
Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -481,8 +481,8 @@ void cblas_scopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -503,8 +503,8 @@ void cblas_dcopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -525,8 +525,8 @@ void cblas_ccopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -547,8 +547,8 @@ void cblas_zcopy(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -573,8 +573,8 @@ void cblas_saxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, 
x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -598,8 +598,8 @@ void cblas_daxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -623,8 +623,8 @@ void cblas_caxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -648,8 +648,8 @@ void cblas_zaxpy(const int n, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -674,10 +674,10 @@ void cblas_sdot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -701,10 +701,10 @@ void 
cblas_ddot(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -730,10 +730,10 @@ void cblas_cdotu(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -757,10 +757,10 @@ void cblas_zdotu(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -786,10 +786,10 @@ void cblas_cdotc(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, 
y_size, reinterpret_cast(y)); @@ -813,10 +813,10 @@ void cblas_zdotc(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto y_size = n; - auto y_buffer = Buffer(context, y_size); const auto dot_size = 1; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto dot_buffer = Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -841,8 +841,8 @@ void cblas_snrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -863,8 +863,8 @@ void cblas_dnrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -885,8 +885,8 @@ void cblas_scnrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -907,8 +907,8 @@ void cblas_dznrm2(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto nrm2_size = 1; + 
auto x_buffer = Buffer(context, x_size); auto nrm2_buffer = Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); @@ -931,8 +931,8 @@ void cblas_sasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -953,8 +953,8 @@ void cblas_dasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -975,8 +975,8 @@ void cblas_scasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -997,8 +997,8 @@ void cblas_dzasum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto asum_size = 1; + auto x_buffer = Buffer(context, x_size); auto asum_buffer = Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); @@ -1021,8 +1021,8 @@ void cblas_ssum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = 
n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1043,8 +1043,8 @@ void cblas_dsum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1065,8 +1065,8 @@ void cblas_scsum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1087,8 +1087,8 @@ void cblas_dzsum(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto sum_size = 1; + auto x_buffer = Buffer(context, x_size); auto sum_buffer = Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); @@ -1111,8 +1111,8 @@ void cblas_isamax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1133,8 +1133,8 @@ void cblas_idamax(const int n, auto context = Context(device); 
auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1155,8 +1155,8 @@ void cblas_icamax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1177,8 +1177,8 @@ void cblas_izamax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1201,8 +1201,8 @@ void cblas_ismax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1223,8 +1223,8 @@ void cblas_idmax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); 
@@ -1245,8 +1245,8 @@ void cblas_icmax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1267,8 +1267,8 @@ void cblas_izmax(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imax_size = 1; + auto x_buffer = Buffer(context, x_size); auto imax_buffer = Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); @@ -1291,8 +1291,8 @@ void cblas_ismin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1313,8 +1313,8 @@ void cblas_idmin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1335,8 +1335,8 @@ void cblas_icmin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1357,8 +1357,8 @@ void cblas_izmin(const int n, auto context = Context(device); auto queue = Queue(context, device); const auto x_size = n; - auto x_buffer = Buffer(context, x_size); const auto imin_size = 1; + auto x_buffer = Buffer(context, x_size); auto imin_buffer = Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); @@ -1378,7 +1378,7 @@ void cblas_izmin(const int n, // ================================================================================================= // GEMV -void cblas_sgemv(const Layout layout, const Transpose a_transpose, +void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -1390,11 +1390,11 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1414,7 +1414,7 @@ void cblas_sgemv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dgemv(const Layout layout, const Transpose a_transpose, +void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -1426,11 +1426,11 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1450,7 +1450,7 @@ void cblas_dgemv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_cgemv(const Layout layout, const Transpose a_transpose, +void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -1462,11 +1462,11 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1486,7 +1486,7 @@ void cblas_cgemv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zgemv(const Layout layout, const Transpose a_transpose, +void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -1498,11 +1498,11 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1524,7 +1524,7 @@ void cblas_zgemv(const Layout layout, const Transpose a_transpose, } // GBMV -void cblas_sgbmv(const Layout layout, const Transpose a_transpose, +void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const float alpha, const float* a, const int a_ld, @@ -1536,11 +1536,11 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1560,7 +1560,7 @@ void cblas_sgbmv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dgbmv(const Layout layout, const Transpose a_transpose, +void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const double alpha, const double* a, const int a_ld, @@ -1572,11 +1572,11 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1596,7 +1596,7 @@ void cblas_dgbmv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_cgbmv(const Layout layout, const Transpose a_transpose, +void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, @@ -1608,11 +1608,11 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1632,7 +1632,7 @@ void cblas_cgbmv(const Layout layout, const Transpose a_transpose, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zgbmv(const Layout layout, const Transpose a_transpose, +void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const int kl, const int ku, const void* alpha, const void* a, const int a_ld, @@ -1644,11 +1644,11 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; auto a_buffer = Buffer(context, a_size); - const auto x_size = (a_transpose != Transpose::kNo) ? m * x_inc : n * x_inc; auto x_buffer = Buffer(context, x_size); - const auto y_size = (a_transpose != Transpose::kNo) ? 
n * y_inc : m * y_inc; auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1670,7 +1670,7 @@ void cblas_zgbmv(const Layout layout, const Transpose a_transpose, } // HEMV -void cblas_chemv(const Layout layout, const Triangle triangle, +void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, @@ -1683,10 +1683,10 @@ void cblas_chemv(const Layout layout, const Triangle triangle, const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1706,7 +1706,7 @@ void cblas_chemv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zhemv(const Layout layout, const Triangle triangle, +void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* a, const int a_ld, @@ -1719,10 +1719,10 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, 
x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1744,7 +1744,7 @@ void cblas_zhemv(const Layout layout, const Triangle triangle, } // HBMV -void cblas_chbmv(const Layout layout, const Triangle triangle, +void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -1757,10 +1757,10 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1780,7 +1780,7 @@ void cblas_chbmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zhbmv(const Layout layout, const Triangle triangle, +void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -1793,10 +1793,10 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = 
Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1818,7 +1818,7 @@ void cblas_zhbmv(const Layout layout, const Triangle triangle, } // HPMV -void cblas_chpmv(const Layout layout, const Triangle triangle, +void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, @@ -1831,10 +1831,10 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1854,7 +1854,7 @@ void cblas_chpmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_zhpmv(const Layout layout, const Triangle triangle, +void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* ap, @@ -1867,10 +1867,10 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, 
x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1892,7 +1892,7 @@ void cblas_zhpmv(const Layout layout, const Triangle triangle, } // SYMV -void cblas_ssymv(const Layout layout, const Triangle triangle, +void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* a, const int a_ld, @@ -1905,10 +1905,10 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1928,7 +1928,7 @@ void cblas_ssymv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dsymv(const Layout layout, const Triangle triangle, +void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* a, const int a_ld, @@ -1941,10 +1941,10 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -1966,7 +1966,7 @@ void cblas_dsymv(const Layout layout, const Triangle triangle, 
} // SBMV -void cblas_ssbmv(const Layout layout, const Triangle triangle, +void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -1979,10 +1979,10 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2002,7 +2002,7 @@ void cblas_ssbmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dsbmv(const Layout layout, const Triangle triangle, +void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -2015,10 +2015,10 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto a_buffer = Buffer(context, a_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2040,7 +2040,7 @@ void cblas_dsbmv(const Layout layout, const Triangle triangle, } // SPMV -void cblas_sspmv(const Layout layout, const Triangle triangle, +void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* ap, @@ -2053,10 
+2053,10 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2076,7 +2076,7 @@ void cblas_sspmv(const Layout layout, const Triangle triangle, } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } -void cblas_dspmv(const Layout layout, const Triangle triangle, +void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* ap, @@ -2089,10 +2089,10 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2114,7 +2114,7 @@ void cblas_dspmv(const Layout layout, const Triangle triangle, } // TRMV -void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2122,8 +2122,8 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a auto 
context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2141,7 +2141,7 @@ void cblas_strmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2149,8 +2149,8 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2168,7 +2168,7 @@ void cblas_dtrmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2176,8 +2176,8 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto 
a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2195,7 +2195,7 @@ void cblas_ctrmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2203,8 +2203,8 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2224,7 +2224,7 @@ void cblas_ztrmv(const Layout layout, const Triangle triangle, const Transpose a } // TBMV -void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2232,8 +2232,8 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = 
Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2251,7 +2251,7 @@ void cblas_stbmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2259,8 +2259,8 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2278,7 +2278,7 @@ void cblas_dtbmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2286,8 +2286,8 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = 
Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2305,7 +2305,7 @@ void cblas_ctbmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2313,8 +2313,8 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2334,7 +2334,7 @@ void cblas_ztbmv(const Layout layout, const Triangle triangle, const Transpose a } // TPMV -void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc) { @@ -2342,8 +2342,8 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); @@ -2361,7 +2361,7 @@ void cblas_stpmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc) { @@ -2369,8 +2369,8 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2388,7 +2388,7 @@ void cblas_dtpmv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2396,8 +2396,8 @@ void cblas_ctpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2415,7 +2415,7 @@ void cblas_ctpmv(const Layout 
layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2423,8 +2423,8 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2444,7 +2444,7 @@ void cblas_ztpmv(const Layout layout, const Triangle triangle, const Transpose a } // TRSV -void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2452,8 +2452,8 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2471,7 +2471,7 @@ void cblas_strsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void 
cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2479,8 +2479,8 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2498,7 +2498,7 @@ void cblas_dtrsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2506,8 +2506,8 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2525,7 +2525,7 @@ void cblas_ctrsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose 
a_transpose, const Diagonal diagonal, +void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2533,8 +2533,8 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2554,7 +2554,7 @@ void cblas_ztrsv(const Layout layout, const Triangle triangle, const Transpose a } // TBSV -void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const float* a, const int a_ld, float* x, const int x_inc) { @@ -2562,8 +2562,8 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2581,7 +2581,7 @@ void cblas_stbsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle 
triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const double* a, const int a_ld, double* x, const int x_inc) { @@ -2589,8 +2589,8 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2608,7 +2608,7 @@ void cblas_dtbsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2616,8 +2616,8 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2635,7 +2635,7 @@ void cblas_ctbsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const 
CLBlastDiagonal diagonal, const int n, const int k, const void* a, const int a_ld, void* x, const int x_inc) { @@ -2643,8 +2643,8 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto a_size = n * a_ld; - auto a_buffer = Buffer(context, a_size); const auto x_size = n * x_inc; + auto a_buffer = Buffer(context, a_size); auto x_buffer = Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2664,7 +2664,7 @@ void cblas_ztbsv(const Layout layout, const Triangle triangle, const Transpose a } // TPSV -void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const float* ap, float* x, const int x_inc) { @@ -2672,8 +2672,8 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2691,7 +2691,7 @@ void cblas_stpsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const double* ap, double* x, const int x_inc) { @@ -2699,8 +2699,8 @@ void 
cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2718,7 +2718,7 @@ void cblas_dtpsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2726,8 +2726,8 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a auto context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2745,7 +2745,7 @@ void cblas_ctpsv(const Layout layout, const Triangle triangle, const Transpose a } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } -void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int n, const void* ap, void* x, const int x_inc) { @@ -2753,8 +2753,8 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a auto 
context = Context(device); auto queue = Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); - auto ap_buffer = Buffer(context, ap_size); const auto x_size = n * x_inc; + auto ap_buffer = Buffer(context, ap_size); auto x_buffer = Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -2774,7 +2774,7 @@ void cblas_ztpsv(const Layout layout, const Triangle triangle, const Transpose a } // GER -void cblas_sger(const Layout layout, +void cblas_sger(const CLBlastLayout layout, const int m, const int n, const float alpha, const float* x, const int x_inc, @@ -2785,10 +2785,10 @@ void cblas_sger(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2806,7 +2806,7 @@ void cblas_sger(const Layout layout, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_dger(const Layout layout, +void cblas_dger(const CLBlastLayout layout, const int m, const int n, const double alpha, const double* x, const int x_inc, @@ -2817,10 +2817,10 @@ void cblas_dger(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2840,7 +2840,7 @@ void cblas_dger(const Layout layout, } // GERU -void cblas_cgeru(const Layout layout, +void cblas_cgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2851,10 +2851,10 @@ void cblas_cgeru(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2872,7 +2872,7 @@ void cblas_cgeru(const Layout layout, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zgeru(const Layout layout, +void cblas_zgeru(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2883,10 +2883,10 @@ void cblas_zgeru(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2906,7 +2906,7 @@ void cblas_zgeru(const Layout layout, } // GERC -void cblas_cgerc(const Layout layout, +void cblas_cgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2917,10 +2917,10 @@ void cblas_cgerc(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2938,7 +2938,7 @@ void cblas_cgerc(const Layout layout, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zgerc(const Layout layout, +void cblas_zgerc(const CLBlastLayout layout, const int m, const int n, const void* alpha, const void* x, const int x_inc, @@ -2949,10 +2949,10 @@ void cblas_zgerc(const Layout layout, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + auto x_buffer = Buffer(context, x_size); auto y_buffer = Buffer(context, y_size); - const auto a_size = (layout == Layout::kRowMajor) ? 
m * a_ld : n * a_ld; auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -2972,7 +2972,7 @@ void cblas_zgerc(const Layout layout, } // HER -void cblas_cher(const Layout layout, const Triangle triangle, +void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -2982,8 +2982,8 @@ void cblas_cher(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3000,7 +3000,7 @@ void cblas_cher(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zher(const Layout layout, const Triangle triangle, +void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3010,8 +3010,8 @@ void cblas_zher(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3030,7 +3030,7 @@ void cblas_zher(const Layout layout, const Triangle triangle, } // HPR -void cblas_chpr(const Layout layout, const Triangle triangle, +void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3040,8 +3040,8 @@ void 
cblas_chpr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3058,7 +3058,7 @@ void cblas_chpr(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_zhpr(const Layout layout, const Triangle triangle, +void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3068,8 +3068,8 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3088,7 +3088,7 @@ void cblas_zhpr(const Layout layout, const Triangle triangle, } // HER2 -void cblas_cher2(const Layout layout, const Triangle triangle, +void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3099,10 +3099,10 @@ void cblas_cher2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, 
y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3121,7 +3121,7 @@ void cblas_cher2(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_zher2(const Layout layout, const Triangle triangle, +void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3132,10 +3132,10 @@ void cblas_zher2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3156,7 +3156,7 @@ void cblas_zher2(const Layout layout, const Triangle triangle, } // HPR2 -void cblas_chpr2(const Layout layout, const Triangle triangle, +void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3167,10 +3167,10 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3189,7 +3189,7 @@ void cblas_chpr2(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_zhpr2(const Layout layout, const Triangle triangle, +void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const void* alpha, const void* x, const int x_inc, @@ -3200,10 +3200,10 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3224,7 +3224,7 @@ void cblas_zhpr2(const Layout layout, const Triangle triangle, } // SYR -void cblas_ssyr(const Layout layout, const Triangle triangle, +void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3234,8 +3234,8 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3252,7 +3252,7 @@ void cblas_ssyr(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_dsyr(const Layout layout, const Triangle triangle, +void cblas_dsyr(const 
CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3262,8 +3262,8 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); @@ -3282,7 +3282,7 @@ void cblas_dsyr(const Layout layout, const Triangle triangle, } // SPR -void cblas_sspr(const Layout layout, const Triangle triangle, +void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3292,8 +3292,8 @@ void cblas_sspr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3310,7 +3310,7 @@ void cblas_sspr(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_dspr(const Layout layout, const Triangle triangle, +void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3320,8 +3320,8 @@ void cblas_dspr(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); auto ap_buffer = 
Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); @@ -3340,7 +3340,7 @@ void cblas_dspr(const Layout layout, const Triangle triangle, } // SYR2 -void cblas_ssyr2(const Layout layout, const Triangle triangle, +void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3351,10 +3351,10 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3373,7 +3373,7 @@ void cblas_ssyr2(const Layout layout, const Triangle triangle, } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } -void cblas_dsyr2(const Layout layout, const Triangle triangle, +void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3384,10 +3384,10 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto a_size = n * a_ld; + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto a_buffer = Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3408,7 +3408,7 @@ void cblas_dsyr2(const Layout layout, const Triangle triangle, } // 
SPR2 -void cblas_sspr2(const Layout layout, const Triangle triangle, +void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const float alpha, const float* x, const int x_inc, @@ -3419,10 +3419,10 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3441,7 +3441,7 @@ void cblas_sspr2(const Layout layout, const Triangle triangle, } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } -void cblas_dspr2(const Layout layout, const Triangle triangle, +void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, const double alpha, const double* x, const int x_inc, @@ -3452,10 +3452,10 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; - auto x_buffer = Buffer(context, x_size); const auto y_size = n * y_inc; - auto y_buffer = Buffer(context, y_size); const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = Buffer(context, x_size); + auto y_buffer = Buffer(context, y_size); auto ap_buffer = Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); @@ -3480,7 +3480,7 @@ void cblas_dspr2(const Layout layout, const Triangle triangle, // ================================================================================================= // GEMM -void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpose 
b_transpose, +void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -3492,11 +3492,11 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3517,7 +3517,7 @@ void cblas_sgemm(const Layout layout, const Transpose a_transpose, const Transpo } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -3529,11 +3529,11 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3554,7 +3554,7 @@ void cblas_dgemm(const Layout layout, const Transpose a_transpose, const Transpo } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3566,11 +3566,11 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3591,7 +3591,7 @@ void cblas_cgemm(const Layout layout, const Transpose a_transpose, const Transpo } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, +void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const int m, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3603,11 +3603,11 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? m * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo)) ? k * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3630,7 +3630,7 @@ void cblas_zgemm(const Layout layout, const Transpose a_transpose, const Transpo } // SYMM -void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -3642,11 +3642,11 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3667,7 +3667,7 @@ void cblas_ssymm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -3679,11 +3679,11 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3704,7 +3704,7 @@ void cblas_dsymm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3716,11 +3716,11 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3741,7 +3741,7 @@ void cblas_csymm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, +void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3753,11 +3753,11 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3780,7 +3780,7 @@ void cblas_zsymm(const Layout layout, const Side side, const Triangle triangle, } // HEMM -void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, +void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3792,11 +3792,11 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3817,7 +3817,7 @@ void cblas_chemm(const Layout layout, const Side side, const Triangle triangle, } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, +void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -3829,11 +3829,11 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : ((side == Side::kLeft) ? m : n) * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? ((side == Side::kLeft) ? m : n) * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = (layout == Layout::kRowMajor) ? 
m * c_ld : n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -3856,7 +3856,7 @@ void cblas_zhemm(const Layout layout, const Side side, const Triangle triangle, } // SYRK -void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -3867,9 +3867,9 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3888,7 +3888,7 @@ void cblas_ssyrk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -3899,9 +3899,9 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3920,7 +3920,7 @@ void cblas_dsyrk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3931,9 +3931,9 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3952,7 +3952,7 @@ void cblas_csyrk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3963,9 +3963,9 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -3986,7 +3986,7 @@ void cblas_zsyrk(const Layout layout, const Triangle triangle, const Transpose a } // HERK -void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -3997,9 +3997,9 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4018,7 +4018,7 @@ void cblas_cherk(const Layout layout, const Triangle triangle, const Transpose a } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a_transpose, +void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4029,9 +4029,9 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; - auto a_buffer = Buffer(context, a_size); + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; + auto a_buffer = Buffer(context, a_size); auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); @@ -4052,7 +4052,7 @@ void cblas_zherk(const Layout layout, const Triangle triangle, const Transpose a } // SYR2K -void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const float alpha, const float* a, const int a_ld, @@ -4064,11 +4064,11 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; + const auto c_size = n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4089,7 +4089,7 @@ void cblas_ssyr2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const double alpha, const double* a, const int a_ld, @@ -4101,11 +4101,11 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; + const auto c_size = n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4126,7 +4126,7 @@ void cblas_dsyr2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4138,11 +4138,11 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; + const auto c_size = n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4163,7 +4163,7 @@ void cblas_csyr2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4175,11 +4175,11 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; + const auto c_size = n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4202,7 +4202,7 @@ void cblas_zsyr2k(const Layout layout, const Triangle triangle, const Transpose } // HER2K -void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4214,11 +4214,11 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; + const auto c_size = n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4239,7 +4239,7 @@ void cblas_cher2k(const Layout layout, const Triangle triangle, const Transpose } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } -void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, +void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, @@ -4251,11 +4251,11 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; - const auto a_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? n * a_ld : k * a_ld; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; + const auto c_size = n * c_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && ab_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && ab_transpose == Transpose::kNo)) ? 
n * b_ld : k * b_ld; auto b_buffer = Buffer(context, b_size); - const auto c_size = n * c_ld; auto c_buffer = Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4278,7 +4278,7 @@ void cblas_zher2k(const Layout layout, const Triangle triangle, const Transpose } // TRMM -void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -4287,9 +4287,9 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4309,7 +4309,7 @@ void cblas_strmm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -4318,9 +4318,9 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4340,7 +4340,7 @@ void cblas_dtrmm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4349,9 +4349,9 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4371,7 +4371,7 @@ void cblas_ctrmm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4380,9 +4380,9 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4404,7 +4404,7 @@ void cblas_ztrmm(const Layout layout, const Side side, const Triangle triangle, } // TRSM -void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -4413,9 +4413,9 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4435,7 +4435,7 @@ void cblas_strsm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -4444,9 +4444,9 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4466,7 +4466,7 @@ void cblas_dtrsm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4475,9 +4475,9 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4497,7 +4497,7 @@ void cblas_ctrsm(const Layout layout, const Side side, const Triangle triangle, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, +void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4506,9 +4506,9 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == Side::kLeft) ? m * a_ld : n * a_ld; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = (layout == Layout::kRowMajor) ? 
m * b_ld : n * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4534,7 +4534,7 @@ void cblas_ztrsm(const Layout layout, const Side side, const Triangle triangle, // ================================================================================================= // OMATCOPY -void cblas_somatcopy(const Layout layout, const Transpose a_transpose, +void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const float alpha, const float* a, const int a_ld, @@ -4543,9 +4543,9 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4562,7 +4562,7 @@ void cblas_somatcopy(const Layout layout, const Transpose a_transpose, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_domatcopy(const Layout layout, const Transpose a_transpose, +void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const double alpha, const double* a, const int a_ld, @@ -4571,9 +4571,9 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = alpha; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4590,7 +4590,7 @@ void cblas_domatcopy(const Layout layout, const Transpose a_transpose, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_comatcopy(const Layout layout, const Transpose a_transpose, +void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4599,9 +4599,9 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); @@ -4618,7 +4618,7 @@ void cblas_comatcopy(const Layout layout, const Transpose a_transpose, } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } -void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, +void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const int m, const int n, const void* alpha, const void* a, const int a_ld, @@ -4627,9 +4627,9 @@ void cblas_zomatcopy(const Layout layout, const Transpose a_transpose, auto context = Context(device); auto queue = Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (layout == Layout::kRowMajor) ? m * a_ld : n * a_ld; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? n * b_ld : m * b_ld; auto a_buffer = Buffer(context, a_size); - const auto b_size = ((layout == Layout::kColMajor && a_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && a_transpose == Transpose::kNo)) ? 
n * b_ld : m * b_ld; auto b_buffer = Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); -- cgit v1.2.3 From 729862e87338dbd275f90d61d52803892fe3648e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 19:56:42 +0200 Subject: Fixed some issues preventing the Netlib CBLAS API from linking correctly --- include/clblast_blas.h | 20 ++++++++++---------- scripts/generator/generator/routine.py | 4 ++-- src/clblast_blas.cpp | 20 ++++++++++---------- src/utilities/utilities.cpp | 4 ++++ 4 files changed, 26 insertions(+), 22 deletions(-) (limited to 'scripts') diff --git a/include/clblast_blas.h b/include/clblast_blas.h index 927f84cd..ff560712 100644 --- a/include/clblast_blas.h +++ b/include/clblast_blas.h @@ -581,24 +581,24 @@ void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, // Hermitian rank-1 matrix update: CHER/ZHER void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* a, const int a_ld); void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* a, const int a_ld); // Hermitian packed rank-1 matrix update: CHPR/ZHPR void PUBLIC_API cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* ap); void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* ap); @@ -791,15 +791,15 @@ void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle tr // Rank-K update of a hermitian matrix: CHERK/ZHERK void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const 
CLBlastTranspose a_transpose, const int n, const int k, - const void* alpha, + const float alpha, const void* a, const int a_ld, - const void* beta, + const float beta, void* c, const int c_ld); void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, - const void* alpha, + const double alpha, const void* a, const int a_ld, - const void* beta, + const double beta, void* c, const int c_ld); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K @@ -838,14 +838,14 @@ void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle t const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const float beta, void* c, const int c_ld); void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, const int n, const int k, const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const double beta, void* c, const int c_ld); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index c35f5b4c..085845a8 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -341,9 +341,9 @@ class Routine: """Retrieves the definition of a scalar (alpha/beta) but make it a void pointer in case of non-standard types""" if name in self.scalars: if name == "alpha": - data_type = "void*" if flavour.is_non_standard() else flavour.alpha_cpp + data_type = "void*" if flavour.is_complex("alpha") else flavour.alpha_cpp return ["const " + data_type + " " + name] - data_type = "void*" if flavour.is_non_standard() else flavour.beta_cpp + data_type = "void*" if flavour.is_complex("beta") else flavour.beta_cpp return ["const " + data_type + " " + name] return [] diff --git 
a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 974dc21a..9b59a20d 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -2974,7 +2974,7 @@ void cblas_zgerc(const CLBlastLayout layout, // HER void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); @@ -3002,7 +3002,7 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, } void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); @@ -3032,7 +3032,7 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, // HPR void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const float alpha, const void* x, const int x_inc, void* ap) { auto device = get_device(); @@ -3060,7 +3060,7 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, } void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const int n, - const void* alpha, + const double alpha, const void* x, const int x_inc, void* ap) { auto device = get_device(); @@ -3988,9 +3988,9 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con // HERK void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const int n, const int k, - const void* alpha, + const float alpha, const void* a, const int a_ld, - const void* beta, + const float beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); @@ -4020,9 +4020,9 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con } void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const 
CLBlastTranspose a_transpose, const int n, const int k, - const void* alpha, + const double alpha, const void* a, const int a_ld, - const void* beta, + const double beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); @@ -4207,7 +4207,7 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const float beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); @@ -4244,7 +4244,7 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* alpha, const void* a, const int a_ld, const void* b, const int b_ld, - const void* beta, + const double beta, void* c, const int c_ld) { auto device = get_device(); auto context = Context(device); diff --git a/src/utilities/utilities.cpp b/src/utilities/utilities.cpp index b4a18311..24456252 100644 --- a/src/utilities/utilities.cpp +++ b/src/utilities/utilities.cpp @@ -151,6 +151,10 @@ std::string ToString(Precision value) { case Precision::kComplexDouble: return ToString(static_cast(value))+" (complex-double)"; } } +template <> +std::string ToString(StatusCode value) { + return std::to_string(static_cast(value)); +} // ================================================================================================= -- cgit v1.2.3 From 140121ef91cc13892711f57da0d046f88cf55301 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 20:21:50 +0200 Subject: Removed the clblast namespace from the Netlib C API source file to ensure proper linking --- CMakeLists.txt | 2 +- scripts/generator/generator.py | 2 +- scripts/generator/generator/cpp.py | 12 +- scripts/generator/generator/routine.py | 2 +- src/clblast_blas.cpp | 4181 ++++++++++++++++---------------- 5 files changed, 2099 insertions(+), 2100 deletions(-) (limited to 'scripts') diff --git a/CMakeLists.txt b/CMakeLists.txt index 
d2034617..1fff1a3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,7 +163,6 @@ set(PRECISIONS 32 64 3232 6464 16) # Gathers all source-files set(SOURCES - src/clblast_blas.cpp src/database/database.cpp src/routines/common.cpp src/utilities/clblast_exceptions.cpp @@ -171,6 +170,7 @@ set(SOURCES src/cache.cpp src/clblast.cpp src/clblast_c.cpp + src/clblast_blas.cpp src/routine.cpp ) foreach(ROUTINE ${LEVEL1_ROUTINES}) diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index a9169872..65d40877 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -42,7 +42,7 @@ FILES = [ "/src/clblast_blas.cpp", ] HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] -FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 3] +FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index eafbea30..60e29a07 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -112,13 +112,13 @@ def clblast_blas_cc(routine): # There is a version available in CBLAS if flavour.precision_name in ["S", "D", "C", "Z"]: template = "<" + flavour.template + ">" if routine.no_scalars() else "" - indent = " " * (12 + routine.length() + len(template)) + indent = " " * (21 + routine.length() + len(template)) result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL # Initialize OpenCL result += " auto device = get_device();" + NL - result += " auto context = Context(device);" + NL - result += " auto queue = Queue(context, device);" + NL + result += " auto context = clblast::Context(device);" + NL + result += " auto queue = clblast::Queue(context, device);" + NL # Set alpha and beta result += "".join(" " + s + NL for s in routine.scalar_create_cpp(flavour)) @@ -134,13 +134,13 @@ def clblast_blas_cc(routine): # The function call result += " auto queue_cl = 
queue();" + NL - result += " auto s = " + routine.name.capitalize() + template + "(" + result += " auto s = clblast::" + routine.name.capitalize() + template + "(" result += ("," + NL + indent).join([a for a in routine.arguments_netlib(flavour, indent)]) result += "," + NL + indent + "&queue_cl);" + NL # Error handling - result += " if (s != StatusCode::kSuccess) {" + NL - result += " throw std::runtime_error(\"CLBlast returned with error code \" + ToString(s));" + NL + result += " if (s != clblast::StatusCode::kSuccess) {" + NL + result += " throw std::runtime_error(\"CLBlast returned with error code \" + clblast::ToString(s));" + NL result += " }" + NL # Copy back and clean-up diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 085845a8..097376ad 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -75,7 +75,7 @@ class Routine: @staticmethod def create_buffer(name, template): """Creates a new CLCudaAPI buffer""" - return "auto " + name + "_buffer = Buffer<" + template + ">(context, " + name + "_size);" + return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);" @staticmethod def write_buffer(name, template): diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp index 9b59a20d..6cc14583 100644 --- a/src/clblast_blas.cpp +++ b/src/clblast_blas.cpp @@ -19,16 +19,16 @@ #include "clblast.h" #include "utilities/utilities.hpp" -namespace clblast { - -// ================================================================================================= +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; // Helper function to get a default OpenCL platform and device -Device get_device() { - auto platform_id = ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); - auto device_id = ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); - auto platform = 
Platform(platform_id); - return Device(platform, device_id); +clblast::Device get_device() { + auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); + auto device_id = clblast::ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); + auto platform = clblast::Platform(platform_id); + return clblast::Device(platform, device_id); } // ================================================================================================= @@ -41,28 +41,28 @@ void cblas_srotg(float* sa, float* sc, float* ss) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; const auto sc_size = 1; const auto ss_size = 1; - auto sa_buffer = Buffer(context, sa_size); - auto sb_buffer = Buffer(context, sb_size); - auto sc_buffer = Buffer(context, sc_size); - auto ss_buffer = Buffer(context, ss_size); + auto sa_buffer = clblast::Buffer(context, sa_size); + auto sb_buffer = clblast::Buffer(context, sb_size); + auto sc_buffer = clblast::Buffer(context, sc_size); + auto ss_buffer = clblast::Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); auto queue_cl = queue(); - auto s = Rotg(sa_buffer(), 0, - sb_buffer(), 0, - sc_buffer(), 0, - ss_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); 
sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); @@ -74,28 +74,28 @@ void cblas_drotg(double* sa, double* sc, double* ss) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sa_size = 1; const auto sb_size = 1; const auto sc_size = 1; const auto ss_size = 1; - auto sa_buffer = Buffer(context, sa_size); - auto sb_buffer = Buffer(context, sb_size); - auto sc_buffer = Buffer(context, sc_size); - auto ss_buffer = Buffer(context, ss_size); + auto sa_buffer = clblast::Buffer(context, sa_size); + auto sb_buffer = clblast::Buffer(context, sb_size); + auto sc_buffer = clblast::Buffer(context, sc_size); + auto ss_buffer = clblast::Buffer(context, ss_size); sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); auto queue_cl = queue(); - auto s = Rotg(sa_buffer(), 0, - sb_buffer(), 0, - sc_buffer(), 0, - ss_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); @@ -110,32 +110,32 @@ void cblas_srotmg(float* sd1, const float* sy1, float* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; const auto sd2_size = 1; const auto sx1_size = 
1; const auto sparam_size = 1; - auto sy1_buffer = Buffer(context, sy1_size); - auto sd1_buffer = Buffer(context, sd1_size); - auto sd2_buffer = Buffer(context, sd2_size); - auto sx1_buffer = Buffer(context, sx1_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto sy1_buffer = clblast::Buffer(context, sy1_size); + auto sd1_buffer = clblast::Buffer(context, sd1_size); + auto sd2_buffer = clblast::Buffer(context, sd2_size); + auto sx1_buffer = clblast::Buffer(context, sx1_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotmg(sd1_buffer(), 0, - sd2_buffer(), 0, - sx1_buffer(), 0, - sy1_buffer(), 0, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); @@ -148,32 +148,32 @@ void cblas_drotmg(double* sd1, const double* sy1, double* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto sy1_size = 1; const auto sd1_size = 1; const auto sd2_size = 1; const auto sx1_size = 1; const auto sparam_size = 1; - auto sy1_buffer = Buffer(context, sy1_size); - auto sd1_buffer = Buffer(context, 
sd1_size); - auto sd2_buffer = Buffer(context, sd2_size); - auto sx1_buffer = Buffer(context, sx1_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto sy1_buffer = clblast::Buffer(context, sy1_size); + auto sd1_buffer = clblast::Buffer(context, sd1_size); + auto sd2_buffer = clblast::Buffer(context, sd2_size); + auto sx1_buffer = clblast::Buffer(context, sx1_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotmg(sd1_buffer(), 0, - sd2_buffer(), 0, - sx1_buffer(), 0, - sy1_buffer(), 0, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); @@ -188,23 +188,23 @@ void cblas_srot(const int n, const float cos, const float sin) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, 
reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Rot(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - cos, - sin, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -215,23 +215,23 @@ void cblas_drot(const int n, const double cos, const double sin) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Rot(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - cos, - sin, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -243,25 +243,25 @@ void cblas_srotm(const int n, float* y, const int y_inc, float* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context 
= clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto sparam_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotm(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -272,25 +272,25 @@ void cblas_drotm(const int n, double* y, const int y_inc, double* sparam) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto sparam_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto sparam_buffer = Buffer(context, sparam_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); 
sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); auto queue_cl = queue(); - auto s = Rotm(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - sparam_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -302,21 +302,21 @@ void cblas_sswap(const int n, float* x, const int x_inc, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -325,21 +325,21 @@ void cblas_dswap(const int n, double* x, const int x_inc, double* y, const int y_inc) { auto device = get_device(); - auto context = 
Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -348,21 +348,21 @@ void cblas_cswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != 
clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -371,21 +371,21 @@ void cblas_zswap(const int n, void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); y_buffer.Read(queue, y_size, reinterpret_cast(y)); @@ -396,19 +396,19 @@ void cblas_sscal(const int n, const float alpha, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - 
if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -416,19 +416,19 @@ void cblas_dscal(const int n, const double alpha, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -436,19 +436,19 @@ void cblas_cscal(const int n, const void* alpha, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s 
!= StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -456,19 +456,19 @@ void cblas_zscal(const int n, const void* alpha, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; - auto x_buffer = Buffer(context, x_size); + auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -478,21 +478,21 @@ void cblas_scopy(const int n, const float* x, const int x_inc, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); 
y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -500,21 +500,21 @@ void cblas_dcopy(const int n, const double* x, const int x_inc, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -522,21 +522,21 @@ void cblas_ccopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); 
const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -544,21 +544,21 @@ void cblas_zcopy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -569,23 +569,23 @@ 
void cblas_saxpy(const int n, const float* x, const int x_inc, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -594,23 +594,23 @@ void cblas_daxpy(const int n, const double* x, const int x_inc, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != 
StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -619,23 +619,23 @@ void cblas_caxpy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -644,23 +644,23 @@ void cblas_zaxpy(const int n, const void* x, const int x_inc, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = 
double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n; const auto y_size = n; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -671,25 +671,25 @@ void cblas_sdot(const int n, const float* x, const int x_inc, const float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dot(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + 
ToString(s)); + auto s = clblast::Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -698,25 +698,25 @@ void cblas_ddot(const int n, const double* x, const int x_inc, const double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dot(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -727,25 +727,25 @@ void cblas_cdotu(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, 
device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotu(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -754,25 +754,25 @@ void cblas_zdotu(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotu(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, 
- y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -783,25 +783,25 @@ void cblas_cdotc(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotc(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -810,25 +810,25 @@ void cblas_zdotc(const int n, const void* x, const int x_inc, const void* y, const int y_inc) { auto device = get_device(); - auto context = 
Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto y_size = n; const auto dot_size = 1; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto dot_buffer = Buffer(context, dot_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); - auto s = Dotc(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); } @@ -838,21 +838,21 @@ void cblas_snrm2(const int n, float* nrm2, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != 
StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -860,21 +860,21 @@ void cblas_dnrm2(const int n, double* nrm2, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -882,21 +882,21 @@ void cblas_scnrm2(const int n, void* nrm2, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = 
clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -904,21 +904,21 @@ void cblas_dznrm2(const int n, void* nrm2, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto nrm2_size = 1; - auto x_buffer = Buffer(context, x_size); - auto nrm2_buffer = Buffer(context, nrm2_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); - auto s = Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); } @@ -928,21 +928,21 @@ void cblas_sasum(const int n, float* asum, const float* x, const int x_inc) { auto device = get_device(); - auto context 
= Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); - auto s = Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -950,21 +950,21 @@ void cblas_dasum(const int n, double* asum, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); - auto s = Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != 
clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -972,21 +972,21 @@ void cblas_scasum(const int n, void* asum, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); - auto s = Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -994,21 +994,21 @@ void cblas_dzasum(const int n, void* asum, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto asum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto asum_buffer = Buffer(context, asum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto 
queue_cl = queue(); - auto s = Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); } @@ -1018,21 +1018,21 @@ void cblas_ssum(const int n, float* sum, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto sum_buffer = Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1040,21 +1040,21 @@ void cblas_dsum(const int n, double* sum, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = 
Buffer(context, x_size); - auto sum_buffer = Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1062,21 +1062,21 @@ void cblas_scsum(const int n, void* sum, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto sum_buffer = Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1084,21 +1084,21 @@ void cblas_dzsum(const int n, void* sum, const void* x, 
const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto sum_size = 1; - auto x_buffer = Buffer(context, x_size); - auto sum_buffer = Buffer(context, sum_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); - auto s = Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); } @@ -1108,21 +1108,21 @@ void cblas_isamax(const int n, float* imax, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, 
x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1130,21 +1130,21 @@ void cblas_idamax(const int n, double* imax, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1152,21 +1152,21 @@ void cblas_icamax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, 
imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1174,21 +1174,21 @@ void cblas_izamax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1198,21 +1198,21 @@ void cblas_ismax(const int n, float* imax, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto 
x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1220,21 +1220,21 @@ void cblas_idmax(const int n, double* imax, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, 
reinterpret_cast(imax)); } @@ -1242,21 +1242,21 @@ void cblas_icmax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1264,21 +1264,21 @@ void cblas_izmax(const int n, void* imax, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imax_buffer = Buffer(context, imax_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); - auto s = Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); } @@ -1288,21 +1288,21 @@ void cblas_ismin(const int n, float* imin, const float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1310,21 +1310,21 @@ void cblas_idmin(const int n, double* imin, const double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + 
auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1332,21 +1332,21 @@ void cblas_icmin(const int n, void* imin, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1354,21 +1354,21 @@ void cblas_izmin(const int n, void* imin, const void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = 
Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; - auto x_buffer = Buffer(context, x_size); - auto imin_buffer = Buffer(context, imin_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); - auto s = Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); } @@ -1386,31 +1386,31 @@ void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1422,31 +1422,31 @@ void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1458,31 +1458,31 @@ void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1494,31 +1494,31 @@ void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1532,31 +1532,31 @@ void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1568,31 +1568,31 @@ void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1604,31 +1604,31 @@ void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1640,31 +1640,31 @@ void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1678,31 +1678,31 @@ void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = 
clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hemv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1714,31 +1714,31 @@ void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = 
queue(); - auto s = Hemv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1752,31 +1752,31 @@ void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast 
returned with error code " + ToString(s)); + auto s = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1788,31 +1788,31 @@ void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != 
clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1826,31 +1826,31 @@ void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hpmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1862,31 +1862,31 @@ void cblas_zhpmv(const CLBlastLayout layout, const 
CLBlastTriangle triangle, const void* beta, void* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Hpmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1900,31 +1900,31 @@ void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, 
device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Symv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1936,31 +1936,31 @@ void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = 
clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Symv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -1974,31 +1974,31 @@ void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - 
x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2010,31 +2010,31 @@ void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double beta, double* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = n * a_ld; const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + 
&queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2048,31 +2048,31 @@ void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const float beta, float* y, const int y_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2084,31 +2084,31 @@ void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const double beta, double* y, const int y_inc) { auto device = 
get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; const auto y_size = n * y_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); auto queue_cl = queue(); - auto s = Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } y_buffer.Read(queue, y_size, reinterpret_cast(y)); } @@ -2119,25 +2119,25 @@ void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = 
clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2146,25 +2146,25 @@ void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + 
static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2173,25 +2173,25 @@ void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2200,25 +2200,25 @@ void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = 
clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2229,25 +2229,25 @@ void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - 
a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2256,25 +2256,25 @@ void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2283,25 +2283,25 @@ void 
cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2310,25 +2310,25 @@ void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); 
a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2339,25 +2339,25 @@ void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* ap, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, 
x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2366,25 +2366,25 @@ void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* ap, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2393,25 +2393,25 @@ void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto 
x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2420,25 +2420,25 @@ void cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned 
with error code " + ToString(s)); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2449,25 +2449,25 @@ void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2476,25 +2476,25 @@ void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = 
get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2503,25 +2503,25 @@ void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = 
Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2530,25 +2530,25 @@ void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + 
clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2559,25 +2559,25 @@ void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* a, const int a_ld, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2586,25 +2586,25 @@ void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* a, const int a_ld, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, 
x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2613,25 +2613,25 @@ void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), 
+ static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2640,25 +2640,25 @@ void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* a, const int a_ld, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto a_size = n * a_ld; const auto x_size = n * x_inc; - auto a_buffer = Buffer(context, a_size); - auto x_buffer = Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2669,25 +2669,25 @@ void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float* ap, float* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); 
+ auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2696,25 +2696,25 @@ void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double* ap, double* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - 
static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2723,25 +2723,25 @@ void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2750,25 +2750,25 @@ void 
cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* ap, void* x, const int x_inc) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto ap_size = ((n*(n+1)) / 2); const auto x_size = n * x_inc; - auto ap_buffer = Buffer(context, ap_size); - auto x_buffer = Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); - auto s = Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } x_buffer.Read(queue, x_size, reinterpret_cast(x)); } @@ -2781,28 +2781,28 @@ void cblas_sger(const CLBlastLayout layout, const float* y, const int y_inc, float* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Ger(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2813,28 +2813,28 @@ void cblas_dger(const CLBlastLayout layout, const double* y, const int y_inc, double* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Ger(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2847,28 +2847,28 @@ void cblas_cgeru(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Geru(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2879,28 +2879,28 @@ void cblas_zgeru(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Geru(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2913,28 +2913,28 @@ void cblas_cgerc(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Gerc(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2945,28 +2945,28 @@ void cblas_zgerc(const CLBlastLayout layout, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = m * x_inc; const auto y_size = n * y_inc; const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Gerc(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -2978,25 +2978,25 @@ void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != 
StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3006,25 +3006,25 @@ void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3036,25 +3036,25 @@ void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - 
auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3064,25 +3064,25 @@ void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* x, const int x_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, 
- ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3095,29 +3095,29 @@ void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3128,29 +3128,29 @@ void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Her2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3163,29 +3163,29 @@ void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = 
Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3196,29 +3196,29 @@ void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const void* y, const int y_inc, void* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = 
Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Hpr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3230,25 +3230,25 @@ void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, float* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw 
std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3258,25 +3258,25 @@ void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, double* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3288,25 +3288,25 @@ void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* x, const int x_inc, float* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, 
device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3316,25 +3316,25 @@ void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* x, const int x_inc, double* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - 
&queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3347,29 +3347,29 @@ void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* y, const int y_inc, float* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } 
a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3380,29 +3380,29 @@ void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* y, const int y_inc, double* a, const int a_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto a_size = n * a_ld; - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto a_buffer = Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); a_buffer.Write(queue, a_size, reinterpret_cast(a)); auto queue_cl = queue(); - auto s = Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } a_buffer.Read(queue, a_size, reinterpret_cast(a)); } @@ -3415,29 +3415,29 @@ void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const float* y, const int y_inc, float* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = 
alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3448,29 +3448,29 @@ void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, const double* y, const int y_inc, double* ap) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto x_size = n * x_inc; const auto y_size = n * y_inc; const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = Buffer(context, x_size); - auto y_buffer = Buffer(context, y_size); - auto ap_buffer = Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); x_buffer.Write(queue, 
x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); auto queue_cl = queue(); - auto s = Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); } @@ -3488,32 +3488,32 @@ void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3525,32 +3525,32 @@ void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? 
k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3562,32 +3562,32 @@ void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3599,32 +3599,32 @@ void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], 
reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3638,32 +3638,32 @@ void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = 
clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3675,32 +3675,32 @@ void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3712,32 +3712,32 @@ void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3749,32 +3749,32 @@ void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3788,32 +3788,32 @@ void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3825,32 +3825,32 @@ void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = (layout == 
CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; const auto c_size = (layout == CLBlastLayoutRowMajor) ? m * c_ld : n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3863,28 +3863,28 @@ void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == 
CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3895,28 +3895,28 @@ void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3927,28 +3927,28 @@ void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3959,28 +3959,28 @@ void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -3993,28 +3993,28 @@ void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const float beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4025,28 +4025,28 @@ void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, con const double beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4060,32 +4060,32 @@ void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const float beta, float* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4097,32 +4097,32 @@ void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const double beta, double* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4134,32 +4134,32 @@ void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == 
CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4171,32 +4171,32 @@ void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const void* beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], 
reinterpret_cast(alpha)[1]}; const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4210,32 +4210,32 @@ void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const float beta, void* c, const int c_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = 
clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4247,32 +4247,32 @@ void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, co const double beta, void* c, const int c_ld) { auto device = get_device(); - auto context = 
Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto beta_cpp = beta; const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * b_ld : k * b_ld; const auto c_size = n * c_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); - auto c_buffer = Buffer(context, c_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); c_buffer.Write(queue, c_size, reinterpret_cast(c)); auto queue_cl = queue(); - auto s = Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } c_buffer.Read(queue, c_size, reinterpret_cast(c)); } @@ -4284,28 +4284,28 @@ void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float* a, const int 
a_ld, float* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4315,28 +4315,28 @@ void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double* a, const int a_ld, double* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4346,28 +4346,28 @@ void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4377,28 +4377,28 @@ void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4410,28 +4410,28 @@ void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const float* a, const int a_ld, float* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4441,28 +4441,28 @@ void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const double* a, const int a_ld, double* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4472,28 +4472,28 @@ void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4503,28 +4503,28 @@ void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBla const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4540,25 +4540,25 @@ void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const float* a, const int a_ld, float* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4568,25 +4568,25 @@ void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const double* a, const int a_ld, double* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4596,25 +4596,25 @@ void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } @@ -4624,28 +4624,27 @@ void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transp const void* a, const int a_ld, void* b, const int b_ld) { auto device = get_device(); - auto context = Context(device); - auto queue = Queue(context, device); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = Buffer(context, a_size); - auto b_buffer = Buffer(context, b_size); + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); a_buffer.Write(queue, a_size, reinterpret_cast(a)); b_buffer.Write(queue, b_size, reinterpret_cast(b)); auto queue_cl = queue(); - auto s = Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + ToString(s)); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } b_buffer.Read(queue, b_size, reinterpret_cast(b)); } // ================================================================================================= -} // namespace clblast -- cgit v1.2.3 From 8ae8ab06a2b6f24faa0de5d390a5ae272aa94c23 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 25 Oct 2016 20:33:10 +0200 Subject: Renamed the include and source files of the Netlib CBLAS API --- CMakeLists.txt | 4 +- include/clblast_blas.h | 928 ------- include/clblast_netlib_c.h | 928 +++++++ scripts/generator/generator.py | 12 +- scripts/generator/generator/cpp.py | 4 +- src/clblast_blas.cpp | 4650 ------------------------------------ src/clblast_netlib_c.cpp | 4650 ++++++++++++++++++++++++++++++++++++ 7 files changed, 5588 insertions(+), 5588 deletions(-) delete mode 100644 include/clblast_blas.h create mode 100644 include/clblast_netlib_c.h delete mode 100644 src/clblast_blas.cpp create mode 100644 src/clblast_netlib_c.cpp (limited to 'scripts') diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fff1a3a..aa1e287e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,7 +170,7 @@ 
set(SOURCES src/cache.cpp src/clblast.cpp src/clblast_c.cpp - src/clblast_blas.cpp + src/clblast_netlib_c.cpp src/routine.cpp ) foreach(ROUTINE ${LEVEL1_ROUTINES}) @@ -214,7 +214,7 @@ install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) -install(FILES include/clblast_blas.h DESTINATION include) +install(FILES include/clblast_netlib_c.h DESTINATION include) # Installs the config for find_package in dependent projects install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) diff --git a/include/clblast_blas.h b/include/clblast_blas.h deleted file mode 100644 index ff560712..00000000 --- a/include/clblast_blas.h +++ /dev/null @@ -1,928 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer -// copies automatically and running on the default OpenCL platform and device. For full control over -// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. -// -// ================================================================================================= - -#ifndef CLBLAST_CLBLAST_BLAS_H_ -#define CLBLAST_CLBLAST_BLAS_H_ - -// Exports library functions under Windows when building a DLL. 
See also: -// https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#if defined(_WIN32) && defined(CLBLAST_DLL) - #if defined(COMPILING_DLL) - #define PUBLIC_API __declspec(dllexport) - #else - #define PUBLIC_API __declspec(dllimport) - #endif -#else - #define PUBLIC_API -#endif - -// The C interface -#ifdef __cplusplus -extern "C" { -#endif - -// ================================================================================================= - -// Matrix layout and transpose types -typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, - CLBlastLayoutColMajor = 102 } CLBlastLayout; -typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, - CLBlastTransposeConjugate = 113 } CLBlastTranspose; -typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, - CLBlastTriangleLower = 122 } CLBlastTriangle; -typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, - CLBlastDiagonalUnit = 132 } CLBlastDiagonal; -typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; - -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// ================================================================================================= - -// Generate givens plane rotation: SROTG/DROTG -void PUBLIC_API cblas_srotg(float* sa, - float* sb, - float* sc, - float* ss); -void PUBLIC_API cblas_drotg(double* sa, - double* sb, - double* sc, - double* ss); - -// Generate modified givens plane rotation: SROTMG/DROTMG -void PUBLIC_API cblas_srotmg(float* sd1, - float* sd2, - float* sx1, - const float* sy1, - float* sparam); -void PUBLIC_API cblas_drotmg(double* sd1, - double* sd2, - double* sx1, - const double* sy1, - double* sparam); - -// Apply givens plane rotation: SROT/DROT -void PUBLIC_API cblas_srot(const int n, - float* x, const int x_inc, - float* y, const int y_inc, - const float cos, - const float sin); -void PUBLIC_API 
cblas_drot(const int n, - double* x, const int x_inc, - double* y, const int y_inc, - const double cos, - const double sin); - -// Apply modified givens plane rotation: SROTM/DROTM -void PUBLIC_API cblas_srotm(const int n, - float* x, const int x_inc, - float* y, const int y_inc, - float* sparam); -void PUBLIC_API cblas_drotm(const int n, - double* x, const int x_inc, - double* y, const int y_inc, - double* sparam); - -// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP -void PUBLIC_API cblas_sswap(const int n, - float* x, const int x_inc, - float* y, const int y_inc); -void PUBLIC_API cblas_dswap(const int n, - double* x, const int x_inc, - double* y, const int y_inc); -void PUBLIC_API cblas_cswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc); -void PUBLIC_API cblas_zswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc); - -// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL -void PUBLIC_API cblas_sscal(const int n, - const float alpha, - float* x, const int x_inc); -void PUBLIC_API cblas_dscal(const int n, - const double alpha, - double* x, const int x_inc); -void PUBLIC_API cblas_cscal(const int n, - const void* alpha, - void* x, const int x_inc); -void PUBLIC_API cblas_zscal(const int n, - const void* alpha, - void* x, const int x_inc); - -// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY -void PUBLIC_API cblas_scopy(const int n, - const float* x, const int x_inc, - float* y, const int y_inc); -void PUBLIC_API cblas_dcopy(const int n, - const double* x, const int x_inc, - double* y, const int y_inc); -void PUBLIC_API cblas_ccopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc); -void PUBLIC_API cblas_zcopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc); - -// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY -void PUBLIC_API cblas_saxpy(const int n, - const float alpha, - const float* x, const int x_inc, - float* y, const int y_inc); -void PUBLIC_API 
cblas_daxpy(const int n, - const double alpha, - const double* x, const int x_inc, - double* y, const int y_inc); -void PUBLIC_API cblas_caxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - void* y, const int y_inc); -void PUBLIC_API cblas_zaxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - void* y, const int y_inc); - -// Dot product of two vectors: SDOT/DDOT/HDOT -void PUBLIC_API cblas_sdot(const int n, - float* dot, - const float* x, const int x_inc, - const float* y, const int y_inc); -void PUBLIC_API cblas_ddot(const int n, - double* dot, - const double* x, const int x_inc, - const double* y, const int y_inc); - -// Dot product of two complex vectors: CDOTU/ZDOTU -void PUBLIC_API cblas_cdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); -void PUBLIC_API cblas_zdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); - -// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -void PUBLIC_API cblas_cdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); -void PUBLIC_API cblas_zdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); - -// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 -void PUBLIC_API cblas_snrm2(const int n, - float* nrm2, - const float* x, const int x_inc); -void PUBLIC_API cblas_dnrm2(const int n, - double* nrm2, - const double* x, const int x_inc); -void PUBLIC_API cblas_scnrm2(const int n, - void* nrm2, - const void* x, const int x_inc); -void PUBLIC_API cblas_dznrm2(const int n, - void* nrm2, - const void* x, const int x_inc); - -// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM -void PUBLIC_API cblas_sasum(const int n, - float* asum, - const float* x, const int x_inc); -void PUBLIC_API cblas_dasum(const int n, - double* asum, - const double* x, const int x_inc); -void 
PUBLIC_API cblas_scasum(const int n, - void* asum, - const void* x, const int x_inc); -void PUBLIC_API cblas_dzasum(const int n, - void* asum, - const void* x, const int x_inc); - -// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM -void PUBLIC_API cblas_ssum(const int n, - float* sum, - const float* x, const int x_inc); -void PUBLIC_API cblas_dsum(const int n, - double* sum, - const double* x, const int x_inc); -void PUBLIC_API cblas_scsum(const int n, - void* sum, - const void* x, const int x_inc); -void PUBLIC_API cblas_dzsum(const int n, - void* sum, - const void* x, const int x_inc); - -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX -void PUBLIC_API cblas_isamax(const int n, - float* imax, - const float* x, const int x_inc); -void PUBLIC_API cblas_idamax(const int n, - double* imax, - const double* x, const int x_inc); -void PUBLIC_API cblas_icamax(const int n, - void* imax, - const void* x, const int x_inc); -void PUBLIC_API cblas_izamax(const int n, - void* imax, - const void* x, const int x_inc); - -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX -void PUBLIC_API cblas_ismax(const int n, - float* imax, - const float* x, const int x_inc); -void PUBLIC_API cblas_idmax(const int n, - double* imax, - const double* x, const int x_inc); -void PUBLIC_API cblas_icmax(const int n, - void* imax, - const void* x, const int x_inc); -void PUBLIC_API cblas_izmax(const int n, - void* imax, - const void* x, const int x_inc); - -// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN -void PUBLIC_API cblas_ismin(const int n, - float* imin, - const float* x, const int x_inc); -void PUBLIC_API cblas_idmin(const int n, - double* imin, - const double* x, const int x_inc); -void PUBLIC_API cblas_icmin(const int n, - void* imin, - const void* x, const int x_inc); -void PUBLIC_API cblas_izmin(const int n, - void* imin, - const void* x, const int 
x_inc); - -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV -void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); -void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV -void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double 
beta, - double* y, const int y_inc); -void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Hermitian matrix-vector multiplication: CHEMV/ZHEMV -void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV -void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); -void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV -void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, 
const int y_inc); -void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc); - -// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV -void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); - -// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV -void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); - -// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV -void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* ap, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc); -void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* ap, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc); - -// Triangular matrix-vector multiplication: 
STRMV/DTRMV/CTRMV/ZTRMV/HTRMV -void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); - -// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV -void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const 
void* a, const int a_ld, - void* x, const int x_inc); - -// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV -void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* ap, - float* x, const int x_inc); -void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* ap, - double* x, const int x_inc); -void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); -void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); - -// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV -void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int 
a_ld, - void* x, const int x_inc); - -// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV -void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const float* a, const int a_ld, - float* x, const int x_inc); -void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const double* a, const int a_ld, - double* x, const int x_inc); -void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); -void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc); - -// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV -void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* ap, - float* x, const int x_inc); -void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* ap, - double* x, const int x_inc); -void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc); -void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal 
diagonal, - const int n, - const void* ap, - void* x, const int x_inc); - -// General rank-1 matrix update: SGER/DGER/HGER -void PUBLIC_API cblas_sger(const CLBlastLayout layout, - const int m, const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* a, const int a_ld); -void PUBLIC_API cblas_dger(const CLBlastLayout layout, - const int m, const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* a, const int a_ld); - -// General rank-1 complex matrix update: CGERU/ZGERU -void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); - -// General rank-1 complex conjugated matrix update: CGERC/ZGERC -void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); - -// Hermitian rank-1 matrix update: CHER/ZHER -void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const void* x, const int x_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const void* x, const int x_inc, - void* a, const int a_ld); - -// Hermitian packed rank-1 matrix update: CHPR/ZHPR -void PUBLIC_API cblas_chpr(const CLBlastLayout layout, 
const CLBlastTriangle triangle, - const int n, - const float alpha, - const void* x, const int x_inc, - void* ap); -void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const void* x, const int x_inc, - void* ap); - -// Hermitian rank-2 matrix update: CHER2/ZHER2 -void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); -void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld); - -// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 -void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap); -void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap); - -// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR -void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - float* a, const int a_ld); -void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - double* a, const int a_ld); - -// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR -void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - float* ap); -void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle 
triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - double* ap); - -// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 -void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* a, const int a_ld); -void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* a, const int a_ld); - -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 -void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* ap); -void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* ap); - -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// ================================================================================================= - -// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM -void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double 
beta, - double* c, const int c_ld); -void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM -void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld); -void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM -void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle 
triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK -void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double beta, - double* c, const int c_ld); -void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld); - -// Rank-K update of a hermitian matrix: CHERK/ZHERK -void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const float alpha, - const void* a, const int a_ld, - const float beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, 
const int k, - const double alpha, - const void* a, const int a_ld, - const double beta, - void* c, const int c_ld); - -// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K -void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld); -void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld); -void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld); - -// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K -void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const float beta, - void* c, const int c_ld); -void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const double beta, - void* c, const int 
c_ld); - -// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM -void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld); -void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld); -void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); -void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - -// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM -void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld); -void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld); -void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide 
side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); -void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================= - -// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY -void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld); -void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld); -void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); -void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld); - -// ================================================================================================= - -#ifdef __cplusplus -} // extern "C" -#endif - -// CLBLAST_CLBLAST_BLAS_H_ -#endif diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h new file mode 100644 index 
00000000..c233646e --- /dev/null +++ b/include/clblast_netlib_c.h @@ -0,0 +1,928 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS interface to the CLBlast BLAS routines, performing all buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. +// +// ================================================================================================= + +#ifndef CLBLAST_CLBLAST_NETLIB_C_H_ +#define CLBLAST_CLBLAST_NETLIB_C_H_ + +// Exports library functions under Windows when building a DLL. 
See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#if defined(_WIN32) && defined(CLBLAST_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + +// The C interface +#ifdef __cplusplus +extern "C" { +#endif + +// ================================================================================================= + +// Matrix layout and transpose types +typedef enum CLBlastLayout_ { CLBlastLayoutRowMajor = 101, + CLBlastLayoutColMajor = 102 } CLBlastLayout; +typedef enum CLBlastTranspose_ { CLBlastTransposeNo = 111, CLBlastTransposeYes = 112, + CLBlastTransposeConjugate = 113 } CLBlastTranspose; +typedef enum CLBlastTriangle_ { CLBlastTriangleUpper = 121, + CLBlastTriangleLower = 122 } CLBlastTriangle; +typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, + CLBlastDiagonalUnit = 132 } CLBlastDiagonal; +typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Generate givens plane rotation: SROTG/DROTG +void PUBLIC_API cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss); +void PUBLIC_API cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss); + +// Generate modified givens plane rotation: SROTMG/DROTMG +void PUBLIC_API cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float* sy1, + float* sparam); +void PUBLIC_API cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double* sy1, + double* sparam); + +// Apply givens plane rotation: SROT/DROT +void PUBLIC_API cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin); +void PUBLIC_API 
cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin); + +// Apply modified givens plane rotation: SROTM/DROTM +void PUBLIC_API cblas_srotm(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + float* sparam); +void PUBLIC_API cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam); + +// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP +void PUBLIC_API cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_cswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc); + +// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL +void PUBLIC_API cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc); +void PUBLIC_API cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc); +void PUBLIC_API cblas_cscal(const int n, + const void* alpha, + void* x, const int x_inc); +void PUBLIC_API cblas_zscal(const int n, + const void* alpha, + void* x, const int x_inc); + +// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY +void PUBLIC_API cblas_scopy(const int n, + const float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API cblas_dcopy(const int n, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_ccopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zcopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc); + +// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY +void PUBLIC_API cblas_saxpy(const int n, + const float alpha, + const float* x, const int x_inc, + float* y, const int y_inc); +void PUBLIC_API 
cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc); +void PUBLIC_API cblas_caxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc); +void PUBLIC_API cblas_zaxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc); + +// Dot product of two vectors: SDOT/DDOT/HDOT +void PUBLIC_API cblas_sdot(const int n, + float* dot, + const float* x, const int x_inc, + const float* y, const int y_inc); +void PUBLIC_API cblas_ddot(const int n, + double* dot, + const double* x, const int x_inc, + const double* y, const int y_inc); + +// Dot product of two complex vectors: CDOTU/ZDOTU +void PUBLIC_API cblas_cdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); +void PUBLIC_API cblas_zdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); + +// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC +void PUBLIC_API cblas_cdotc(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); +void PUBLIC_API cblas_zdotc(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc); + +// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 +void PUBLIC_API cblas_snrm2(const int n, + float* nrm2, + const float* x, const int x_inc); +void PUBLIC_API cblas_dnrm2(const int n, + double* nrm2, + const double* x, const int x_inc); +void PUBLIC_API cblas_scnrm2(const int n, + void* nrm2, + const void* x, const int x_inc); +void PUBLIC_API cblas_dznrm2(const int n, + void* nrm2, + const void* x, const int x_inc); + +// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM +void PUBLIC_API cblas_sasum(const int n, + float* asum, + const float* x, const int x_inc); +void PUBLIC_API cblas_dasum(const int n, + double* asum, + const double* x, const int x_inc); +void
PUBLIC_API cblas_scasum(const int n, + void* asum, + const void* x, const int x_inc); +void PUBLIC_API cblas_dzasum(const int n, + void* asum, + const void* x, const int x_inc); + +// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM +void PUBLIC_API cblas_ssum(const int n, + float* sum, + const float* x, const int x_inc); +void PUBLIC_API cblas_dsum(const int n, + double* sum, + const double* x, const int x_inc); +void PUBLIC_API cblas_scsum(const int n, + void* sum, + const void* x, const int x_inc); +void PUBLIC_API cblas_dzsum(const int n, + void* sum, + const void* x, const int x_inc); + +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +void PUBLIC_API cblas_isamax(const int n, + float* imax, + const float* x, const int x_inc); +void PUBLIC_API cblas_idamax(const int n, + double* imax, + const double* x, const int x_inc); +void PUBLIC_API cblas_icamax(const int n, + void* imax, + const void* x, const int x_inc); +void PUBLIC_API cblas_izamax(const int n, + void* imax, + const void* x, const int x_inc); + +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +void PUBLIC_API cblas_ismax(const int n, + float* imax, + const float* x, const int x_inc); +void PUBLIC_API cblas_idmax(const int n, + double* imax, + const double* x, const int x_inc); +void PUBLIC_API cblas_icmax(const int n, + void* imax, + const void* x, const int x_inc); +void PUBLIC_API cblas_izmax(const int n, + void* imax, + const void* x, const int x_inc); + +// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN +void PUBLIC_API cblas_ismin(const int n, + float* imin, + const float* x, const int x_inc); +void PUBLIC_API cblas_idmin(const int n, + double* imin, + const double* x, const int x_inc); +void PUBLIC_API cblas_icmin(const int n, + void* imin, + const void* x, const int x_inc); +void PUBLIC_API cblas_izmin(const int n, + void* imin, + const void* x, const int 
x_inc); + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV +void PUBLIC_API cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV +void PUBLIC_API cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double 
beta, + double* y, const int y_inc); +void PUBLIC_API cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian matrix-vector multiplication: CHEMV/ZHEMV +void PUBLIC_API cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV +void PUBLIC_API cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); +void PUBLIC_API cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV +void PUBLIC_API cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, 
const int y_inc); +void PUBLIC_API cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc); + +// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV +void PUBLIC_API cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV +void PUBLIC_API cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV +void PUBLIC_API cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* ap, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc); +void PUBLIC_API cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* ap, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc); + +// Triangular matrix-vector multiplication: 
STRMV/DTRMV/CTRMV/ZTRMV/HTRMV +void PUBLIC_API cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV +void PUBLIC_API cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const 
void* a, const int a_ld, + void* x, const int x_inc); + +// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV +void PUBLIC_API cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); +void PUBLIC_API cblas_ztpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); + +// Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV +void PUBLIC_API cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int 
a_ld, + void* x, const int x_inc); + +// Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV +void PUBLIC_API cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc); +void PUBLIC_API cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc); +void PUBLIC_API cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); +void PUBLIC_API cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc); + +// Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV +void PUBLIC_API cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc); +void PUBLIC_API cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc); +void PUBLIC_API cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc); +void PUBLIC_API cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal 
diagonal, + const int n, + const void* ap, + void* x, const int x_inc); + +// General rank-1 matrix update: SGER/DGER/HGER +void PUBLIC_API cblas_sger(const CLBlastLayout layout, + const int m, const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dger(const CLBlastLayout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); + +// General rank-1 complex matrix update: CGERU/ZGERU +void PUBLIC_API cblas_cgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// General rank-1 complex conjugated matrix update: CGERC/ZGERC +void PUBLIC_API cblas_cgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// Hermitian rank-1 matrix update: CHER/ZHER +void PUBLIC_API cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* a, const int a_ld); + +// Hermitian packed rank-1 matrix update: CHPR/ZHPR +void PUBLIC_API cblas_chpr(const CLBlastLayout layout, 
const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* ap); +void PUBLIC_API cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* ap); + +// Hermitian rank-2 matrix update: CHER2/ZHER2 +void PUBLIC_API cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); +void PUBLIC_API cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld); + +// Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 +void PUBLIC_API cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); +void PUBLIC_API cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap); + +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR +void PUBLIC_API cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld); + +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR +void PUBLIC_API cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap); +void PUBLIC_API cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle 
triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap); + +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 +void PUBLIC_API cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld); +void PUBLIC_API cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld); + +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 +void PUBLIC_API cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap); +void PUBLIC_API cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap); + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM +void PUBLIC_API cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double 
beta, + double* c, const int c_ld); +void PUBLIC_API cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM +void PUBLIC_API cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Hermitian matrix-matrix multiplication: CHEMM/ZHEMM +void PUBLIC_API cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle 
triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK +void PUBLIC_API cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-K update of a hermitian matrix: CHERK/ZHERK +void PUBLIC_API cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const void* a, const int a_ld, + const float beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, 
const int k, + const double alpha, + const void* a, const int a_ld, + const double beta, + void* c, const int c_ld); + +// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K +void PUBLIC_API cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld); +void PUBLIC_API cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld); +void PUBLIC_API cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld); + +// Rank-2K update of a hermitian matrix: CHER2K/ZHER2K +void PUBLIC_API cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const float beta, + void* c, const int c_ld); +void PUBLIC_API cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const double beta, + void* c, const int 
c_ld); + +// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM +void PUBLIC_API cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); + +// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM +void PUBLIC_API cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide 
side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY +void PUBLIC_API cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld); +void PUBLIC_API cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld); +void PUBLIC_API cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); +void PUBLIC_API cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld); + +// ================================================================================================= + +#ifdef __cplusplus +} // extern "C" +#endif + +// CLBLAST_CLBLAST_NETLIB_C_H_ +#endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 65d40877..1a467340 
100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -12,8 +12,8 @@ # clblast.cpp # clblast_c.h # clblast_c.cpp -# clblast_blas.h -# clblast_blas.cpp +# clblast_netlib_c.h +# clblast_netlib_c.cpp # wrapper_clblas.h # wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in @@ -38,8 +38,8 @@ FILES = [ "/src/clblast_c.cpp", "/test/wrapper_clblas.hpp", "/test/wrapper_cblas.hpp", - "/include/clblast_blas.h", - "/src/clblast_blas.cpp", + "/include/clblast_netlib_c.h", + "/src/clblast_netlib_c.cpp", ] HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] @@ -205,9 +205,9 @@ def main(argv): if i == 5: body += cpp.wrapper_cblas(routine) if i == 6: - body += cpp.clblast_blas_h(routine) + body += cpp.clblast_netlib_c_h(routine) if i == 7: - body += cpp.clblast_blas_cc(routine) + body += cpp.clblast_netlib_c_cc(routine) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 60e29a07..9d4ef6c4 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -95,7 +95,7 @@ def clblast_c_cc(routine): return result -def clblast_blas_h(routine): +def clblast_netlib_c_h(routine): """The Netlib CBLAS API header (.h)""" result = NL + "// " + routine.description + ": " + routine.short_names() + NL for flavour in routine.flavours: @@ -104,7 +104,7 @@ def clblast_blas_h(routine): return result -def clblast_blas_cc(routine): +def clblast_netlib_c_cc(routine): """The Netlib CBLAS API implementation (.cpp)""" result = NL + "// " + routine.name.upper() + NL for flavour in routine.flavours: diff --git a/src/clblast_blas.cpp b/src/clblast_blas.cpp deleted file mode 100644 index 6cc14583..00000000 --- a/src/clblast_blas.cpp +++ /dev/null @@ -1,4650 +0,0 @@ - -// 
================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the Netlib CBLAS implementations to the CLBlast BLAS routines, performing buffer -// copies automatically and running on the default OpenCL platform and device. For full control over -// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. -// -// ================================================================================================= - -#include - -#include "clblast_blas.h" -#include "clblast.h" -#include "utilities/utilities.hpp" - -// Shortcuts to the clblast namespace -using float2 = clblast::float2; -using double2 = clblast::double2; - -// Helper function to get a default OpenCL platform and device -clblast::Device get_device() { - auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); - auto device_id = clblast::ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); - auto platform = clblast::Platform(platform_id); - return clblast::Device(platform, device_id); -} - -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// ================================================================================================= - -// ROTG -void cblas_srotg(float* sa, - float* sb, - float* sc, - float* ss) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto sa_size = 1; - const auto sb_size = 1; - const auto sc_size = 1; - const auto ss_size = 1; - auto sa_buffer = clblast::Buffer(context, sa_size); - auto sb_buffer = 
clblast::Buffer(context, sb_size); - auto sc_buffer = clblast::Buffer(context, sc_size); - auto ss_buffer = clblast::Buffer(context, ss_size); - sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); - sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); - sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); - ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); - auto queue_cl = queue(); - auto s = clblast::Rotg(sa_buffer(), 0, - sb_buffer(), 0, - sc_buffer(), 0, - ss_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); - sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); - sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); - ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); -} -void cblas_drotg(double* sa, - double* sb, - double* sc, - double* ss) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto sa_size = 1; - const auto sb_size = 1; - const auto sc_size = 1; - const auto ss_size = 1; - auto sa_buffer = clblast::Buffer(context, sa_size); - auto sb_buffer = clblast::Buffer(context, sb_size); - auto sc_buffer = clblast::Buffer(context, sc_size); - auto ss_buffer = clblast::Buffer(context, ss_size); - sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); - sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); - sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); - ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); - auto queue_cl = queue(); - auto s = clblast::Rotg(sa_buffer(), 0, - sb_buffer(), 0, - sc_buffer(), 0, - ss_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); - sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); - 
sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); - ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); -} - -// ROTMG -void cblas_srotmg(float* sd1, - float* sd2, - float* sx1, - const float* sy1, - float* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto sy1_size = 1; - const auto sd1_size = 1; - const auto sd2_size = 1; - const auto sx1_size = 1; - const auto sparam_size = 1; - auto sy1_buffer = clblast::Buffer(context, sy1_size); - auto sd1_buffer = clblast::Buffer(context, sd1_size); - auto sd2_buffer = clblast::Buffer(context, sd2_size); - auto sx1_buffer = clblast::Buffer(context, sx1_size); - auto sparam_buffer = clblast::Buffer(context, sparam_size); - sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); - sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); - sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); - sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); - sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); - auto queue_cl = queue(); - auto s = clblast::Rotmg(sd1_buffer(), 0, - sd2_buffer(), 0, - sx1_buffer(), 0, - sy1_buffer(), 0, - sparam_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); - sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); - sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); - sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); -} -void cblas_drotmg(double* sd1, - double* sd2, - double* sx1, - const double* sy1, - double* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto sy1_size = 1; - const auto sd1_size = 1; - const auto sd2_size = 1; - const auto sx1_size = 1; - const auto sparam_size = 1; - auto 
sy1_buffer = clblast::Buffer(context, sy1_size); - auto sd1_buffer = clblast::Buffer(context, sd1_size); - auto sd2_buffer = clblast::Buffer(context, sd2_size); - auto sx1_buffer = clblast::Buffer(context, sx1_size); - auto sparam_buffer = clblast::Buffer(context, sparam_size); - sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); - sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); - sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); - sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); - sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); - auto queue_cl = queue(); - auto s = clblast::Rotmg(sd1_buffer(), 0, - sd2_buffer(), 0, - sx1_buffer(), 0, - sy1_buffer(), 0, - sparam_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); - sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); - sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); - sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); -} - -// ROT -void cblas_srot(const int n, - float* x, const int x_inc, - float* y, const int y_inc, - const float cos, - const float sin) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Rot(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - cos, - sin, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, 
y_size, reinterpret_cast(y)); -} -void cblas_drot(const int n, - double* x, const int x_inc, - double* y, const int y_inc, - const double cos, - const double sin) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Rot(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - cos, - sin, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// ROTM -void cblas_srotm(const int n, - float* x, const int x_inc, - float* y, const int y_inc, - float* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto sparam_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto sparam_buffer = clblast::Buffer(context, sparam_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); - auto queue_cl = queue(); - auto s = clblast::Rotm(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - sparam_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, y_size, reinterpret_cast(y)); - sparam_buffer.Read(queue, 
sparam_size, reinterpret_cast(sparam)); -} -void cblas_drotm(const int n, - double* x, const int x_inc, - double* y, const int y_inc, - double* sparam) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto sparam_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto sparam_buffer = clblast::Buffer(context, sparam_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); - auto queue_cl = queue(); - auto s = clblast::Rotm(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - sparam_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, y_size, reinterpret_cast(y)); - sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); -} - -// SWAP -void cblas_sswap(const int n, - float* x, const int x_inc, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} 
-void cblas_dswap(const int n, - double* x, const int x_inc, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_cswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zswap(const int n, - void* x, const int x_inc, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer 
= clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Swap(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// SCAL -void cblas_sscal(const int n, - const float alpha, - float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_dscal(const int n, - const double alpha, - double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_cscal(const int n, - const void* alpha, - void* x, const int x_inc) { - auto device = get_device(); - auto context = 
clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_zscal(const int n, - const void* alpha, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Scal(n, - alpha_cpp, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} - -// COPY -void cblas_scopy(const int n, - const float* x, const int x_inc, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_dcopy(const int n, - const double* x, const int x_inc, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_ccopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zcopy(const int n, - const void* x, const int x_inc, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - auto x_buffer 
= clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Copy(n, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// AXPY -void cblas_saxpy(const int n, - const float alpha, - const float* x, const int x_inc, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_daxpy(const int n, - const double alpha, - const double* x, const int x_inc, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, 
- y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_caxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zaxpy(const int n, - const void* alpha, - const void* x, const int x_inc, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - const auto y_size = n; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Axpy(n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - 
y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// DOT -void cblas_sdot(const int n, - float* dot, - const float* x, const int x_inc, - const float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto dot_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto dot_buffer = clblast::Buffer(context, dot_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); - auto queue_cl = queue(); - auto s = clblast::Dot(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); -} -void cblas_ddot(const int n, - double* dot, - const double* x, const int x_inc, - const double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto dot_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto dot_buffer = clblast::Buffer(context, dot_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); - auto queue_cl = queue(); - auto s = clblast::Dot(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - dot_buffer.Read(queue, dot_size, 
reinterpret_cast(dot)); -} - -// DOTU -void cblas_cdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto dot_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto dot_buffer = clblast::Buffer(context, dot_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); - auto queue_cl = queue(); - auto s = clblast::Dotu(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); -} -void cblas_zdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto dot_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto dot_buffer = clblast::Buffer(context, dot_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); - auto queue_cl = queue(); - auto s = clblast::Dotu(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); -} - -// DOTC 
-void cblas_cdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto dot_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto dot_buffer = clblast::Buffer(context, dot_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); - auto queue_cl = queue(); - auto s = clblast::Dotc(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); -} -void cblas_zdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; - const auto dot_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto dot_buffer = clblast::Buffer(context, dot_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); - auto queue_cl = queue(); - auto s = clblast::Dotc(n, - dot_buffer(), 0, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); -} - -// NRM2 -void cblas_snrm2(const int n, - float* 
nrm2, - const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto nrm2_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto nrm2_buffer = clblast::Buffer(context, nrm2_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); - auto queue_cl = queue(); - auto s = clblast::Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); -} -void cblas_dnrm2(const int n, - double* nrm2, - const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto nrm2_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto nrm2_buffer = clblast::Buffer(context, nrm2_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); - auto queue_cl = queue(); - auto s = clblast::Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); -} -void cblas_scnrm2(const int n, - void* nrm2, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto nrm2_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto nrm2_buffer = clblast::Buffer(context, nrm2_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - 
nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); - auto queue_cl = queue(); - auto s = clblast::Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); -} -void cblas_dznrm2(const int n, - void* nrm2, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto nrm2_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto nrm2_buffer = clblast::Buffer(context, nrm2_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); - auto queue_cl = queue(); - auto s = clblast::Nrm2(n, - nrm2_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); -} - -// ASUM -void cblas_sasum(const int n, - float* asum, - const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto asum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto asum_buffer = clblast::Buffer(context, asum_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); - auto queue_cl = queue(); - auto s = clblast::Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); -} -void 
cblas_dasum(const int n, - double* asum, - const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto asum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto asum_buffer = clblast::Buffer(context, asum_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); - auto queue_cl = queue(); - auto s = clblast::Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); -} -void cblas_scasum(const int n, - void* asum, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto asum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto asum_buffer = clblast::Buffer(context, asum_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); - auto queue_cl = queue(); - auto s = clblast::Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); -} -void cblas_dzasum(const int n, - void* asum, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto asum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto asum_buffer = clblast::Buffer(context, asum_size); - x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); - auto queue_cl = queue(); - auto s = clblast::Asum(n, - asum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); -} - -// SUM -void cblas_ssum(const int n, - float* sum, - const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto sum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto sum_buffer = clblast::Buffer(context, sum_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); - auto queue_cl = queue(); - auto s = clblast::Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); -} -void cblas_dsum(const int n, - double* sum, - const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto sum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto sum_buffer = clblast::Buffer(context, sum_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); - auto queue_cl = queue(); - auto s = clblast::Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); -} -void 
cblas_scsum(const int n, - void* sum, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto sum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto sum_buffer = clblast::Buffer(context, sum_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); - auto queue_cl = queue(); - auto s = clblast::Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); -} -void cblas_dzsum(const int n, - void* sum, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto sum_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto sum_buffer = clblast::Buffer(context, sum_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); - auto queue_cl = queue(); - auto s = clblast::Sum(n, - sum_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); -} - -// AMAX -void cblas_isamax(const int n, - float* imax, - const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - 
imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} -void cblas_idamax(const int n, - double* imax, - const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} -void cblas_icamax(const int n, - void* imax, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} -void cblas_izamax(const 
int n, - void* imax, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Amax(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} - -// MAX -void cblas_ismax(const int n, - float* imax, - const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} -void cblas_idmax(const int n, - double* imax, - const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} -void cblas_icmax(const int n, - void* imax, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} -void cblas_izmax(const int n, - void* imax, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imax_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); - auto queue_cl = queue(); - auto s = clblast::Max(n, - imax_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); -} - -// MIN 
-void cblas_ismin(const int n, - float* imin, - const float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imin_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); - auto queue_cl = queue(); - auto s = clblast::Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); -} -void cblas_idmin(const int n, - double* imin, - const double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imin_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); - auto queue_cl = queue(); - auto s = clblast::Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); -} -void cblas_icmin(const int n, - void* imin, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imin_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); - x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); - auto queue_cl = queue(); - auto s = clblast::Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); -} -void cblas_izmin(const int n, - void* imin, - const void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto imin_size = 1; - auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); - auto queue_cl = queue(); - auto s = clblast::Min(n, - imin_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); -} - -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// ================================================================================================= - -// GEMV -void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gemv(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// GBMV -void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, const int kl, const int ku, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; - const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Gbmv(static_cast(layout), - static_cast(a_transpose), - m, n, kl, ku, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// HEMV -void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Hemv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Hemv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// HBMV -void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = 
float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Hbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 
0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// HPMV -void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Hpmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* ap, - const void* x, const int x_inc, - const void* beta, - void* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = 
double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Hpmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// SYMV -void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Symv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, 
- y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Symv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// SBMV -void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size 
= n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Sbmv(static_cast(layout), - static_cast(triangle), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - 
-// SPMV -void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* ap, - const float* x, const int x_inc, - const float beta, - float* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} -void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* ap, - const double* x, const int x_inc, - const double beta, - double* y, const int y_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - auto queue_cl = queue(); - auto s = clblast::Spmv(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - beta_cpp, - y_buffer(), 0, y_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - y_buffer.Read(queue, y_size, reinterpret_cast(y)); -} - -// TRMV -void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* a, const int a_ld, - float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* a, const int a_ld, - double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto 
x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * 
x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} - -// TBMV -void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const float* a, const int a_ld, - float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const double* a, const int a_ld, - double* x, const int x_inc) { - auto device = get_device(); - auto context = 
clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* 
a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} - -// TPMV -void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* ap, - float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose 
a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* ap, - double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ztpmv(const CLBlastLayout layout, 
const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tpmv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} - -// TRSV -void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* a, const int a_ld, - float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, 
reinterpret_cast(x)); -} -void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* a, const int a_ld, - double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error 
code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Trsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} - -// TBSV -void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const float* a, const int a_ld, - float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s 
!= clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const double* a, const int a_ld, - double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - 
static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, const int k, - const void* a, const int a_ld, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto a_size = n * a_ld; - const auto x_size = n * x_inc; - auto a_buffer = clblast::Buffer(context, a_size); - auto x_buffer = clblast::Buffer(context, x_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tbsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, k, - a_buffer(), 0, a_ld, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} - -// TPSV -void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const float* ap, - float* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s 
= clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const double* ap, - double* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} -void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int n, - const void* ap, - void* x, const int x_inc) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto ap_size = ((n*(n+1)) / 2); - const auto x_size = n * x_inc; - auto ap_buffer = clblast::Buffer(context, ap_size); - auto x_buffer = clblast::Buffer(context, x_size); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - auto queue_cl = queue(); - auto s = clblast::Tpsv(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - n, - ap_buffer(), 0, - x_buffer(), 0, x_inc, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - x_buffer.Read(queue, x_size, reinterpret_cast(x)); -} - -// GER -void cblas_sger(const CLBlastLayout layout, - const int m, const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = m * x_inc; - const auto y_size = n * y_inc; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Ger(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} -void cblas_dger(const CLBlastLayout layout, - const int m, const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = m * x_inc; - const auto y_size = n * y_inc; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Ger(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} - -// GERU -void cblas_cgeru(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = m * x_inc; - const auto y_size = n * y_inc; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Geru(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} -void cblas_zgeru(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = m * x_inc; - const auto y_size = n * y_inc; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Geru(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} - -// GERC -void cblas_cgerc(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = m * x_inc; - const auto y_size = n * y_inc; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Gerc(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} -void cblas_zgerc(const CLBlastLayout layout, - const int m, const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = m * x_inc; - const auto y_size = n * y_inc; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Gerc(static_cast(layout), - m, n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} - -// HER -void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const void* x, const int x_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto a_size = n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Her(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} -void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const void* x, const int x_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const 
auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto a_size = n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Her(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} - -// HPR -void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const void* x, const int x_inc, - void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, x_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = clblast::Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} -void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const void* x, const int x_inc, - void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - 
const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, x_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = clblast::Hpr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} - -// HER2 -void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto a_size = n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Her2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} -void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const 
int x_inc, - const void* y, const int y_inc, - void* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto a_size = n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Her2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} - -// HPR2 -void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = 
clblast::Hpr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} -void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const void* alpha, - const void* x, const int x_inc, - const void* y, const int y_inc, - void* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = clblast::Hpr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} - -// SYR -void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto a_size = n * a_ld; - auto 
x_buffer = clblast::Buffer(context, x_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Syr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} -void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto a_size = n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Syr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} - -// SPR -void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - float* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, 
x_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = clblast::Spr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} -void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - double* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, x_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = clblast::Spr(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} - -// SYR2 -void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto a_size = n * a_ld; - auto 
x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} -void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* a, const int a_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto a_size = n * a_ld; - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto a_buffer = clblast::Buffer(context, a_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - auto queue_cl = queue(); - auto s = clblast::Syr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - a_buffer(), 0, a_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - a_buffer.Read(queue, a_size, reinterpret_cast(a)); -} - -// SPR2 -void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const 
float alpha, - const float* x, const int x_inc, - const float* y, const int y_inc, - float* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = clblast::Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} -void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, - const int n, - const double alpha, - const double* x, const int x_inc, - const double* y, const int y_inc, - double* ap) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto x_size = n * x_inc; - const auto y_size = n * y_inc; - const auto ap_size = ((n*(n+1)) / 2); - auto x_buffer = clblast::Buffer(context, x_size); - auto y_buffer = clblast::Buffer(context, y_size); - auto ap_buffer = clblast::Buffer(context, ap_size); - x_buffer.Write(queue, x_size, reinterpret_cast(x)); - y_buffer.Write(queue, y_size, reinterpret_cast(y)); - ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); - auto queue_cl = queue(); - auto s = clblast::Spr2(static_cast(layout), - static_cast(triangle), - n, - alpha_cpp, - x_buffer(), 0, x_inc, - 
y_buffer(), 0, y_inc, - ap_buffer(), 0, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); -} - -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// ================================================================================================= - -// GEMM -void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, - const int m, const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Gemm(static_cast(layout), - static_cast(a_transpose), - static_cast(b_transpose), - m, n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} - -// SYMM -void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Symm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} - -// HEMM -void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; - const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Hemm(static_cast(layout), - static_cast(side), - static_cast(triangle), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} - -// SYRK -void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float beta, - float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double beta, - double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syrk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} - -// HERK -void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const float alpha, - const void* a, const int a_ld, - const float beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, - const int n, const int k, - const double alpha, - const void* a, const int a_ld, - const double beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Herk(static_cast(layout), - static_cast(triangle), - static_cast(a_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} - -// SYR2K -void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const float alpha, - const float* a, const int a_ld, - const float* b, const int b_ld, - const float beta, - float* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const double alpha, - const double* a, const int a_ld, - const double* b, const int b_ld, - const double beta, - double* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const void* beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; - const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Syr2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} - -// HER2K -void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const float beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} -void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, - const int n, const int k, - const void* alpha, - const void* a, const int a_ld, - const void* b, const int b_ld, - const double beta, - void* c, const int c_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto beta_cpp = beta; - const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; - const auto c_size = n * c_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - auto c_buffer = clblast::Buffer(context, c_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - c_buffer.Write(queue, c_size, reinterpret_cast(c)); - auto queue_cl = queue(); - auto s = clblast::Her2k(static_cast(layout), - static_cast(triangle), - static_cast(ab_transpose), - n, k, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - beta_cpp, - c_buffer(), 0, c_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - c_buffer.Read(queue, c_size, reinterpret_cast(c)); -} - -// TRMM -void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trmm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} - -// TRSM -void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; - const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Trsm(static_cast(layout), - static_cast(side), - static_cast(triangle), - static_cast(a_transpose), - static_cast(diagonal), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} - -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================= - -// OMATCOPY -void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const float alpha, - const float* a, const int a_ld, - float* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const double alpha, - const double* a, const int a_ld, - double* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = alpha; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} -void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, - const int m, const int n, - const void* alpha, - const void* a, const int a_ld, - void* b, const int b_ld) { - auto device = get_device(); - auto context = clblast::Context(device); - auto queue = clblast::Queue(context, device); - const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; - const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; - auto a_buffer = clblast::Buffer(context, a_size); - auto b_buffer = clblast::Buffer(context, b_size); - a_buffer.Write(queue, a_size, reinterpret_cast(a)); - b_buffer.Write(queue, b_size, reinterpret_cast(b)); - auto queue_cl = queue(); - auto s = clblast::Omatcopy(static_cast(layout), - static_cast(a_transpose), - m, n, - alpha_cpp, - a_buffer(), 0, a_ld, - b_buffer(), 0, b_ld, - &queue_cl); - if (s != clblast::StatusCode::kSuccess) { - throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); - } - b_buffer.Read(queue, b_size, reinterpret_cast(b)); -} - -// ================================================================================================= diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp new file mode 100644 index 00000000..203a3423 --- /dev/null +++ b/src/clblast_netlib_c.cpp @@ -0,0 +1,4650 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the Netlib CBLAS implementations to the CLBlast BLAS routines, performing buffer +// copies automatically and running on the default OpenCL platform and device. For full control over +// performance, it is advised to use the regular clblast.h or clblast_c.h headers instead. 
+// +// ================================================================================================= + +#include + +#include "clblast_netlib_c.h" +#include "clblast.h" +#include "utilities/utilities.hpp" + +// Shortcuts to the clblast namespace +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Helper function to get a default OpenCL platform and device +clblast::Device get_device() { + auto platform_id = clblast::ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}); + auto device_id = clblast::ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}); + auto platform = clblast::Platform(platform_id); + return clblast::Device(platform, device_id); +} + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// ROTG +void cblas_srotg(float* sa, + float* sb, + float* sc, + float* ss) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto sa_size = 1; + const auto sb_size = 1; + const auto sc_size = 1; + const auto ss_size = 1; + auto sa_buffer = clblast::Buffer(context, sa_size); + auto sb_buffer = clblast::Buffer(context, sb_size); + auto sc_buffer = clblast::Buffer(context, sc_size); + auto ss_buffer = clblast::Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = clblast::Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + sa_buffer.Read(queue, sa_size, 
reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} +void cblas_drotg(double* sa, + double* sb, + double* sc, + double* ss) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto sa_size = 1; + const auto sb_size = 1; + const auto sc_size = 1; + const auto ss_size = 1; + auto sa_buffer = clblast::Buffer(context, sa_size); + auto sb_buffer = clblast::Buffer(context, sb_size); + auto sc_buffer = clblast::Buffer(context, sc_size); + auto ss_buffer = clblast::Buffer(context, ss_size); + sa_buffer.Write(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Write(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Write(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Write(queue, ss_size, reinterpret_cast(ss)); + auto queue_cl = queue(); + auto s = clblast::Rotg(sa_buffer(), 0, + sb_buffer(), 0, + sc_buffer(), 0, + ss_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + sa_buffer.Read(queue, sa_size, reinterpret_cast(sa)); + sb_buffer.Read(queue, sb_size, reinterpret_cast(sb)); + sc_buffer.Read(queue, sc_size, reinterpret_cast(sc)); + ss_buffer.Read(queue, ss_size, reinterpret_cast(ss)); +} + +// ROTMG +void cblas_srotmg(float* sd1, + float* sd2, + float* sx1, + const float* sy1, + float* sparam) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto sy1_size = 1; + const auto sd1_size = 1; + const auto sd2_size = 1; + const auto sx1_size = 1; + const auto sparam_size = 1; + auto sy1_buffer = clblast::Buffer(context, sy1_size); + auto sd1_buffer = clblast::Buffer(context, sd1_size); + auto sd2_buffer = clblast::Buffer(context, sd2_size); + auto sx1_buffer = 
clblast::Buffer(context, sx1_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = clblast::Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, + sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotmg(double* sd1, + double* sd2, + double* sx1, + const double* sy1, + double* sparam) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto sy1_size = 1; + const auto sd1_size = 1; + const auto sd2_size = 1; + const auto sx1_size = 1; + const auto sparam_size = 1; + auto sy1_buffer = clblast::Buffer(context, sy1_size); + auto sd1_buffer = clblast::Buffer(context, sd1_size); + auto sd2_buffer = clblast::Buffer(context, sd2_size); + auto sx1_buffer = clblast::Buffer(context, sx1_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = clblast::Rotmg(sd1_buffer(), 0, + sd2_buffer(), 0, 
+ sx1_buffer(), 0, + sy1_buffer(), 0, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + sd1_buffer.Read(queue, sd1_size, reinterpret_cast(sd1)); + sd2_buffer.Read(queue, sd2_size, reinterpret_cast(sd2)); + sx1_buffer.Read(queue, sx1_size, reinterpret_cast(sx1)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// ROT +void cblas_srot(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + const float cos, + const float sin) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_drot(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + const double cos, + const double sin) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Rot(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + cos, + sin, + &queue_cl); 
+ if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// ROTM +void cblas_srotm(const int n, + float* x, const int x_inc, + float* y, const int y_inc, + float* sparam) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto sparam_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = clblast::Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} +void cblas_drotm(const int n, + double* x, const int x_inc, + double* y, const int y_inc, + double* sparam) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto sparam_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto sparam_buffer = clblast::Buffer(context, sparam_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Write(queue, sparam_size, 
reinterpret_cast(sparam)); + auto queue_cl = queue(); + auto s = clblast::Rotm(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + sparam_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); + sparam_buffer.Read(queue, sparam_size, reinterpret_cast(sparam)); +} + +// SWAP +void cblas_sswap(const int n, + float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dswap(const int n, + double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast 
returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zswap(const int n, + void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Swap(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SCAL +void cblas_sscal(const int n, + const float alpha, + float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); 
+ auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dscal(const int n, + const double alpha, + double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_cscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_zscal(const int n, + const void* alpha, + void* x, const int x_inc) { + auto device = 
get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Scal(n, + alpha_cpp, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// COPY +void cblas_scopy(const int n, + const float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dcopy(const int n, + const double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 
0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_ccopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zcopy(const int n, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Copy(n, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// AXPY +void cblas_saxpy(const int n, + const float alpha, + const float* x, const int x_inc, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto 
queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_daxpy(const int n, + const double alpha, + const double* x, const int x_inc, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_caxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + 
x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zaxpy(const int n, + const void* alpha, + const void* x, const int x_inc, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n; + const auto y_size = n; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Axpy(n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// DOT +void cblas_sdot(const int n, + float* dot, + const float* x, const int x_inc, + const float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto dot_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto 
queue_cl = queue(); + auto s = clblast::Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_ddot(const int n, + double* dot, + const double* x, const int x_inc, + const double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto dot_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = clblast::Dot(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTU +void cblas_cdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto dot_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = 
clblast::Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_zdotu(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto dot_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = clblast::Dotu(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// DOTC +void cblas_cdotc(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto dot_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = clblast::Dotc(n, + dot_buffer(), 0, + 
x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} +void cblas_zdotc(const int n, + void* dot, + const void* x, const int x_inc, + const void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto y_size = n; + const auto dot_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto dot_buffer = clblast::Buffer(context, dot_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); + auto queue_cl = queue(); + auto s = clblast::Dotc(n, + dot_buffer(), 0, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); +} + +// NRM2 +void cblas_snrm2(const int n, + float* nrm2, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto nrm2_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + 
nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dnrm2(const int n, + double* nrm2, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto nrm2_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_scnrm2(const int n, + void* nrm2, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto nrm2_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = clblast::Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} +void cblas_dznrm2(const int n, + void* nrm2, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto nrm2_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto nrm2_buffer = 
clblast::Buffer(context, nrm2_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); + auto queue_cl = queue(); + auto s = clblast::Nrm2(n, + nrm2_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); +} + +// ASUM +void cblas_sasum(const int n, + float* asum, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto asum_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_dasum(const int n, + double* asum, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto asum_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + 
clblast::ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_scasum(const int n, + void* asum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto asum_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} +void cblas_dzasum(const int n, + void* asum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto asum_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto asum_buffer = clblast::Buffer(context, asum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); + auto queue_cl = queue(); + auto s = clblast::Asum(n, + asum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); +} + +// SUM +void cblas_ssum(const int n, + float* sum, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto sum_size = 1; + auto x_buffer = clblast::Buffer(context, 
x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_dsum(const int n, + double* sum, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto sum_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_scsum(const int n, + void* sum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto sum_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + 
sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} +void cblas_dzsum(const int n, + void* sum, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto sum_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto sum_buffer = clblast::Buffer(context, sum_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); + auto queue_cl = queue(); + auto s = clblast::Sum(n, + sum_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); +} + +// AMAX +void cblas_isamax(const int n, + float* imax, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_idamax(const int n, + double* imax, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = 
clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_icamax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_izamax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Amax(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + 
imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} + +// MAX +void cblas_ismax(const int n, + float* imax, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_idmax(const int n, + double* imax, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_icmax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = 
clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} +void cblas_izmax(const int n, + void* imax, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imax_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imax_buffer = clblast::Buffer(context, imax_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); + auto queue_cl = queue(); + auto s = clblast::Max(n, + imax_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); +} + +// MIN +void cblas_ismin(const int n, + float* imin, + const float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); 
+ } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_idmin(const int n, + double* imin, + const double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_icmin(const int n, + void* imin, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = clblast::Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} +void cblas_izmin(const int n, + void* imin, + const void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto x_size = n; + const auto imin_size = 1; + auto x_buffer = clblast::Buffer(context, x_size); + auto imin_buffer = 
clblast::Buffer(context, imin_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); + auto queue_cl = queue(); + auto s = clblast::Min(n, + imin_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// GEMV +void cblas_sgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gemv(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// GBMV +void cblas_sgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_cgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, const int kl, const int ku, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto x_size = (a_transpose != CLBlastTransposeNo) ? m * x_inc : n * x_inc; + const auto y_size = (a_transpose != CLBlastTransposeNo) ? 
n * y_inc : m * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Gbmv(static_cast(layout), + static_cast(a_transpose), + m, n, kl, ku, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HEMV +void cblas_chemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhemv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Hemv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HBMV +void cblas_chbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = 
float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Hbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 
0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// HPMV +void cblas_chpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_zhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* ap, + const void* x, const int x_inc, + const void* beta, + void* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = 
double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Hpmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SYMV +void cblas_ssymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, 
+ y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dsymv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Symv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// SBMV +void cblas_ssbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size 
= n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Sbmv(static_cast(layout), + static_cast(triangle), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + 
+// SPMV +void cblas_sspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* ap, + const float* x, const int x_inc, + const float beta, + float* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} +void cblas_dspmv(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* ap, + const double* x, const int x_inc, + const double beta, + double* y, const int y_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + auto queue_cl = queue(); + auto s = clblast::Spmv(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + beta_cpp, + y_buffer(), 0, y_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + y_buffer.Read(queue, y_size, reinterpret_cast(y)); +} + +// TRMV +void cblas_strmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto 
x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * 
x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TBMV +void cblas_stbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = 
clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* 
a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPMV +void cblas_stpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose 
a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpmv(const CLBlastLayout layout, 
const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tpmv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TRSV +void cblas_strsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, 
reinterpret_cast(x)); +} +void cblas_dtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error 
code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Trsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TBSV +void cblas_stbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const float* a, const int a_ld, + float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s 
!= clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const double* a, const int a_ld, + double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + 
static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztbsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, const int k, + const void* a, const int a_ld, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto a_size = n * a_ld; + const auto x_size = n * x_inc; + auto a_buffer = clblast::Buffer(context, a_size); + auto x_buffer = clblast::Buffer(context, x_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tbsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, k, + a_buffer(), 0, a_ld, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// TPSV +void cblas_stpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const float* ap, + float* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s 
= clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_dtpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const double* ap, + double* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ctpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} +void cblas_ztpsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int n, + const void* ap, + void* x, const int x_inc) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto ap_size = ((n*(n+1)) / 2); + const auto x_size = n * x_inc; + auto ap_buffer = clblast::Buffer(context, ap_size); + auto x_buffer = clblast::Buffer(context, x_size); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + auto queue_cl = queue(); + auto s = clblast::Tpsv(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + n, + ap_buffer(), 0, + x_buffer(), 0, x_inc, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + x_buffer.Read(queue, x_size, reinterpret_cast(x)); +} + +// GER +void cblas_sger(const CLBlastLayout layout, + const int m, const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = m * x_inc; + const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dger(const CLBlastLayout layout, + const int m, const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = m * x_inc; + const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Ger(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERU +void cblas_cgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = m * x_inc; + const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgeru(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = m * x_inc; + const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Geru(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// GERC +void cblas_cgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = m * x_inc; + const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zgerc(const CLBlastLayout layout, + const int m, const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = m * x_inc; + const auto y_size = n * y_inc; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? 
m * a_ld : n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Gerc(static_cast(layout), + m, n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HER +void cblas_cher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto a_size = n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const 
auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto a_size = n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Her(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR +void cblas_chpr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const void* x, const int x_inc, + void* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + 
const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = clblast::Hpr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// HER2 +void cblas_cher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto a_size = n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_zher2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const 
int x_inc, + const void* y, const int y_inc, + void* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto a_size = n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Her2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// HPR2 +void cblas_chpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = 
clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_zhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const void* alpha, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = clblast::Hpr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR +void cblas_ssyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto a_size = n * a_ld; + auto 
x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto a_size = n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Syr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR +void cblas_sspr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + float* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, 
x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + double* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, x_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = clblast::Spr(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// SYR2 +void cblas_ssyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto a_size = n * a_ld; + auto 
x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} +void cblas_dsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* a, const int a_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto a_size = n * a_ld; + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto a_buffer = clblast::Buffer(context, a_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + auto queue_cl = queue(); + auto s = clblast::Syr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + a_buffer(), 0, a_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + a_buffer.Read(queue, a_size, reinterpret_cast(a)); +} + +// SPR2 +void cblas_sspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const 
float alpha, + const float* x, const int x_inc, + const float* y, const int y_inc, + float* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} +void cblas_dspr2(const CLBlastLayout layout, const CLBlastTriangle triangle, + const int n, + const double alpha, + const double* x, const int x_inc, + const double* y, const int y_inc, + double* ap) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; + const auto ap_size = ((n*(n+1)) / 2); + auto x_buffer = clblast::Buffer(context, x_size); + auto y_buffer = clblast::Buffer(context, y_size); + auto ap_buffer = clblast::Buffer(context, ap_size); + x_buffer.Write(queue, x_size, reinterpret_cast(x)); + y_buffer.Write(queue, y_size, reinterpret_cast(y)); + ap_buffer.Write(queue, ap_size, reinterpret_cast(ap)); + auto queue_cl = queue(); + auto s = clblast::Spr2(static_cast(layout), + static_cast(triangle), + n, + alpha_cpp, + x_buffer(), 0, x_inc, + 
y_buffer(), 0, y_inc, + ap_buffer(), 0, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + ap_buffer.Read(queue, ap_size, reinterpret_cast(ap)); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// GEMM +void cblas_sgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_cgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, + const int m, const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? m * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && b_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && b_transpose == CLBlastTransposeNo)) ? k * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Gemm(static_cast(layout), + static_cast(a_transpose), + static_cast(b_transpose), + m, n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYMM +void cblas_ssymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Symm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HEMM +void cblas_chemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : ((side == CLBlastSideLeft) ? m : n) * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? ((side == CLBlastSideLeft) ? m : n) * b_ld : n * b_ld; + const auto c_size = (layout == CLBlastLayoutRowMajor) ? 
m * c_ld : n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Hemm(static_cast(layout), + static_cast(side), + static_cast(triangle), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYRK +void cblas_ssyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syrk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HERK +void cblas_cherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const float alpha, + const void* a, const int a_ld, + const float beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, + const int n, const int k, + const double alpha, + const void* a, const int a_ld, + const double beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * a_ld : k * a_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Herk(static_cast(layout), + static_cast(triangle), + static_cast(a_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// SYR2K +void cblas_ssyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const float alpha, + const float* a, const int a_ld, + const float* b, const int b_ld, + const float beta, + float* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_dsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const double alpha, + const double* a, const int a_ld, + const double* b, const int b_ld, + const double beta, + double* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_csyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = float2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const void* beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = double2{reinterpret_cast(beta)[0], reinterpret_cast(beta)[1]}; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Syr2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// HER2K +void cblas_cher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const float beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} +void cblas_zher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose, + const int n, const int k, + const void* alpha, + const void* a, const int a_ld, + const void* b, const int b_ld, + const double beta, + void* c, const int c_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto beta_cpp = beta; + const auto a_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? n * a_ld : k * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && ab_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && ab_transpose == CLBlastTransposeNo)) ? 
n * b_ld : k * b_ld; + const auto c_size = n * c_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + auto c_buffer = clblast::Buffer(context, c_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + c_buffer.Write(queue, c_size, reinterpret_cast(c)); + auto queue_cl = queue(); + auto s = clblast::Her2k(static_cast(layout), + static_cast(triangle), + static_cast(ab_transpose), + n, k, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + beta_cpp, + c_buffer(), 0, c_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + c_buffer.Read(queue, c_size, reinterpret_cast(c)); +} + +// TRMM +void cblas_strmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_dtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ctrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ztrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trmm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// TRSM +void cblas_strsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_dtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ctrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_ztrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = (side == CLBlastSideLeft) ? m * a_ld : n * a_ld; + const auto b_size = (layout == CLBlastLayoutRowMajor) ? 
m * b_ld : n * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Trsm(static_cast(layout), + static_cast(side), + static_cast(triangle), + static_cast(a_transpose), + static_cast(diagonal), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// ================================================================================================= +// Extra non-BLAS routines (level-X) +// ================================================================================================= + +// OMATCOPY +void cblas_somatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const float alpha, + const float* a, const int a_ld, + float* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_domatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const double alpha, + const double* a, const int a_ld, + double* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = alpha; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_comatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} +void cblas_zomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose, + const int m, const int n, + const void* alpha, + const void* a, const int a_ld, + void* b, const int b_ld) { + auto device = get_device(); + auto context = clblast::Context(device); + auto queue = clblast::Queue(context, device); + const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; + const auto a_size = (layout == CLBlastLayoutRowMajor) ? m * a_ld : n * a_ld; + const auto b_size = ((layout == CLBlastLayoutColMajor && a_transpose != CLBlastTransposeNo) || (layout == CLBlastLayoutRowMajor && a_transpose == CLBlastTransposeNo)) ? 
n * b_ld : m * b_ld; + auto a_buffer = clblast::Buffer(context, a_size); + auto b_buffer = clblast::Buffer(context, b_size); + a_buffer.Write(queue, a_size, reinterpret_cast(a)); + b_buffer.Write(queue, b_size, reinterpret_cast(b)); + auto queue_cl = queue(); + auto s = clblast::Omatcopy(static_cast(layout), + static_cast(a_transpose), + m, n, + alpha_cpp, + a_buffer(), 0, a_ld, + b_buffer(), 0, b_ld, + &queue_cl); + if (s != clblast::StatusCode::kSuccess) { + throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); + } + b_buffer.Read(queue, b_size, reinterpret_cast(b)); +} + +// ================================================================================================= -- cgit v1.2.3 From eefe0df43575686c6aa48a9fb6e25e27bef1af40 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 20 Nov 2016 21:36:57 +0100 Subject: Made functions with scalar-buffers as output properly return values --- include/clblast_netlib_c.h | 156 +++++++---------- scripts/generator/generator/cpp.py | 27 ++- scripts/generator/generator/routine.py | 18 +- src/clblast_netlib_c.cpp | 300 ++++++++++++++++----------------- 4 files changed, 249 insertions(+), 252 deletions(-) (limited to 'scripts') diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index c233646e..0a38abb2 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -155,118 +155,88 @@ void PUBLIC_API cblas_zaxpy(const int n, void* y, const int y_inc); // Dot product of two vectors: SDOT/DDOT/HDOT -void PUBLIC_API cblas_sdot(const int n, - float* dot, - const float* x, const int x_inc, - const float* y, const int y_inc); -void PUBLIC_API cblas_ddot(const int n, - double* dot, - const double* x, const int x_inc, - const double* y, const int y_inc); +float PUBLIC_API cblas_sdot(const int n, + const float* x, const int x_inc, + const float* y, const int y_inc); +double PUBLIC_API cblas_ddot(const int n, + const double* x, const int x_inc, + const double* y, 
const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU -void PUBLIC_API cblas_cdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); -void PUBLIC_API cblas_zdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); +float PUBLIC_API cblas_cdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); +double PUBLIC_API cblas_zdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -void PUBLIC_API cblas_cdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); -void PUBLIC_API cblas_zdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc); +float PUBLIC_API cblas_cdotc(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); +double PUBLIC_API cblas_zdotc(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 -void PUBLIC_API cblas_snrm2(const int n, - float* nrm2, - const float* x, const int x_inc); -void PUBLIC_API cblas_dnrm2(const int n, - double* nrm2, - const double* x, const int x_inc); -void PUBLIC_API cblas_scnrm2(const int n, - void* nrm2, - const void* x, const int x_inc); -void PUBLIC_API cblas_dznrm2(const int n, - void* nrm2, - const void* x, const int x_inc); +float PUBLIC_API cblas_snrm2(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dnrm2(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scnrm2(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dznrm2(const int n, + const void* x, const int x_inc); // Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM -void PUBLIC_API cblas_sasum(const int n, - float* asum, - const float* x, const int x_inc); 
-void PUBLIC_API cblas_dasum(const int n, - double* asum, - const double* x, const int x_inc); -void PUBLIC_API cblas_scasum(const int n, - void* asum, - const void* x, const int x_inc); -void PUBLIC_API cblas_dzasum(const int n, - void* asum, - const void* x, const int x_inc); +float PUBLIC_API cblas_sasum(const int n, + const float* x, const int x_inc); +double PUBLIC_API cblas_dasum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scasum(const int n, + const void* x, const int x_inc); +double PUBLIC_API cblas_dzasum(const int n, + const void* x, const int x_inc); // Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM -void PUBLIC_API cblas_ssum(const int n, - float* sum, - const float* x, const int x_inc); -void PUBLIC_API cblas_dsum(const int n, - double* sum, - const double* x, const int x_inc); -void PUBLIC_API cblas_scsum(const int n, - void* sum, - const void* x, const int x_inc); -void PUBLIC_API cblas_dzsum(const int n, - void* sum, - const void* x, const int x_inc); - -// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX -void PUBLIC_API cblas_isamax(const int n, - float* imax, +float PUBLIC_API cblas_ssum(const int n, const float* x, const int x_inc); -void PUBLIC_API cblas_idamax(const int n, - double* imax, - const double* x, const int x_inc); -void PUBLIC_API cblas_icamax(const int n, - void* imax, - const void* x, const int x_inc); -void PUBLIC_API cblas_izamax(const int n, - void* imax, +double PUBLIC_API cblas_dsum(const int n, + const double* x, const int x_inc); +float PUBLIC_API cblas_scsum(const int n, const void* x, const int x_inc); +double PUBLIC_API cblas_dzsum(const int n, + const void* x, const int x_inc); -// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX -void PUBLIC_API cblas_ismax(const int n, - float* imax, +// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX +int PUBLIC_API 
cblas_isamax(const int n, const float* x, const int x_inc); -void PUBLIC_API cblas_idmax(const int n, - double* imax, +int PUBLIC_API cblas_idamax(const int n, const double* x, const int x_inc); -void PUBLIC_API cblas_icmax(const int n, - void* imax, +int PUBLIC_API cblas_icamax(const int n, const void* x, const int x_inc); -void PUBLIC_API cblas_izmax(const int n, - void* imax, +int PUBLIC_API cblas_izamax(const int n, const void* x, const int x_inc); +// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX +int PUBLIC_API cblas_ismax(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmax(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icmax(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmax(const int n, + const void* x, const int x_inc); + // Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN -void PUBLIC_API cblas_ismin(const int n, - float* imin, - const float* x, const int x_inc); -void PUBLIC_API cblas_idmin(const int n, - double* imin, - const double* x, const int x_inc); -void PUBLIC_API cblas_icmin(const int n, - void* imin, - const void* x, const int x_inc); -void PUBLIC_API cblas_izmin(const int n, - void* imin, - const void* x, const int x_inc); +int PUBLIC_API cblas_ismin(const int n, + const float* x, const int x_inc); +int PUBLIC_API cblas_idmin(const int n, + const double* x, const int x_inc); +int PUBLIC_API cblas_icmin(const int n, + const void* x, const int x_inc); +int PUBLIC_API cblas_izmin(const int n, + const void* x, const int x_inc); // ================================================================================================= // BLAS level-2 (matrix-vector) routines diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 9d4ef6c4..7b7ece22 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -100,7 +100,7 @@ def 
clblast_netlib_c_h(routine): result = NL + "// " + routine.description + ": " + routine.short_names() + NL for flavour in routine.flavours: if flavour.precision_name in ["S", "D", "C", "Z"]: - result += routine.routine_header_netlib(flavour, 24, " PUBLIC_API") + ";" + NL + result += routine.routine_header_netlib(flavour, 20, " PUBLIC_API") + ";" + NL return result @@ -113,7 +113,7 @@ def clblast_netlib_c_cc(routine): if flavour.precision_name in ["S", "D", "C", "Z"]: template = "<" + flavour.template + ">" if routine.no_scalars() else "" indent = " " * (21 + routine.length() + len(template)) - result += routine.routine_header_netlib(flavour, 13, "") + " {" + NL + result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL # Initialize OpenCL result += " auto device = get_device();" + NL @@ -127,10 +127,13 @@ def clblast_netlib_c_cc(routine): for i, name in enumerate(routine.inputs + routine.outputs): result += " " + routine.set_size(name, routine.buffer_sizes[i]) + NL for i, name in enumerate(routine.inputs + routine.outputs): - result += " " + routine.create_buffer(name, flavour.buffer_type) + NL + buffer_type = routine.get_buffer_type(name, flavour) + result += " " + routine.create_buffer(name, buffer_type) + NL for name in routine.inputs + routine.outputs: - prefix = "" if name in routine.outputs else "const " - result += " " + routine.write_buffer(name, prefix + flavour.buffer_type) + NL + if name not in routine.scalar_buffers_first(): + prefix = "" if name in routine.outputs else "const " + buffer_type = routine.get_buffer_type(name, flavour) + result += " " + routine.write_buffer(name, prefix + buffer_type) + NL # The function call result += " auto queue_cl = queue();" + NL @@ -145,7 +148,19 @@ def clblast_netlib_c_cc(routine): # Copy back and clean-up for name in routine.outputs: - result += " " + routine.read_buffer(name, flavour.buffer_type) + NL + if name in routine.scalar_buffers_first(): + buffer_type = routine.get_buffer_type(name, flavour) + 
result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL + for name in routine.outputs: + buffer_type = routine.get_buffer_type(name, flavour) + result += " " + routine.read_buffer(name, buffer_type) + NL + for name in routine.outputs: + if name in routine.scalar_buffers_first(): + result += " return " + name + "[0]" + if flavour.buffer_type in ["float2", "double2"]: + if name not in routine.index_buffers(): + result += ".real()" + result += ";" + NL result += "}" + NL return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 097376ad..391cf3e0 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -109,6 +109,11 @@ class Routine: """List of buffers without 'inc' or 'ld'""" return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"] + def get_buffer_type(self, name, flavour): + if name in self.index_buffers(): + return "int" + return flavour.buffer_type + def length(self): """Retrieves the number of characters in the routine's name""" return len(self.name) @@ -549,7 +554,6 @@ class Routine: def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" return (self.options_def_c() + self.sizes_def_netlib() + - list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + self.scalar_def_void("beta", flavour) + @@ -645,8 +649,16 @@ class Routine: def routine_header_netlib(self, flavour, spaces, extra_qualifier): """As above, but now for the original Netlib CBLAS API""" - indent = " " * (spaces + self.length()) - result = "void" + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" + return_type = "void" + for output in self.outputs: + if output in self.index_buffers(): + return_type = "int" + break + if output in self.scalar_buffers_first(): + 
return_type = flavour.buffer_type.replace("2", "") + break + indent = " " * (spaces + len(return_type) + self.length()) + result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index 203a3423..efff1712 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -666,10 +666,9 @@ void cblas_zaxpy(const int n, } // DOT -void cblas_sdot(const int n, - float* dot, - const float* x, const int x_inc, - const float* y, const int y_inc) { +float cblas_sdot(const int n, + const float* x, const int x_inc, + const float* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -681,7 +680,6 @@ void cblas_sdot(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dot(n, dot_buffer(), 0, @@ -691,12 +689,13 @@ void cblas_sdot(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0]; } -void cblas_ddot(const int n, - double* dot, - const double* x, const int x_inc, - const double* y, const int y_inc) { +double cblas_ddot(const int n, + const double* x, const int x_inc, + const double* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -708,7 +707,6 @@ void cblas_ddot(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, 
y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dot(n, dot_buffer(), 0, @@ -718,14 +716,15 @@ void cblas_ddot(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0]; } // DOTU -void cblas_cdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { +float cblas_cdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -737,7 +736,6 @@ void cblas_cdotu(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotu(n, dot_buffer(), 0, @@ -747,12 +745,13 @@ void cblas_cdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } -void cblas_zdotu(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { +double cblas_zdotu(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -764,7 +763,6 @@ void cblas_zdotu(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, 
reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotu(n, dot_buffer(), 0, @@ -774,14 +772,15 @@ void cblas_zdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } // DOTC -void cblas_cdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { +float cblas_cdotc(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -793,7 +792,6 @@ void cblas_cdotc(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotc(n, dot_buffer(), 0, @@ -803,12 +801,13 @@ void cblas_cdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } -void cblas_zdotc(const int n, - void* dot, - const void* x, const int x_inc, - const void* y, const int y_inc) { +double cblas_zdotc(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -820,7 +819,6 @@ void cblas_zdotc(const int n, auto dot_buffer = clblast::Buffer(context, dot_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); y_buffer.Write(queue, y_size, reinterpret_cast(y)); - dot_buffer.Write(queue, dot_size, reinterpret_cast(dot)); auto queue_cl = queue(); auto s = clblast::Dotc(n, 
dot_buffer(), 0, @@ -830,13 +828,14 @@ void cblas_zdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); + return dot[0].real(); } // NRM2 -void cblas_snrm2(const int n, - float* nrm2, - const float* x, const int x_inc) { +float cblas_snrm2(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -845,7 +844,6 @@ void cblas_snrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -854,11 +852,12 @@ void cblas_snrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0]; } -void cblas_dnrm2(const int n, - double* nrm2, - const double* x, const int x_inc) { +double cblas_dnrm2(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -867,7 +866,6 @@ void cblas_dnrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -876,11 +874,12 @@ void cblas_dnrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); 
} + double nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0]; } -void cblas_scnrm2(const int n, - void* nrm2, - const void* x, const int x_inc) { +float cblas_scnrm2(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -889,7 +888,6 @@ void cblas_scnrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -898,11 +896,12 @@ void cblas_scnrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0].real(); } -void cblas_dznrm2(const int n, - void* nrm2, - const void* x, const int x_inc) { +double cblas_dznrm2(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -911,7 +910,6 @@ void cblas_dznrm2(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - nrm2_buffer.Write(queue, nrm2_size, reinterpret_cast(nrm2)); auto queue_cl = queue(); auto s = clblast::Nrm2(n, nrm2_buffer(), 0, @@ -920,13 +918,14 @@ void cblas_dznrm2(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 nrm2[nrm2_size]; nrm2_buffer.Read(queue, nrm2_size, reinterpret_cast(nrm2)); + return nrm2[0].real(); } // ASUM -void cblas_sasum(const int n, - float* asum, - const float* x, const int 
x_inc) { +float cblas_sasum(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -935,7 +934,6 @@ void cblas_sasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -944,11 +942,12 @@ void cblas_sasum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); + return asum[0]; } -void cblas_dasum(const int n, - double* asum, - const double* x, const int x_inc) { +double cblas_dasum(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -957,7 +956,6 @@ void cblas_dasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -966,11 +964,12 @@ void cblas_dasum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); + return asum[0]; } -void cblas_scasum(const int n, - void* asum, - const void* x, const int x_inc) { +float cblas_scasum(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -979,7 
+978,6 @@ void cblas_scasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -988,11 +986,12 @@ void cblas_scasum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); + return asum[0].real(); } -void cblas_dzasum(const int n, - void* asum, - const void* x, const int x_inc) { +double cblas_dzasum(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1001,7 +1000,6 @@ void cblas_dzasum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - asum_buffer.Write(queue, asum_size, reinterpret_cast(asum)); auto queue_cl = queue(); auto s = clblast::Asum(n, asum_buffer(), 0, @@ -1010,13 +1008,14 @@ void cblas_dzasum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 asum[asum_size]; asum_buffer.Read(queue, asum_size, reinterpret_cast(asum)); + return asum[0].real(); } // SUM -void cblas_ssum(const int n, - float* sum, - const float* x, const int x_inc) { +float cblas_ssum(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1025,7 +1024,6 @@ void cblas_ssum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1034,11 +1032,12 @@ void cblas_ssum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0]; } -void cblas_dsum(const int n, - double* sum, - const double* x, const int x_inc) { +double cblas_dsum(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1047,7 +1046,6 @@ void cblas_dsum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1056,11 +1054,12 @@ void cblas_dsum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0]; } -void cblas_scsum(const int n, - void* sum, - const void* x, const int x_inc) { +float cblas_scsum(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1069,7 +1068,6 @@ void cblas_scsum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1078,11 +1076,12 @@ void cblas_scsum(const int n, if (s != 
clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + float2 sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0].real(); } -void cblas_dzsum(const int n, - void* sum, - const void* x, const int x_inc) { +double cblas_dzsum(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -1091,7 +1090,6 @@ void cblas_dzsum(const int n, auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - sum_buffer.Write(queue, sum_size, reinterpret_cast(sum)); auto queue_cl = queue(); auto s = clblast::Sum(n, sum_buffer(), 0, @@ -1100,22 +1098,22 @@ void cblas_dzsum(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } + double2 sum[sum_size]; sum_buffer.Read(queue, sum_size, reinterpret_cast(sum)); + return sum[0].real(); } // AMAX -void cblas_isamax(const int n, - float* imax, - const float* x, const int x_inc) { +int cblas_isamax(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1124,20 +1122,20 @@ void cblas_isamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - 
imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_idamax(const int n, - double* imax, - const double* x, const int x_inc) { +int cblas_idamax(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1146,20 +1144,20 @@ void cblas_idamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_icamax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_icamax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1168,20 +1166,20 @@ void cblas_icamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error 
code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_izamax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_izamax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Amax(n, imax_buffer(), 0, @@ -1190,22 +1188,22 @@ void cblas_izamax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } // MAX -void cblas_ismax(const int n, - float* imax, - const float* x, const int x_inc) { +int cblas_ismax(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1214,20 +1212,20 @@ void cblas_ismax(const int n, if (s != clblast::StatusCode::kSuccess) { throw 
std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_idmax(const int n, - double* imax, - const double* x, const int x_inc) { +int cblas_idmax(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1236,20 +1234,20 @@ void cblas_idmax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_icmax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_icmax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1258,20 +1256,20 @@ void cblas_icmax(const int n, if (s != 
clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } -void cblas_izmax(const int n, - void* imax, - const void* x, const int x_inc) { +int cblas_izmax(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imax_buffer = clblast::Buffer(context, imax_size); + auto imax_buffer = clblast::Buffer(context, imax_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imax_buffer.Write(queue, imax_size, reinterpret_cast(imax)); auto queue_cl = queue(); auto s = clblast::Max(n, imax_buffer(), 0, @@ -1280,22 +1278,22 @@ void cblas_izmax(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + int imax[imax_size]; + imax_buffer.Read(queue, imax_size, reinterpret_cast(imax)); + return imax[0]; } // MIN -void cblas_ismin(const int n, - float* imin, - const float* x, const int x_inc) { +int cblas_ismin(const int n, + const float* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, @@ -1304,20 +1302,20 @@ void 
cblas_ismin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } -void cblas_idmin(const int n, - double* imin, - const double* x, const int x_inc) { +int cblas_idmin(const int n, + const double* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, @@ -1326,20 +1324,20 @@ void cblas_idmin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } -void cblas_icmin(const int n, - void* imin, - const void* x, const int x_inc) { +int cblas_icmin(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, 
@@ -1348,20 +1346,20 @@ void cblas_icmin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } -void cblas_izmin(const int n, - void* imin, - const void* x, const int x_inc) { +int cblas_izmin(const int n, + const void* x, const int x_inc) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto x_size = n; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); - auto imin_buffer = clblast::Buffer(context, imin_size); + auto imin_buffer = clblast::Buffer(context, imin_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); - imin_buffer.Write(queue, imin_size, reinterpret_cast(imin)); auto queue_cl = queue(); auto s = clblast::Min(n, imin_buffer(), 0, @@ -1370,7 +1368,9 @@ void cblas_izmin(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + int imin[imin_size]; + imin_buffer.Read(queue, imin_size, reinterpret_cast(imin)); + return imin[0]; } // ================================================================================================= -- cgit v1.2.3 From 26ca07148092b5d4fcb0e25190e07bf6acae25a3 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 22 Nov 2016 08:41:52 +0100 Subject: Minor changes to ensure full compatibility with the Netlib CBLAS API --- include/clblast_netlib_c.h | 50 ++++++++++++++++++++++++---------- scripts/generator/generator.py | 2 +- scripts/generator/generator/cpp.py | 7 +++-- scripts/generator/generator/routine.py | 33 +++++++++++++++++----- src/clblast_netlib_c.cpp | 46 +++++++++++++++---------------- 5 files changed, 90 
insertions(+), 48 deletions(-) (limited to 'scripts') diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index 0a38abb2..b5577cfa 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -46,6 +46,24 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; +// For full compatibility with CBLAS +typedef CLBlastLayout CBLAS_ORDER; +typedef CLBlastTranspose CBLAS_TRANSPOSE; +typedef CLBlastTriangle CBLAS_UPLO; +typedef CLBlastDiagonal CBLAS_DIAG; +typedef CLBlastSide CBLAS_SIDE; +#define CblasRowMajor CLBlastLayoutRowMajor +#define CblasColMajor CLBlastLayoutColMajor +#define CblasNoTrans CLBlastTransposeNo +#define CblasTrans CLBlastTransposeYes +#define CblasConjTrans CLBlastTransposeConjugate +#define CblasUpper CLBlastTriangleUpper +#define CblasLower CLBlastTriangleLower +#define CblasNonUnit CLBlastDiagonalNonUnit +#define CblasUnit CLBlastDiagonalUnit +#define CblasLeft CLBlastSideLeft +#define CblasRight CLBlastSideRight + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -64,12 +82,12 @@ void PUBLIC_API cblas_drotg(double* sa, void PUBLIC_API cblas_srotmg(float* sd1, float* sd2, float* sx1, - const float* sy1, + const float sy1, float* sparam); void PUBLIC_API cblas_drotmg(double* sd1, double* sd2, double* sx1, - const double* sy1, + const double sy1, double* sparam); // Apply givens plane rotation: SROT/DROT @@ -163,20 +181,24 @@ double PUBLIC_API cblas_ddot(const int n, const double* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU -float PUBLIC_API cblas_cdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); -double PUBLIC_API 
cblas_zdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); +void PUBLIC_API cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -float PUBLIC_API cblas_cdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); -double PUBLIC_API cblas_zdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); +void PUBLIC_API cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 float PUBLIC_API cblas_snrm2(const int n, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 1a467340..5f0bb0d4 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,7 +41,7 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32] FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] # Different possibilities for requirements diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 7b7ece22..6bb3080f 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -112,6 +112,7 @@ def clblast_netlib_c_cc(routine): # There is a version available in CBLAS if flavour.precision_name in ["S", "D", "C", "Z"]: template = "<" + flavour.template + ">" if routine.no_scalars() else "" + name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else "" indent = " " * (21 + 
routine.length() + len(template)) result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL @@ -129,6 +130,8 @@ def clblast_netlib_c_cc(routine): for i, name in enumerate(routine.inputs + routine.outputs): buffer_type = routine.get_buffer_type(name, flavour) result += " " + routine.create_buffer(name, buffer_type) + NL + if name in routine.scalar_buffers_second_non_pointer(): + result += " " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL for name in routine.inputs + routine.outputs: if name not in routine.scalar_buffers_first(): prefix = "" if name in routine.outputs else "const " @@ -148,14 +151,14 @@ def clblast_netlib_c_cc(routine): # Copy back and clean-up for name in routine.outputs: - if name in routine.scalar_buffers_first(): + if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return(): buffer_type = routine.get_buffer_type(name, flavour) result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL for name in routine.outputs: buffer_type = routine.get_buffer_type(name, flavour) result += " " + routine.read_buffer(name, buffer_type) + NL for name in routine.outputs: - if name in routine.scalar_buffers_first(): + if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return(): result += " return " + name + "[0]" if flavour.buffer_type in ["float2", "double2"]: if name not in routine.index_buffers(): diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 391cf3e0..6fcce23b 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -42,6 +42,11 @@ class Routine: """List of scalar buffers""" return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"] + @staticmethod + def scalar_buffers_second_non_pointer(): + """As above, but these ones are not passed as pointers but as scalars instead""" + return ["sy1"] + @staticmethod def 
other_scalars(): """List of scalars other than alpha and beta""" @@ -67,6 +72,10 @@ class Routine: """Distinguish between vectors and matrices""" return ["a", "b", "c", "ap"] + @staticmethod + def routines_scalar_no_return(): + return ["dotu", "dotc"] + @staticmethod def set_size(name, size): """Sets the size of a buffer""" @@ -77,10 +86,12 @@ class Routine: """Creates a new CLCudaAPI buffer""" return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);" - @staticmethod - def write_buffer(name, template): + def write_buffer(self, name, template): """Writes to a CLCudaAPI buffer""" - data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + postfix = "" + if name in self.scalar_buffers_second_non_pointer(): + postfix = "_vec" + data_structure = "reinterpret_cast<" + template + "*>(" + name + postfix + ")" return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");" @staticmethod @@ -206,7 +217,8 @@ class Routine: prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: data_type = "void" if flavour.is_non_standard() else flavour.buffer_type - a = [prefix + data_type + "* " + name + ""] + pointer = "" if name in self.scalar_buffers_second_non_pointer() else "*" + a = [prefix + data_type + pointer + " " + name + ""] c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + c)] return [] @@ -553,13 +565,16 @@ class Routine: def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" - return (self.options_def_c() + self.sizes_def_netlib() + + result=(self.options_def_c() + self.sizes_def_netlib() + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + self.scalar_def_void("beta", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) + 
list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + if self.name in self.routines_scalar_no_return(): + result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + return result def arguments_def_c(self, flavour): """As above, but for the C API""" @@ -654,11 +669,15 @@ class Routine: if output in self.index_buffers(): return_type = "int" break - if output in self.scalar_buffers_first(): + if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return(): return_type = flavour.buffer_type.replace("2", "") break indent = " " * (spaces + len(return_type) + self.length()) - result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" + routine_name = self.name + if self.name in self.routines_scalar_no_return(): + routine_name += "_sub" + indent += " " + result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index efff1712..66852e31 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -107,7 +107,7 @@ void cblas_drotg(double* sa, void cblas_srotmg(float* sd1, float* sd2, float* sx1, - const float* sy1, + const float sy1, float* sparam) { auto device = get_device(); auto context = clblast::Context(device); @@ -118,11 +118,12 @@ void cblas_srotmg(float* sd1, const auto sx1_size = 1; const auto sparam_size = 1; auto sy1_buffer = clblast::Buffer(context, sy1_size); + float sy1_vec[1]; sy1_vec[0] = sy1; auto sd1_buffer = clblast::Buffer(context, sd1_size); auto sd2_buffer = clblast::Buffer(context, sd2_size); auto sx1_buffer = clblast::Buffer(context, sx1_size); auto sparam_buffer = clblast::Buffer(context, sparam_size); - 
sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1_vec)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); @@ -145,7 +146,7 @@ void cblas_srotmg(float* sd1, void cblas_drotmg(double* sd1, double* sd2, double* sx1, - const double* sy1, + const double sy1, double* sparam) { auto device = get_device(); auto context = clblast::Context(device); @@ -156,11 +157,12 @@ void cblas_drotmg(double* sd1, const auto sx1_size = 1; const auto sparam_size = 1; auto sy1_buffer = clblast::Buffer(context, sy1_size); + double sy1_vec[1]; sy1_vec[0] = sy1; auto sd1_buffer = clblast::Buffer(context, sd1_size); auto sd2_buffer = clblast::Buffer(context, sd2_size); auto sx1_buffer = clblast::Buffer(context, sx1_size); auto sparam_buffer = clblast::Buffer(context, sparam_size); - sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1)); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast(sy1_vec)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast(sx1)); @@ -722,9 +724,10 @@ double cblas_ddot(const int n, } // DOTU -float cblas_cdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -745,13 +748,12 @@ float cblas_cdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } -double cblas_zdotu(const int n, - const void* x, const int 
x_inc, - const void* y, const int y_inc) { +void cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -772,15 +774,14 @@ double cblas_zdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } // DOTC -float cblas_cdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -801,13 +802,12 @@ float cblas_cdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } -double cblas_zdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -828,9 +828,7 @@ double cblas_zdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast(dot)); - return dot[0].real(); } // NRM2 -- cgit v1.2.3 From 792cc8359fe96dd6a53064579b18f76d9e913f98 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 23 Nov 2016 22:00:20 +0100 Subject: Fixed a 
vector-size related bug in the CLBlast Netlib API --- scripts/generator/generator.py | 30 ++++----- src/clblast_netlib_c.cpp | 144 ++++++++++++++++++++--------------------- 2 files changed, 87 insertions(+), 87 deletions(-) (limited to 'scripts') diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 5f0bb0d4..35d902b7 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -101,21 +101,21 @@ ROUTINES = [ [ # Level 1: vector-vector Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], ["1","1","1","1"], [], "", "Generate givens plane rotation", "", []), Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], ["1","1","1","1","1"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["n","n"], ["cos","sin"],"", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], ["n","n","1"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], ["n","n"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["n"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["n","n"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", 
"Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], ["n","n","1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], ["n","1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], ["n","1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], ["n","1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], ["n","1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], ["n","1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], [xn,yn], ["cos","sin"],"", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [xn,yn,"1"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [xn,yn], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], [xn], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. 
The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [xn,yn,"1"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. 
This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], [ # Level 2: matrix-vector Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index 66852e31..3fbabd43 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -192,8 +192,8 @@ void cblas_srot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -219,8 +219,8 @@ void cblas_drot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -247,8 +247,8 @@ void cblas_srotm(const 
int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto sparam_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -276,8 +276,8 @@ void cblas_drotm(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto sparam_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -306,8 +306,8 @@ void cblas_sswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -329,8 +329,8 @@ void cblas_dswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -352,8 +352,8 @@ void cblas_cswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = 
clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -375,8 +375,8 @@ void cblas_zswap(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -401,7 +401,7 @@ void cblas_sscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); @@ -421,7 +421,7 @@ void cblas_dscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); @@ -441,7 +441,7 @@ void cblas_cscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); auto queue_cl = queue(); @@ -461,7 +461,7 @@ void cblas_zscal(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; + const auto x_size = n * x_inc; auto x_buffer = clblast::Buffer(context, x_size); x_buffer.Write(queue, x_size, 
reinterpret_cast(x)); auto queue_cl = queue(); @@ -482,8 +482,8 @@ void cblas_scopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -504,8 +504,8 @@ void cblas_dcopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -526,8 +526,8 @@ void cblas_ccopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -548,8 +548,8 @@ void cblas_zcopy(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -574,8 +574,8 @@ void cblas_saxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; - const auto y_size = n; + 
const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -599,8 +599,8 @@ void cblas_daxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = alpha; - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -624,8 +624,8 @@ void cblas_caxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = float2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -649,8 +649,8 @@ void cblas_zaxpy(const int n, auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); const auto alpha_cpp = double2{reinterpret_cast(alpha)[0], reinterpret_cast(alpha)[1]}; - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); x_buffer.Write(queue, x_size, reinterpret_cast(x)); @@ -674,8 +674,8 @@ float cblas_sdot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto 
y_buffer = clblast::Buffer(context, y_size); @@ -701,8 +701,8 @@ double cblas_ddot(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -731,8 +731,8 @@ void cblas_cdotu_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -757,8 +757,8 @@ void cblas_zdotu_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -785,8 +785,8 @@ void cblas_cdotc_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -811,8 +811,8 @@ void cblas_zdotc_sub(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; - const auto y_size = n; + const auto x_size = n * x_inc; + const auto y_size = n * y_inc; const auto dot_size = 1; auto 
x_buffer = clblast::Buffer(context, x_size); auto y_buffer = clblast::Buffer(context, y_size); @@ -837,7 +837,7 @@ float cblas_snrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -859,7 +859,7 @@ double cblas_dnrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -881,7 +881,7 @@ float cblas_scnrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -903,7 +903,7 @@ double cblas_dznrm2(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto nrm2_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto nrm2_buffer = clblast::Buffer(context, nrm2_size); @@ -927,7 +927,7 @@ float cblas_sasum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -949,7 +949,7 @@ double cblas_dasum(const int n, auto device = get_device(); auto context = clblast::Context(device); 
auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -971,7 +971,7 @@ float cblas_scasum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -993,7 +993,7 @@ double cblas_dzasum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto asum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto asum_buffer = clblast::Buffer(context, asum_size); @@ -1017,7 +1017,7 @@ float cblas_ssum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); @@ -1039,7 +1039,7 @@ double cblas_dsum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); @@ -1061,7 +1061,7 @@ float cblas_scsum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = 
clblast::Buffer(context, sum_size); @@ -1083,7 +1083,7 @@ double cblas_dzsum(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto sum_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto sum_buffer = clblast::Buffer(context, sum_size); @@ -1107,7 +1107,7 @@ int cblas_isamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1129,7 +1129,7 @@ int cblas_idamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1151,7 +1151,7 @@ int cblas_icamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1173,7 +1173,7 @@ int cblas_izamax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1197,7 +1197,7 @@ int cblas_ismax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto 
x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1219,7 +1219,7 @@ int cblas_idmax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1241,7 +1241,7 @@ int cblas_icmax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1263,7 +1263,7 @@ int cblas_izmax(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imax_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imax_buffer = clblast::Buffer(context, imax_size); @@ -1287,7 +1287,7 @@ int cblas_ismin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); @@ -1309,7 +1309,7 @@ int cblas_idmin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); @@ -1331,7 +1331,7 @@ int 
cblas_icmin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); @@ -1353,7 +1353,7 @@ int cblas_izmin(const int n, auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); - const auto x_size = n; + const auto x_size = n * x_inc; const auto imin_size = 1; auto x_buffer = clblast::Buffer(context, x_size); auto imin_buffer = clblast::Buffer(context, imin_size); -- cgit v1.2.3