From 5c83217cf256984573924e8f89c46f393a5fcfcd Mon Sep 17 00:00:00 2001 From: cnugteren Date: Fri, 1 Apr 2016 22:36:39 -0700 Subject: Added a wrapper for CBLAS libraries for performance/correctness testing --- include/clblast.h | 2 +- include/clblast_c.h | 4 +- scripts/generator/datatype.py | 5 + scripts/generator/generator.py | 53 +- scripts/generator/routine.py | 109 ++- src/clblast.cc | 6 +- src/clblast_c.cc | 4 +- test/wrapper_cblas.h | 1667 ++++++++++++++++++++++++++++++++++++++++ test/wrapper_clblas.h | 6 +- 9 files changed, 1825 insertions(+), 31 deletions(-) create mode 100644 test/wrapper_cblas.h diff --git a/include/clblast.h b/include/clblast.h index 5e5c5a46..431f2510 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -100,7 +100,7 @@ template StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event = nullptr); diff --git a/include/clblast_c.h b/include/clblast_c.h index dcb3ae3a..f72cff3a 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -112,13 +112,13 @@ StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event); diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py index 9323bc4d..5a58ab53 100644 --- a/scripts/generator/datatype.py +++ b/scripts/generator/datatype.py @@ -58,5 +58,10 @@ class DataType(): return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp + # Current scalar is complex + def IsComplex(self, scalar): + return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or + (scalar == "beta" and self.beta_cpp in [FLT2, DBL2])) + # ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 6e2b2ed2..36a9bf40 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -8,12 +8,13 @@ # Cedric Nugteren # # This script automatically generates the bodies of the following files, creating the full CLBlast -# API interface and implementation (C, C++, and clBLAS wrapper): +# API interface and implementation (C, C++, and reference BLAS wrappers): # clblast.h # clblast.cc # clblast_c.h # clblast_c.cc # wrapper_clblas.h +# wrapper_cblas.h # It also generates the main functions for the correctness and performance tests as found in # test/correctness/routines/levelX/xYYYY.cc # test/performance/routines/levelX/xYYYY.cc @@ -55,7 +56,7 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") routines = [ [ # Level 1: vector-vector Routine(False, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"), - Routine(False, "1", "rotmg", T, [S,D], [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], "", "Generate modified givens plane rotation"), + Routine(False, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"), Routine(False, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"), Routine(False, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"), Routine(True, "1", "swap", T, [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"), @@ -220,11 +221,11 @@ def wrapper_clblas(routines): for routine in routines: result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames()) if routine.NoScalars(): - result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n" + result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" for flavour in routine.flavours: indent = " "*(17 + routine.Length()) - result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n" - arguments = routine.ArgumentsWrapper(flavour) + result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" + arguments = routine.ArgumentsWrapperCL(flavour) if routine.scratch: result += " auto queue = Queue(queues[0]);\n" result += " auto context = queue.GetContext();\n" @@ -236,6 +237,41 @@ def wrapper_clblas(routines): result += "\n}\n" return result +# The wrapper to the reference CBLAS routines (for performance/correctness testing) +def wrapper_cblas(routines): + result = "" + for routine in routines: + result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames()) + for flavour in routine.flavours: + indent = " "*(10 + routine.Length()) + result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" + arguments = routine.ArgumentsWrapperC(flavour) + + # Double-precision scalars + for scalar in routine.scalars: + if flavour.IsComplex(scalar): + result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" + + # Special case for scalar outputs + assignment = "" + postfix = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.ScalarBuffersFirst(): + if flavour in [C,Z]: + postfix += "_sub" + indent += " " + extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" + else: + assignment = output_buffer+"_buffer["+output_buffer+"_offset] = " + indent += " "*len(assignment) + + result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" + result += (",\n"+indent).join([a for a in arguments]) + result += extra_argument+");" + result += "\n}\n" + return result + # ================================================================================================== # Checks for the number of command-line arguments @@ -251,9 +287,10 @@ files = [ path_clblast+"/include/clblast_c.h", path_clblast+"/src/clblast_c.cc", path_clblast+"/test/wrapper_clblas.h", + path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 65, 93, 22, 22] -footer_lines = [6, 3, 9, 2, 6] +header_lines = [84, 65, 93, 22, 22, 31] +footer_lines = [6, 3, 9, 2, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise for f in files: @@ -287,6 +324,8 @@ for i in xrange(0,len(files)): body += clblast_c_cc(routines[level-1]) if i == 4: body += wrapper_clblas(routines[level-1]) + if i == 5: + body += wrapper_cblas(routines[level-1]) f.write("".join(file_header)) f.write(body) f.write("".join(file_footer)) diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index 02040583..fffa19f6 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -28,7 +28,7 @@ def OptionToCLBlast(x): }[x] # As above, but for clBLAS data-types -def OptionToWrapper(x): +def OptionToWrapperCL(x): return { 'layout': "clblasOrder", 'a_transpose': "clblasTranspose", @@ -39,6 +39,18 @@ def OptionToWrapper(x): 'diagonal': "clblasDiag", }[x] +# As above, but for CBLAS data-types +def OptionToWrapperC(x): + return { + 'layout': "CBLAS_ORDER", + 'a_transpose': "CBLAS_TRANSPOSE", + 'b_transpose': "CBLAS_TRANSPOSE", + 'ab_transpose': "CBLAS_TRANSPOSE", + 'side': "CBLAS_SIDE", + 'triangle': "CBLAS_UPLO", + 'diagonal': "CBLAS_DIAG", + }[x] + # ================================================================================================== # Class holding routine-specific information (e.g. name, which arguments, which precisions) @@ -119,6 +131,16 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but as vectors + def BufferDefVector(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"] + b = ["const size_t "+name+"_offset"] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] + return [", ".join(a+b+c)] + return [] + # As above but with Claduc buffers def BufferCladuc(self, name): if (name in self.inputs) or (name in self.outputs): @@ -129,7 +151,7 @@ class Routine(): return [] # As above but with a static cast for clBLAS wrapper - def BufferWrapper(self, name): + def BufferWrapperCL(self, name): if (name in self.inputs) or (name in self.outputs): a = [name+"_buffer"] b = [name+"_offset"] @@ -141,6 +163,24 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but with a static cast for CBLAS wrapper + def BufferWrapperC(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + if name == "sy1": + a = [name+"_buffer["+name+"_offset]"] + elif flavour.precision_name in ["C","Z"]: + a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"] + else: + a = ["&"+name+"_buffer["+name+"_offset]"] + c = [] + if (name in ["x","y"]): + c = ["static_cast("+name+"_"+self.Postfix(name)+")"] + elif (name in ["a","b","c"]): + c = [name+"_"+self.Postfix(name)] + return [", ".join(a+c)] + return [] + # As above, but only data-types def BufferType(self, name): prefix = "const " if (name in self.inputs) else "" @@ -179,6 +219,14 @@ class Routine(): return [name] return [] + # Retrieves the use of a scalar for CBLAS (alpha/beta) + def ScalarUseWrapperC(self, name, flavour): + if name in self.scalars: + if flavour.IsComplex(name): + return [name+"_array.data()"] + return [name] + return [] + # Retrieves the definition of a scalar (alpha/beta) def ScalarDef(self, name, flavour): if name in self.scalars: @@ -246,9 +294,16 @@ class Routine(): return [] # As above, but now using clBLAS data-types - def OptionsDefWrapper(self): + def OptionsDefWrapperCL(self): + if self.options: + definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options] + return [", ".join(definitions)] + return [] + + # As above, but now using CBLAS data-types + def OptionsDefWrapperC(self): if self.options: - definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options] + definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options] return [", ".join(definitions)] return [] @@ -284,16 +339,26 @@ class Routine(): list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()]))) # As above, but for the clBLAS wrapper - def ArgumentsWrapper(self, flavour): + def ArgumentsWrapperCL(self, flavour): return (self.Options() + self.Sizes() + - list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) + self.ScalarUseWrapper("alpha", flavour) + - list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) + self.ScalarUseWrapper("beta", flavour) + - list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()]))) + # As above, but for the CBLAS wrapper + def ArgumentsWrapperC(self, flavour): + return (self.Options() + self.Sizes() + + self.ScalarUseWrapperC("alpha", flavour) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) + + self.ScalarUseWrapperC("beta", flavour) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()]))) + # Retrieves a combination of all the argument definitions def ArgumentsDef(self, flavour): return (self.OptionsDef() + self.SizesDef() + @@ -306,8 +371,8 @@ class Routine(): list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()]))) # As above, but clBLAS wrapper plain datatypes - def ArgumentsDefWrapper(self, flavour): - return (self.OptionsDefWrapper() + self.SizesDef() + + def ArgumentsDefWrapperCL(self, flavour): + return (self.OptionsDefWrapperCL() + self.SizesDef() + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + self.ScalarDefPlain("alpha", flavour) + list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + @@ -315,6 +380,17 @@ class Routine(): list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) + + # As above, but CBLAS wrapper plain datatypes + def ArgumentsDefWrapperC(self, flavour): + return (self.OptionsDefWrapperC() + self.SizesDef() + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) + + self.ScalarDefPlain("alpha", flavour) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) + + self.ScalarDefPlain("beta", flavour) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) # Retrieves a combination of all the argument types def ArgumentsType(self, flavour): @@ -356,7 +432,7 @@ class Routine(): return result # As above, but now for the clBLAS wrapper - def RoutineHeaderWrapper(self, flavour, def_only, spaces): + def RoutineHeaderWrapperCL(self, flavour, def_only, spaces): template = "<"+flavour.template+">" if self.NoScalars() and not def_only else "" indent = " "*(spaces + self.Length() + len(template)) result = "" @@ -366,9 +442,16 @@ class Routine(): result += flavour.name result += ">\n" result += "clblasStatus clblasX"+self.name+template+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)]) + result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)]) result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues" result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" return result + # As above, but now for the CBLAS wrapper + def RoutineHeaderWrapperC(self, flavour, def_only, spaces): + indent = " "*(spaces + self.Length()) + result = "void cblasX"+self.name+"(" + result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")" + return result + # ================================================================================================== diff --git a/src/clblast.cc b/src/clblast.cc index fc50ffae..75893ee9 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -93,7 +93,7 @@ template StatusCode Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; @@ -101,13 +101,13 @@ StatusCode Rotmg(cl_mem, const size_t, template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t, cl_mem, const size_t, cl_mem, const size_t, - cl_mem, const size_t, + const cl_mem, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 6d10c686..23e97bd5 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -55,7 +55,7 @@ StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset, StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rotmg(sd1_buffer, sd1_offset, @@ -69,7 +69,7 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset, StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_command_queue* queue, cl_event* event) { auto status = clblast::Rotmg(sd1_buffer, sd1_offset, diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h new file mode 100644 index 00000000..c690a45c --- /dev/null +++ b/test/wrapper_cblas.h @@ -0,0 +1,1667 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file implements a wrapper around a CPU BLAS library, such that its routines can be called +// in a similar way as the CLBlast routines: using alpha and beta to determine the precision. +// +// ================================================================================================= + +#ifndef CLBLAST_TEST_WRAPPER_CBLAS_H_ +#define CLBLAST_TEST_WRAPPER_CBLAS_H_ + +#include + +#include "internal/utilities.h" + +namespace clblast { + +// OpenBLAS is not fully Netlib CBLAS compatible +#ifdef OPENBLAS_VERSION + using return_pointer_float = openblas_complex_float*; + using return_pointer_double = openblas_complex_double*; +#else + using return_pointer_float = void*; + using return_pointer_double = void*; +#endif + +// ================================================================================================= +// BLAS level-1 (vector-vector) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SROTG/DROTG +void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, + std::vector& sb_buffer, const size_t sb_offset, + std::vector& sc_buffer, const size_t sc_offset, + std::vector& ss_buffer, const size_t ss_offset) { + cblas_srotg(&sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); +} +void cblasXrotg(std::vector& sa_buffer, const size_t sa_offset, + std::vector& sb_buffer, const size_t sb_offset, + std::vector& sc_buffer, const size_t sc_offset, + std::vector& ss_buffer, const size_t ss_offset) { + cblas_drotg(&sa_buffer[sa_offset], + &sb_buffer[sb_offset], + &sc_buffer[sc_offset], + &ss_buffer[ss_offset]); +} + +// Forwards the Netlib BLAS calls for SROTMG/DROTMG +void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, + std::vector& sd2_buffer, const size_t sd2_offset, + std::vector& sx1_buffer, const size_t sx1_offset, + const std::vector& sy1_buffer, const size_t sy1_offset, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_srotmg(&sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); +} +void cblasXrotmg(std::vector& sd1_buffer, const size_t sd1_offset, + std::vector& sd2_buffer, const size_t sd2_offset, + std::vector& sx1_buffer, const size_t sx1_offset, + const std::vector& sy1_buffer, const size_t sy1_offset, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_drotmg(&sd1_buffer[sd1_offset], + &sd2_buffer[sd2_offset], + &sx1_buffer[sx1_offset], + sy1_buffer[sy1_offset], + &sparam_buffer[sparam_offset]); +} + +// Forwards the Netlib BLAS calls for SROT/DROT +void cblasXrot(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + const float cos, + const float sin) { + cblas_srot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + cos, + sin); +} +void cblasXrot(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + const double cos, + const double sin) { + cblas_drot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + cos, + sin); +} + +// Forwards the Netlib BLAS calls for SROTM/DROTM +void cblasXrotm(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_srotm(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); +} +void cblasXrotm(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& sparam_buffer, const size_t sparam_offset) { + cblas_drotm(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &sparam_buffer[sparam_offset]); +} + +// Forwards the Netlib BLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sswap(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dswap(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cswap(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXswap(const size_t n, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zswap(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL +void cblasXscal(const size_t n, + const float alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_sscal(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const double alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dscal(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const float2 alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cscal(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXscal(const size_t n, + const double2 alpha, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zscal(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_scopy(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dcopy(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ccopy(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXcopy(const size_t n, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zcopy(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY +void cblasXaxpy(const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_saxpy(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_daxpy(n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_caxpy(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXaxpy(const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zaxpy(n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SDOT/DDOT +void cblasXdot(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + dot_buffer[dot_offset] = cblas_sdot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXdot(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + dot_buffer[dot_offset] = cblas_ddot(n, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CDOTU/ZDOTU +void cblasXdotu(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cdotu_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} +void cblasXdotu(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zdotu_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} + +// Forwards the Netlib BLAS calls for CDOTC/ZDOTC +void cblasXdotc(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_cdotc_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} +void cblasXdotc(const size_t n, + std::vector& dot_buffer, const size_t dot_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_zdotc_sub(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&dot_buffer[dot_offset])); +} + +// Forwards the Netlib BLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_snrm2(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_dnrm2(n, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_scnrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXnrm2(const size_t n, + std::vector& nrm2_buffer, const size_t nrm2_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + nrm2_buffer[nrm2_offset] = cblas_dznrm2(n, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// ================================================================================================= +// BLAS level-2 (matrix-vector) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sgemv(layout, a_transpose, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dgemv(layout, a_transpose, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgemv(layout, a_transpose, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgemv(layout, a_transpose, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dgbmv(layout, a_transpose, + m, n, kl, ku, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgbmv(layout, a_transpose, + m, n, kl, ku, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, + const size_t m, const size_t n, const size_t kl, const size_t ku, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgbmv(layout, a_transpose, + m, n, kl, ku, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHEMV/ZHEMV +void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chemv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhemv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHBMV/ZHBMV +void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chbmv(layout, triangle, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhbmv(layout, triangle, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for CHPMV/ZHPMV +void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chpmv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} +void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double2 beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhpmv(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + beta_array.data(), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSYMV/DSYMV +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ssymv(layout, triangle, + n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dsymv(layout, triangle, + n, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSBMV/DSBMV +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_ssbmv(layout, triangle, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dsbmv(layout, triangle, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for SSPMV/DSPMV +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const float beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_sspmv(layout, triangle, + n, + alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} +void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& ap_buffer, const size_t ap_offset, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const double beta, + std::vector& y_buffer, const size_t y_offset, const size_t y_inc) { + cblas_dspmv(layout, triangle, + n, + alpha, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc), + beta, + &y_buffer[y_offset], static_cast(y_inc)); +} + +// Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_strmv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtrmv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctrmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztrmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stbmv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtbmv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctbmv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztbmv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stpmv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtpmv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctpmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztpmv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_strsv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtrsv(layout, triangle, a_transpose, diagonal, + n, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctrsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztrsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STBSV/DTBSV/CTBSV/ZTBSV +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stbsv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtbsv(layout, triangle, a_transpose, diagonal, + n, k, + &a_buffer[a_offset], a_ld, + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctbsv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, const size_t k, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztbsv(layout, triangle, a_transpose, diagonal, + n, k, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for STPSV/DTPSV/CTPSV/ZTPSV +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_stpsv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_dtpsv(layout, triangle, a_transpose, diagonal, + n, + &ap_buffer[ap_offset], + &x_buffer[x_offset], static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ctpsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} +void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t n, + const std::vector& ap_buffer, const size_t ap_offset, + std::vector& x_buffer, const size_t x_offset, const size_t x_inc) { + cblas_ztpsv(layout, triangle, a_transpose, diagonal, + n, + reinterpret_cast(&ap_buffer[ap_offset]), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc)); +} + +// Forwards the Netlib BLAS calls for SGER/DGER +void cblasXger(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_sger(layout, + m, n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXger(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dger(layout, + m, n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for CGERU/ZGERU +void cblasXgeru(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cgeru(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXgeru(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zgeru(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CGERC/ZGERC +void cblasXgerc(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cgerc(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXgerc(const CBLAS_ORDER layout, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zgerc(layout, + m, n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHER/ZHER +void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_cher(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_zher(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHPR/ZHPR +void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_chpr(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} +void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_zhpr(layout, triangle, + n, + alpha, + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} + +// Forwards the Netlib BLAS calls for CHER2/ZHER2 +void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cher2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} +void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zher2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&a_buffer[a_offset]), a_ld); +} + +// Forwards the Netlib BLAS calls for CHPR2/ZHPR2 +void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_chpr2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} +void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double2 alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zhpr2(layout, triangle, + n, + alpha_array.data(), + reinterpret_cast(&x_buffer[x_offset]), static_cast(x_inc), + reinterpret_cast(&y_buffer[y_offset]), static_cast(y_inc), + reinterpret_cast(&ap_buffer[ap_offset])); +} + +// Forwards the Netlib BLAS calls for SSYR/DSYR +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_ssyr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dsyr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for SSPR/DSPR +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_sspr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); +} +void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_dspr(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &ap_buffer[ap_offset]); +} + +// Forwards the Netlib BLAS calls for SSYR2/DSYR2 +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_ssyr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} +void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& a_buffer, const size_t a_offset, const size_t a_ld) { + cblas_dsyr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &a_buffer[a_offset], a_ld); +} + +// Forwards the Netlib BLAS calls for SSPR2/DSPR2 +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const float alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_sspr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); +} +void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, + const size_t n, + const double alpha, + const std::vector& x_buffer, const size_t x_offset, const size_t x_inc, + const std::vector& y_buffer, const size_t y_offset, const size_t y_inc, + std::vector& ap_buffer, const size_t ap_offset) { + cblas_dspr2(layout, triangle, + n, + alpha, + &x_buffer[x_offset], static_cast(x_inc), + &y_buffer[y_offset], static_cast(y_inc), + &ap_buffer[ap_offset]); +} + +// ================================================================================================= +// BLAS level-3 (matrix-matrix) routines +// ================================================================================================= + +// Forwards the Netlib BLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_sgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_cgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose, + const size_t m, const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zgemm(layout, a_transpose, b_transpose, + m, n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssymm(layout, side, triangle, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsymm(layout, side, triangle, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csymm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsymm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHEMM/ZHEMM +void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_chemm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zhemm(layout, side, triangle, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssyrk(layout, triangle, a_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsyrk(layout, triangle, a_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csyrk(layout, triangle, a_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsyrk(layout, triangle, a_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHERK/ZHERK +void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_cherk(layout, triangle, a_transpose, + n, k, + alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_zherk(layout, triangle, a_transpose, + n, k, + alpha, + reinterpret_cast(&a_buffer[a_offset]), a_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_ssyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + cblas_dsyr2k(layout, triangle, ab_transpose, + n, k, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld, + beta, + &c_buffer[c_offset], c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_csyr2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double2 beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + const auto beta_array = std::vector{beta.real(), beta.imag()}; + cblas_zsyr2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta_array.data(), + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for CHER2K/ZHER2K +void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const float beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_cher2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} +void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose, + const size_t n, const size_t k, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + const std::vector& b_buffer, const size_t b_offset, const size_t b_ld, + const double beta, + std::vector& c_buffer, const size_t c_offset, const size_t c_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_zher2k(layout, triangle, ab_transpose, + n, k, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld, + beta, + reinterpret_cast(&c_buffer[c_offset]), c_ld); +} + +// Forwards the Netlib BLAS calls for STRMM/DTRMM/CTRMM/ZTRMM +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_strmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_dtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ctrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} +void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ztrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} + +// Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_strsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + cblas_dtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha, + &a_buffer[a_offset], a_ld, + &b_buffer[b_offset], b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const float2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ctrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} +void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal, + const size_t m, const size_t n, + const double2 alpha, + const std::vector& a_buffer, const size_t a_offset, const size_t a_ld, + std::vector& b_buffer, const size_t b_offset, const size_t b_ld) { + const auto alpha_array = std::vector{alpha.real(), alpha.imag()}; + cblas_ztrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + alpha_array.data(), + reinterpret_cast(&a_buffer[a_offset]), a_ld, + reinterpret_cast(&b_buffer[b_offset]), b_ld); +} + +// ================================================================================================= +} // namespace clblast + +// CLBLAST_TEST_WRAPPER_CBLAS_H_ +#endif diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index fb6e83aa..89b708b8 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -65,7 +65,7 @@ template clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); @@ -73,7 +73,7 @@ template <> clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { @@ -88,7 +88,7 @@ template <> clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, cl_mem sd2_buffer, const size_t sd2_offset, cl_mem sx1_buffer, const size_t sx1_offset, - cl_mem sy1_buffer, const size_t sy1_offset, + const cl_mem sy1_buffer, const size_t sy1_offset, cl_mem sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { -- cgit v1.2.3