Added a wrapper for CBLAS libraries for performance/correctness testing

author: cnugteren <web@cedricnugteren.nl> 2016-04-01 22:36:39 -0700
committer: cnugteren <web@cedricnugteren.nl> 2016-04-01 22:36:39 -0700
commit: 5c83217cf256984573924e8f89c46f393a5fcfcd (patch)
tree: b260ec46e10e12ff63d465212652523c3cfa7bc3
parent: a2056f2216526989f423a74e4bcd016dac9424f4 (diff)
9 files changed, 1825 insertions, 31 deletions
diff --git a/include/clblast.h b/include/clblast.h
index 5e5c5a46..431f2510 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -100,7 +100,7 @@ template <typename T>
 StatusCode Rotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                  cl_mem sd2_buffer, const size_t sd2_offset,
                  cl_mem sx1_buffer, const size_t sx1_offset,
-                 cl_mem sy1_buffer, const size_t sy1_offset,
+                 const cl_mem sy1_buffer, const size_t sy1_offset,
                  cl_mem sparam_buffer, const size_t sparam_offset,
                  cl_command_queue* queue, cl_event* event = nullptr);
 
diff --git a/include/clblast_c.h b/include/clblast_c.h
index dcb3ae3a..f72cff3a 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -112,13 +112,13 @@ StatusCode PUBLIC_API CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
 StatusCode PUBLIC_API CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                                     cl_mem sd2_buffer, const size_t sd2_offset,
                                     cl_mem sx1_buffer, const size_t sx1_offset,
-                                    cl_mem sy1_buffer, const size_t sy1_offset,
+                                    const cl_mem sy1_buffer, const size_t sy1_offset,
                                     cl_mem sparam_buffer, const size_t sparam_offset,
                                     cl_command_queue* queue, cl_event* event);
 StatusCode PUBLIC_API CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                                     cl_mem sd2_buffer, const size_t sd2_offset,
                                     cl_mem sx1_buffer, const size_t sx1_offset,
-                                    cl_mem sy1_buffer, const size_t sy1_offset,
+                                    const cl_mem sy1_buffer, const size_t sy1_offset,
                                     cl_mem sparam_buffer, const size_t sparam_offset,
                                     cl_command_queue* queue, cl_event* event);
 
diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py
index 9323bc4d..5a58ab53 100644
--- a/scripts/generator/datatype.py
+++ b/scripts/generator/datatype.py
@@ -58,5 +58,10 @@ class DataType():
 			return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp
 		return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp
 
+	# Returns whether the given scalar ("alpha" or "beta") is of a complex type (FLT2 or DBL2)
+	def IsComplex(self, scalar):
+		return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or
+		        (scalar == "beta" and self.beta_cpp in [FLT2, DBL2]))
+
 
 # ==================================================================================================
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 6e2b2ed2..36a9bf40 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -8,12 +8,13 @@
 #   Cedric Nugteren <www.cedricnugteren.nl>
 #
 # This script automatically generates the bodies of the following files, creating the full CLBlast
-# API interface and implementation (C, C++, and clBLAS wrapper):
+# API interface and implementation (C, C++, and reference BLAS wrappers):
 #    clblast.h
 #    clblast.cc
 #    clblast_c.h
 #    clblast_c.cc
 #    wrapper_clblas.h
+#    wrapper_cblas.h
 # It also generates the main functions for the correctness and performance tests as found in
 #    test/correctness/routines/levelX/xYYYY.cc
 #    test/performance/routines/levelX/xYYYY.cc
@@ -55,7 +56,7 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")
 routines = [
 [ # Level 1: vector-vector
   Routine(False, "1", "rotg",  T,  [S,D],     [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation"),
-  Routine(False, "1", "rotmg", T,  [S,D],     [], [], [], ["sd1","sd2","sx1","sy1","sparam"], [], "", "Generate modified givens plane rotation"),
+  Routine(False, "1", "rotmg", T,  [S,D],     [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation"),
   Routine(False, "1", "rot",   T,  [S,D],     ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation"),
   Routine(False, "1", "rotm",  T,  [S,D],     ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation"),
   Routine(True,  "1", "swap",  T,  [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors"),
@@ -220,11 +221,11 @@ def wrapper_clblas(routines):
 	for routine in routines:
 		result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
 		if routine.NoScalars():
-			result += routine.RoutineHeaderWrapper(routine.template, True, 21)+";\n"
+			result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
 		for flavour in routine.flavours:
 			indent = " "*(17 + routine.Length())
-			result += routine.RoutineHeaderWrapper(flavour, False, 21)+" {\n"
-			arguments = routine.ArgumentsWrapper(flavour)
+			result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
+			arguments = routine.ArgumentsWrapperCL(flavour)
 			if routine.scratch:
 				result += "  auto queue = Queue(queues[0]);\n"
 				result += "  auto context = queue.GetContext();\n"
@@ -236,6 +237,41 @@ def wrapper_clblas(routines):
 			result += "\n}\n"
 	return result
 
+# The wrapper to the reference CBLAS routines (for performance/correctness testing)
+def wrapper_cblas(routines):
+	result = ""
+	for routine in routines:
+		result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
+		for flavour in routine.flavours:
+			indent = " "*(10 + routine.Length())
+			result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
+			arguments = routine.ArgumentsWrapperC(flavour)
+
+			# Complex scalars are passed to CBLAS as a pointer to a {real, imag} array
+			for scalar in routine.scalars:
+				if flavour.IsComplex(scalar):
+					result += "  const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
+
+			# Special case for scalar outputs
+			assignment = ""
+			postfix = ""
+			extra_argument = ""
+			for output_buffer in routine.outputs:
+				if output_buffer in routine.ScalarBuffersFirst():
+					if flavour in [C,Z]:
+						postfix += "_sub"
+						indent += "    "
+						extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
+					else:
+						assignment = output_buffer+"_buffer["+output_buffer+"_offset] = "
+						indent += " "*len(assignment)
+
+			result += "  "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
+			result += (",\n"+indent).join([a for a in arguments])
+			result += extra_argument+");"
+			result += "\n}\n"
+	return result
+
 # ==================================================================================================
 
 # Checks for the number of command-line arguments
@@ -251,9 +287,10 @@ files = [
   path_clblast+"/include/clblast_c.h",
   path_clblast+"/src/clblast_c.cc",
   path_clblast+"/test/wrapper_clblas.h",
+  path_clblast+"/test/wrapper_cblas.h",
 ]
-header_lines = [84, 65, 93, 22, 22]
-footer_lines = [6, 3, 9, 2, 6]
+header_lines = [84, 65, 93, 22, 22, 31]
+footer_lines = [6, 3, 9, 2, 6, 6]
 
 # Checks whether the command-line arguments are valid; exists otherwise
 for f in files:
@@ -287,6 +324,8 @@ for i in xrange(0,len(files)):
 				body += clblast_c_cc(routines[level-1])
 			if i == 4:
 				body += wrapper_clblas(routines[level-1])
+			if i == 5:
+				body += wrapper_cblas(routines[level-1])
 		f.write("".join(file_header))
 		f.write(body)
 		f.write("".join(file_footer))
diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py
index 02040583..fffa19f6 100644
--- a/scripts/generator/routine.py
+++ b/scripts/generator/routine.py
@@ -28,7 +28,7 @@ def OptionToCLBlast(x):
 	}[x]
 
 # As above, but for clBLAS data-types
-def OptionToWrapper(x):
+def OptionToWrapperCL(x):
 	return {
 	    'layout': "clblasOrder",
 	    'a_transpose': "clblasTranspose",
@@ -39,6 +39,18 @@ def OptionToWrapper(x):
 	    'diagonal': "clblasDiag",
 	}[x]
 
+# As above, but for CBLAS data-types
+def OptionToWrapperC(x):
+	return {
+	    'layout': "CBLAS_ORDER",
+	    'a_transpose': "CBLAS_TRANSPOSE",
+	    'b_transpose': "CBLAS_TRANSPOSE",
+	    'ab_transpose': "CBLAS_TRANSPOSE",
+	    'side': "CBLAS_SIDE",
+	    'triangle': "CBLAS_UPLO",
+	    'diagonal': "CBLAS_DIAG",
+	}[x]
+
 # ==================================================================================================
 
 # Class holding routine-specific information (e.g. name, which arguments, which precisions)
@@ -119,6 +131,16 @@ class Routine():
 			return [", ".join(a+b+c)]
 		return []
 
+	# As above but with the buffers as (const) std::vector references
+	def BufferDefVector(self, name, flavour):
+		prefix = "const " if (name in self.inputs) else ""
+		if (name in self.inputs) or (name in self.outputs):
+			a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"]
+			b = ["const size_t "+name+"_offset"]
+			c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
+			return [", ".join(a+b+c)]
+		return []
+
 	# As above but with Claduc buffers
 	def BufferCladuc(self, name):
 		if (name in self.inputs) or (name in self.outputs):
@@ -129,7 +151,7 @@ class Routine():
 		return []
 
 	# As above but with a static cast for clBLAS wrapper
-	def BufferWrapper(self, name):
+	def BufferWrapperCL(self, name):
 		if (name in self.inputs) or (name in self.outputs):
 			a = [name+"_buffer"]
 			b = [name+"_offset"]
@@ -141,6 +163,24 @@ class Routine():
 			return [", ".join(a+b+c)]
 		return []
 
+	# As above but as element pointers for the CBLAS wrapper (reinterpret cast for complex data)
+	def BufferWrapperC(self, name, flavour):
+		prefix = "const " if (name in self.inputs) else ""
+		if (name in self.inputs) or (name in self.outputs):
+			if name == "sy1":
+				a = [name+"_buffer["+name+"_offset]"]
+			elif flavour.precision_name in ["C","Z"]:
+				a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"]
+			else:
+				a = ["&"+name+"_buffer["+name+"_offset]"]
+			c = []
+			if (name in ["x","y"]):
+				c = ["static_cast<int>("+name+"_"+self.Postfix(name)+")"]
+			elif (name in ["a","b","c"]):
+				c = [name+"_"+self.Postfix(name)]
+			return [", ".join(a+c)]
+		return []
+
 	# As above, but only data-types
 	def BufferType(self, name):
 		prefix = "const " if (name in self.inputs) else ""
@@ -179,6 +219,14 @@ class Routine():
 			return [name]
 		return []
 
+	# Retrieves the use of a scalar for CBLAS (alpha/beta)
+	def ScalarUseWrapperC(self, name, flavour):
+		if name in self.scalars:
+			if flavour.IsComplex(name):
+				return [name+"_array.data()"]
+			return [name]
+		return []
+
 	# Retrieves the definition of a scalar (alpha/beta)
 	def ScalarDef(self, name, flavour):
 		if name in self.scalars:
@@ -246,9 +294,16 @@ class Routine():
 		return []
 
 	# As above, but now using clBLAS data-types
-	def OptionsDefWrapper(self):
+	def OptionsDefWrapperCL(self):
+		if self.options:
+			definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options]
+			return [", ".join(definitions)]
+		return []
+
+	# As above, but now using CBLAS data-types
+	def OptionsDefWrapperC(self):
 		if self.options:
-			definitions = ["const "+OptionToWrapper(o)+" "+o for o in self.options]
+			definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options]
 			return [", ".join(definitions)]
 		return []
 
@@ -284,16 +339,26 @@ class Routine():
 		        list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()])))
 
 	# As above, but for the clBLAS wrapper
-	def ArgumentsWrapper(self, flavour):
+	def ArgumentsWrapperCL(self, flavour):
 		return (self.Options() + self.Sizes() +
-		        list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersFirst()])) +
+		        list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) +
 		        self.ScalarUseWrapper("alpha", flavour) +
-		        list(chain(*[self.BufferWrapper(b) for b in self.BuffersFirst()])) +
+		        list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) +
 		        self.ScalarUseWrapper("beta", flavour) +
-		        list(chain(*[self.BufferWrapper(b) for b in self.BuffersSecond()])) +
-		        list(chain(*[self.BufferWrapper(b) for b in self.ScalarBuffersSecond()])) +
+		        list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) +
+		        list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) +
 		        list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()])))
 
+	# As above, but for the CBLAS wrapper
+	def ArgumentsWrapperC(self, flavour):
+		return (self.Options() + self.Sizes() +
+		        self.ScalarUseWrapperC("alpha", flavour) +
+		        list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) +
+		        self.ScalarUseWrapperC("beta", flavour) +
+		        list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) +
+		        list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) +
+		        list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()])))
+
 	# Retrieves a combination of all the argument definitions
 	def ArgumentsDef(self, flavour):
 		return (self.OptionsDef() + self.SizesDef() +
@@ -306,8 +371,8 @@ class Routine():
 		        list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()])))
 
 	# As above, but clBLAS wrapper plain datatypes
-	def ArgumentsDefWrapper(self, flavour):
-		return (self.OptionsDefWrapper() + self.SizesDef() +
+	def ArgumentsDefWrapperCL(self, flavour):
+		return (self.OptionsDefWrapperCL() + self.SizesDef() +
 		        list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
 		        self.ScalarDefPlain("alpha", flavour) +
 		        list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
@@ -315,6 +380,17 @@ class Routine():
 		        list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
 		        list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
 		        list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
+
+	# As above, but CBLAS wrapper plain datatypes
+	def ArgumentsDefWrapperC(self, flavour):
+		return (self.OptionsDefWrapperC() + self.SizesDef() +
+		        list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) +
+		        self.ScalarDefPlain("alpha", flavour) +
+		        list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) +
+		        self.ScalarDefPlain("beta", flavour) +
+		        list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersSecond()])) +
+		        list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) +
+		        list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
 	
 	# Retrieves a combination of all the argument types
 	def ArgumentsType(self, flavour):
@@ -356,7 +432,7 @@ class Routine():
 		return result
 
 	# As above, but now for the clBLAS wrapper
-	def RoutineHeaderWrapper(self, flavour, def_only, spaces):
+	def RoutineHeaderWrapperCL(self, flavour, def_only, spaces):
 		template = "<"+flavour.template+">" if self.NoScalars() and not def_only else ""
 		indent = " "*(spaces + self.Length() + len(template))
 		result = ""
@@ -366,9 +442,16 @@ class Routine():
 				result += flavour.name
 			result += ">\n"
 		result += "clblasStatus clblasX"+self.name+template+"("
-		result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapper(flavour)])
+		result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)])
 		result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues"
 		result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)"
 		return result
 
+	# As above, but now for the CBLAS wrapper
+	def RoutineHeaderWrapperC(self, flavour, def_only, spaces):
+		indent = " "*(spaces + self.Length())
+		result = "void cblasX"+self.name+"("
+		result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")"
+		return result
+
 # ==================================================================================================
diff --git a/src/clblast.cc b/src/clblast.cc
index fc50ffae..75893ee9 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -93,7 +93,7 @@ template <typename T>
 StatusCode Rotmg(cl_mem, const size_t,
                  cl_mem, const size_t,
                  cl_mem, const size_t,
-                 cl_mem, const size_t,
+                 const cl_mem, const size_t,
                  cl_mem, const size_t,
                  cl_command_queue*, cl_event*) {
   return StatusCode::kNotImplemented;
@@ -101,13 +101,13 @@ StatusCode Rotmg(cl_mem, const size_t,
 template StatusCode PUBLIC_API Rotmg<float>(cl_mem, const size_t,
                                             cl_mem, const size_t,
                                             cl_mem, const size_t,
-                                            cl_mem, const size_t,
+                                            const cl_mem, const size_t,
                                             cl_mem, const size_t,
                                             cl_command_queue*, cl_event*);
 template StatusCode PUBLIC_API Rotmg<double>(cl_mem, const size_t,
                                              cl_mem, const size_t,
                                              cl_mem, const size_t,
-                                             cl_mem, const size_t,
+                                             const cl_mem, const size_t,
                                              cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);
 
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index 6d10c686..23e97bd5 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -55,7 +55,7 @@ StatusCode CLBlastDrotg(cl_mem sa_buffer, const size_t sa_offset,
 StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                          cl_mem sd2_buffer, const size_t sd2_offset,
                          cl_mem sx1_buffer, const size_t sx1_offset,
-                         cl_mem sy1_buffer, const size_t sy1_offset,
+                         const cl_mem sy1_buffer, const size_t sy1_offset,
                          cl_mem sparam_buffer, const size_t sparam_offset,
                          cl_command_queue* queue, cl_event* event) {
   auto status = clblast::Rotmg<float>(sd1_buffer, sd1_offset,
@@ -69,7 +69,7 @@ StatusCode CLBlastSrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
 StatusCode CLBlastDrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                          cl_mem sd2_buffer, const size_t sd2_offset,
                          cl_mem sx1_buffer, const size_t sx1_offset,
-                         cl_mem sy1_buffer, const size_t sy1_offset,
+                         const cl_mem sy1_buffer, const size_t sy1_offset,
                          cl_mem sparam_buffer, const size_t sparam_offset,
                          cl_command_queue* queue, cl_event* event) {
   auto status = clblast::Rotmg<double>(sd1_buffer, sd1_offset,
diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h
new file mode 100644
index 00000000..c690a45c
--- /dev/null
+++ b/test/wrapper_cblas.h
@@ -0,0 +1,1667 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a wrapper around a CPU BLAS library, such that its routines can be called
+// in a similar way as the CLBlast routines: using alpha and beta to determine the precision.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_WRAPPER_CBLAS_H_
+#define CLBLAST_TEST_WRAPPER_CBLAS_H_
+
+#include <cblas.h>
+
+#include "internal/utilities.h"
+
+namespace clblast {
+
+// OpenBLAS is not fully Netlib CBLAS compatible: it uses its own complex types for result pointers
+#ifdef OPENBLAS_VERSION
+  using return_pointer_float = openblas_complex_float*;
+  using return_pointer_double = openblas_complex_double*;
+#else
+  using return_pointer_float = void*;
+  using return_pointer_double = void*;
+#endif
+
+// =================================================================================================
+// BLAS level-1 (vector-vector) routines
+// =================================================================================================
+
+// Forwards the Netlib BLAS calls for SROTG/DROTG
+void cblasXrotg(std::vector<float>& sa_buffer, const size_t sa_offset,
+                std::vector<float>& sb_buffer, const size_t sb_offset,
+                std::vector<float>& sc_buffer, const size_t sc_offset,
+                std::vector<float>& ss_buffer, const size_t ss_offset) {
+  cblas_srotg(&sa_buffer[sa_offset],
+              &sb_buffer[sb_offset],
+              &sc_buffer[sc_offset],
+              &ss_buffer[ss_offset]);
+}
+void cblasXrotg(std::vector<double>& sa_buffer, const size_t sa_offset,
+                std::vector<double>& sb_buffer, const size_t sb_offset,
+                std::vector<double>& sc_buffer, const size_t sc_offset,
+                std::vector<double>& ss_buffer, const size_t ss_offset) {
+  cblas_drotg(&sa_buffer[sa_offset],
+              &sb_buffer[sb_offset],
+              &sc_buffer[sc_offset],
+              &ss_buffer[ss_offset]);
+}
+
+// Forwards the Netlib BLAS calls for SROTMG/DROTMG
+void cblasXrotmg(std::vector<float>& sd1_buffer, const size_t sd1_offset,
+                 std::vector<float>& sd2_buffer, const size_t sd2_offset,
+                 std::vector<float>& sx1_buffer, const size_t sx1_offset,
+                 const std::vector<float>& sy1_buffer, const size_t sy1_offset,
+                 std::vector<float>& sparam_buffer, const size_t sparam_offset) {
+  cblas_srotmg(&sd1_buffer[sd1_offset],
+               &sd2_buffer[sd2_offset],
+               &sx1_buffer[sx1_offset],
+               sy1_buffer[sy1_offset],
+               &sparam_buffer[sparam_offset]);
+}
+void cblasXrotmg(std::vector<double>& sd1_buffer, const size_t sd1_offset,
+                 std::vector<double>& sd2_buffer, const size_t sd2_offset,
+                 std::vector<double>& sx1_buffer, const size_t sx1_offset,
+                 const std::vector<double>& sy1_buffer, const size_t sy1_offset,
+                 std::vector<double>& sparam_buffer, const size_t sparam_offset) {
+  cblas_drotmg(&sd1_buffer[sd1_offset],
+               &sd2_buffer[sd2_offset],
+               &sx1_buffer[sx1_offset],
+               sy1_buffer[sy1_offset],
+               &sparam_buffer[sparam_offset]);
+}
+
+// Forwards the Netlib BLAS calls for SROT/DROT
+void cblasXrot(const size_t n,
+               std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+               const float cos,
+               const float sin) {
+  cblas_srot(n,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             cos,
+             sin);
+}
+void cblasXrot(const size_t n,
+               std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+               const double cos,
+               const double sin) {
+  cblas_drot(n,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             cos,
+             sin);
+}
+
+// Forwards the Netlib BLAS calls for SROTM/DROTM
+void cblasXrotm(const size_t n,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float>& sparam_buffer, const size_t sparam_offset) {
+  cblas_srotm(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &sparam_buffer[sparam_offset]);
+}
+void cblasXrotm(const size_t n,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double>& sparam_buffer, const size_t sparam_offset) {
+  cblas_drotm(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &sparam_buffer[sparam_offset]);
+}
+
+// Forwards the Netlib BLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP
+void cblasXswap(const size_t n,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sswap(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+void cblasXswap(const size_t n,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dswap(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+void cblasXswap(const size_t n,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_cswap(n,
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+void cblasXswap(const size_t n,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zswap(n,
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL
+void cblasXscal(const size_t n,
+                const float alpha,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_sscal(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+void cblasXscal(const size_t n,
+                const double alpha,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dscal(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+void cblasXscal(const size_t n,
+                const float2 alpha,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
+  cblas_cscal(n,
+              alpha_array.data(),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+void cblasXscal(const size_t n,
+                const double2 alpha,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  const auto alpha_array = std::vector<double>{alpha.real(), alpha.imag()};
+  cblas_zscal(n,
+              alpha_array.data(),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY
+void cblasXcopy(const size_t n,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_scopy(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+void cblasXcopy(const size_t n,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dcopy(n,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+void cblasXcopy(const size_t n,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_ccopy(n,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+void cblasXcopy(const size_t n,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zcopy(n,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY
+void cblasXaxpy(const size_t n,
+                const float alpha,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_saxpy(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+void cblasXaxpy(const size_t n,
+                const double alpha,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_daxpy(n,
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+void cblasXaxpy(const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const auto alpha_array = std::vector<float>{alpha.real(), alpha.imag()};
+  cblas_caxpy(n,
+              alpha_array.data(),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex overload: forwards to cblas_zaxpy. The complex buffers are
+// reinterpreted as arrays of double starting at the element at their offset. The scalar alpha is
+// passed as a two-element {real, imag} array kept on the stack, which avoids a heap allocation on
+// every call. The element count and increments are converted to 'int' explicitly.
+void cblasXaxpy(const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zaxpy(static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SDOT/DDOT
+// Single-precision overload: calls cblas_sdot on the x and y vectors (each starting at its offset)
+// and stores the returned scalar in dot_buffer[dot_offset]. The element count and increments are
+// converted to 'int' explicitly instead of relying on implicit size_t narrowing.
+void cblasXdot(const size_t n,
+               std::vector<float>& dot_buffer, const size_t dot_offset,
+               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  dot_buffer[dot_offset] = cblas_sdot(static_cast<int>(n),
+                                      &x_buffer[x_offset], static_cast<int>(x_inc),
+                                      &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision overload: calls cblas_ddot on the x and y vectors (each starting at its offset)
+// and stores the returned scalar in dot_buffer[dot_offset]. The element count and increments are
+// converted to 'int' explicitly instead of relying on implicit size_t narrowing.
+void cblasXdot(const size_t n,
+               std::vector<double>& dot_buffer, const size_t dot_offset,
+               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  dot_buffer[dot_offset] = cblas_ddot(static_cast<int>(n),
+                                      &x_buffer[x_offset], static_cast<int>(x_inc),
+                                      &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for CDOTU/ZDOTU
+// Single-precision complex overload: forwards to cblas_cdotu_sub. The library writes its result
+// through the pointer to dot_buffer[dot_offset] (cast to return_pointer_float) rather than
+// returning it. The element count and increments are converted to 'int' explicitly.
+void cblasXdotu(const size_t n,
+                std::vector<float2>& dot_buffer, const size_t dot_offset,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_cdotu_sub(static_cast<int>(n),
+                  reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_float>(&dot_buffer[dot_offset]));
+}
+// Double-precision complex overload: forwards to cblas_zdotu_sub. The library writes its result
+// through the pointer to dot_buffer[dot_offset] (cast to return_pointer_double) rather than
+// returning it. The element count and increments are converted to 'int' explicitly.
+void cblasXdotu(const size_t n,
+                std::vector<double2>& dot_buffer, const size_t dot_offset,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zdotu_sub(static_cast<int>(n),
+                  reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_double>(&dot_buffer[dot_offset]));
+}
+
+// Forwards the Netlib BLAS calls for CDOTC/ZDOTC
+// Single-precision complex overload: forwards to cblas_cdotc_sub. The library writes its result
+// through the pointer to dot_buffer[dot_offset] (cast to return_pointer_float) rather than
+// returning it. The element count and increments are converted to 'int' explicitly.
+void cblasXdotc(const size_t n,
+                std::vector<float2>& dot_buffer, const size_t dot_offset,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_cdotc_sub(static_cast<int>(n),
+                  reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_float>(&dot_buffer[dot_offset]));
+}
+// Double-precision complex overload: forwards to cblas_zdotc_sub. The library writes its result
+// through the pointer to dot_buffer[dot_offset] (cast to return_pointer_double) rather than
+// returning it. The element count and increments are converted to 'int' explicitly.
+void cblasXdotc(const size_t n,
+                std::vector<double2>& dot_buffer, const size_t dot_offset,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_zdotc_sub(static_cast<int>(n),
+                  reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+                  reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+                  reinterpret_cast<return_pointer_double>(&dot_buffer[dot_offset]));
+}
+
+// Forwards the Netlib BLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2
+// Single-precision real overload: calls cblas_snrm2 on the x vector (starting at x_offset) and
+// stores the returned value in nrm2_buffer[nrm2_offset]. The element count and increment are
+// converted to 'int' explicitly instead of relying on implicit size_t narrowing.
+void cblasXnrm2(const size_t n,
+                std::vector<float>& nrm2_buffer, const size_t nrm2_offset,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_snrm2(static_cast<int>(n),
+                                         &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision real overload: calls cblas_dnrm2 on the x vector (starting at x_offset) and
+// stores the returned value in nrm2_buffer[nrm2_offset]. The element count and increment are
+// converted to 'int' explicitly instead of relying on implicit size_t narrowing.
+void cblasXnrm2(const size_t n,
+                std::vector<double>& nrm2_buffer, const size_t nrm2_offset,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_dnrm2(static_cast<int>(n),
+                                         &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex overload: calls cblas_scnrm2 on the x vector (reinterpreted as floats,
+// starting at x_offset). The value it returns is assigned to the complex element
+// nrm2_buffer[nrm2_offset]. The element count and increment are converted to 'int' explicitly.
+void cblasXnrm2(const size_t n,
+                std::vector<float2>& nrm2_buffer, const size_t nrm2_offset,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_scnrm2(static_cast<int>(n),
+                                          reinterpret_cast<const float*>(&x_buffer[x_offset]),
+                                          static_cast<int>(x_inc));
+}
+// Double-precision complex overload: calls cblas_dznrm2 on the x vector (reinterpreted as doubles,
+// starting at x_offset). The value it returns is assigned to the complex element
+// nrm2_buffer[nrm2_offset]. The element count and increment are converted to 'int' explicitly.
+void cblasXnrm2(const size_t n,
+                std::vector<double2>& nrm2_buffer, const size_t nrm2_offset,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  nrm2_buffer[nrm2_offset] = cblas_dznrm2(static_cast<int>(n),
+                                          reinterpret_cast<const double*>(&x_buffer[x_offset]),
+                                          static_cast<int>(x_inc));
+}
+
+// =================================================================================================
+// BLAS level-2 (matrix-vector) routines
+// =================================================================================================
+
+// Forwards the Netlib BLAS calls for SGEMV/DGEMV/CGEMV/ZGEMV
+// Single-precision real overload: forwards to cblas_sgemv. The matrix and both vectors are handed
+// over as raw pointers to the element at their offset. The dimensions (m, n), the leading
+// dimension a_ld and both increments are converted to 'int' explicitly, so every size argument is
+// treated the same way and no implicit size_t narrowing is left to the compiler.
+void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sgemv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision real overload: forwards to cblas_dgemv. The matrix and both vectors are handed
+// over as raw pointers to the element at their offset. The dimensions (m, n), the leading
+// dimension a_ld and both increments are converted to 'int' explicitly, so every size argument is
+// treated the same way and no implicit size_t narrowing is left to the compiler.
+void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dgemv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Single-precision complex overload: forwards to cblas_cgemv. Complex buffers are reinterpreted as
+// arrays of float starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. Dimensions, leading dimension and increments are converted to 'int' explicitly.
+void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_cgemv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex overload: forwards to cblas_zgemv. Complex buffers are reinterpreted as
+// arrays of double starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. Dimensions, leading dimension and increments are converted to 'int' explicitly.
+void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zgemv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV
+// Single-precision real overload: forwards to cblas_sgbmv. The matrix and both vectors are handed
+// over as raw pointers to the element at their offset. The dimensions (m, n), the band widths
+// (kl, ku), the leading dimension a_ld and both increments are converted to 'int' explicitly, so
+// no implicit size_t narrowing is left to the compiler.
+void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sgbmv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              static_cast<int>(kl), static_cast<int>(ku),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision real overload: forwards to cblas_dgbmv. The matrix and both vectors are handed
+// over as raw pointers to the element at their offset. The dimensions (m, n), the band widths
+// (kl, ku), the leading dimension a_ld and both increments are converted to 'int' explicitly, so
+// no implicit size_t narrowing is left to the compiler.
+void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dgbmv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              static_cast<int>(kl), static_cast<int>(ku),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Single-precision complex overload: forwards to cblas_cgbmv. Complex buffers are reinterpreted as
+// arrays of float starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. Dimensions, band widths, leading dimension and increments are converted to 'int'
+// explicitly.
+void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_cgbmv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              static_cast<int>(kl), static_cast<int>(ku),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex overload: forwards to cblas_zgbmv. Complex buffers are reinterpreted as
+// arrays of double starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. Dimensions, band widths, leading dimension and increments are converted to 'int'
+// explicitly.
+void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zgbmv(layout, a_transpose,
+              static_cast<int>(m), static_cast<int>(n),
+              static_cast<int>(kl), static_cast<int>(ku),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for CHEMV/ZHEMV
+// Single-precision complex overload: forwards to cblas_chemv. Complex buffers are reinterpreted as
+// arrays of float starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. The dimension n, the leading dimension a_ld and both increments are converted to 'int'
+// explicitly.
+void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chemv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex overload: forwards to cblas_zhemv. Complex buffers are reinterpreted as
+// arrays of double starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. The dimension n, the leading dimension a_ld and both increments are converted to 'int'
+// explicitly.
+void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhemv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for CHBMV/ZHBMV
+// Single-precision complex overload: forwards to cblas_chbmv. Complex buffers are reinterpreted as
+// arrays of float starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. The sizes n and k, the leading dimension a_ld and both increments are converted to 'int'
+// explicitly.
+void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex overload: forwards to cblas_zhbmv. Complex buffers are reinterpreted as
+// arrays of double starting at the element at their offset. The scalars alpha and beta are passed
+// as two-element {real, imag} arrays kept on the stack, which avoids two heap allocations on every
+// call. The sizes n and k, the leading dimension a_ld and both increments are converted to 'int'
+// explicitly.
+void cblasXhbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for CHPMV/ZHPMV
+// Single-precision complex overload: forwards to cblas_chpmv. The ap/x/y buffers are reinterpreted
+// as arrays of float starting at the element at their offset; no leading dimension is passed for
+// the ap buffer. The scalars alpha and beta are passed as two-element {real, imag} arrays kept on
+// the stack, which avoids two heap allocations on every call. The dimension n and both increments
+// are converted to 'int' explicitly.
+void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& ap_buffer, const size_t ap_offset,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float2 beta,
+                std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chpmv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<float*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+// Double-precision complex overload: forwards to cblas_zhpmv. The ap/x/y buffers are reinterpreted
+// as arrays of double starting at the element at their offset; no leading dimension is passed for
+// the ap buffer. The scalars alpha and beta are passed as two-element {real, imag} arrays kept on
+// the stack, which avoids two heap allocations on every call. The dimension n and both increments
+// are converted to 'int' explicitly.
+void cblasXhpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& ap_buffer, const size_t ap_offset,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double2 beta,
+                std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhpmv(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              beta_array,
+              reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SSYMV/DSYMV
+// Single-precision overload: forwards to cblas_ssymv. The matrix and both vectors are handed over
+// as raw pointers to the element at their offset. The dimension n, the leading dimension a_ld and
+// both increments are converted to 'int' explicitly, so no implicit size_t narrowing is left to
+// the compiler.
+void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_ssymv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision overload: forwards to cblas_dsymv. The matrix and both vectors are handed over
+// as raw pointers to the element at their offset. The dimension n, the leading dimension a_ld and
+// both increments are converted to 'int' explicitly, so no implicit size_t narrowing is left to
+// the compiler.
+void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dsymv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SSBMV/DSBMV
+// Single-precision overload: forwards to cblas_ssbmv. The matrix and both vectors are handed over
+// as raw pointers to the element at their offset. The sizes n and k, the leading dimension a_ld
+// and both increments are converted to 'int' explicitly, so no implicit size_t narrowing is left
+// to the compiler.
+void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_ssbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision overload: forwards to cblas_dsbmv. The matrix and both vectors are handed over
+// as raw pointers to the element at their offset. The sizes n and k, the leading dimension a_ld
+// and both increments are converted to 'int' explicitly, so no implicit size_t narrowing is left
+// to the compiler.
+void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dsbmv(layout, triangle,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for SSPMV/DSPMV
+// Single-precision overload: forwards to cblas_sspmv. The ap buffer and both vectors are handed
+// over as raw pointers to the element at their offset; no leading dimension is passed for the ap
+// buffer. The dimension n and both increments are converted to 'int' explicitly, so no implicit
+// size_t narrowing is left to the compiler.
+void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& ap_buffer, const size_t ap_offset,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const float beta,
+                std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_sspmv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+// Double-precision overload: forwards to cblas_dspmv. The ap buffer and both vectors are handed
+// over as raw pointers to the element at their offset; no leading dimension is passed for the ap
+// buffer. The dimension n and both increments are converted to 'int' explicitly, so no implicit
+// size_t narrowing is left to the compiler.
+void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& ap_buffer, const size_t ap_offset,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const double beta,
+                std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  cblas_dspmv(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              beta,
+              &y_buffer[y_offset], static_cast<int>(y_inc));
+}
+
+// Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV
+// Single-precision real overload: forwards to cblas_strmv. The x vector is passed through a
+// non-const pointer, so the library may modify it. The dimension n, the leading dimension a_ld
+// and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_strmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision real overload: forwards to cblas_dtrmv. The x vector is passed through a
+// non-const pointer, so the library may modify it. The dimension n, the leading dimension a_ld
+// and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtrmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex overload: forwards to cblas_ctrmv. Both buffers are reinterpreted as
+// arrays of float starting at the element at their offset; x is passed through a non-const
+// pointer, so the library may modify it. The dimension n, the leading dimension a_ld and the
+// increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctrmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex overload: forwards to cblas_ztrmv. Both buffers are reinterpreted as
+// arrays of double starting at the element at their offset; x is passed through a non-const
+// pointer, so the library may modify it. The dimension n, the leading dimension a_ld and the
+// increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztrmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV
+// Single-precision real overload: forwards to cblas_stbmv. The x vector is passed through a
+// non-const pointer, so the library may modify it. The sizes n and k, the leading dimension a_ld
+// and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision real overload: forwards to cblas_dtbmv. The x vector is passed through a
+// non-const pointer, so the library may modify it. The sizes n and k, the leading dimension a_ld
+// and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex overload: forwards to cblas_ctbmv. Both buffers are reinterpreted as
+// arrays of float starting at the element at their offset; x is passed through a non-const
+// pointer, so the library may modify it. The sizes n and k, the leading dimension a_ld and the
+// increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex overload: forwards to cblas_ztbmv. Both buffers are reinterpreted as
+// arrays of double starting at the element at their offset; x is passed through a non-const
+// pointer, so the library may modify it. The sizes n and k, the leading dimension a_ld and the
+// increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztbmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV
+// Single-precision real overload: forwards to cblas_stpmv. No leading dimension is passed for the
+// ap buffer. The x vector is passed through a non-const pointer, so the library may modify it.
+// The dimension n and the increment are converted to 'int' explicitly rather than narrowed
+// implicitly.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& ap_buffer, const size_t ap_offset,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision real overload: forwards to cblas_dtpmv. No leading dimension is passed for the
+// ap buffer. The x vector is passed through a non-const pointer, so the library may modify it.
+// The dimension n and the increment are converted to 'int' explicitly rather than narrowed
+// implicitly.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& ap_buffer, const size_t ap_offset,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex overload: forwards to cblas_ctpmv. Both buffers are reinterpreted as
+// arrays of float starting at the element at their offset; no leading dimension is passed for the
+// ap buffer, and x is passed through a non-const pointer, so the library may modify it. The
+// dimension n and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& ap_buffer, const size_t ap_offset,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// Double-precision complex overload: forwards to cblas_ztpmv. Both buffers are reinterpreted as
+// arrays of double starting at the element at their offset; no leading dimension is passed for the
+// ap buffer, and x is passed through a non-const pointer, so the library may modify it. The
+// dimension n and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& ap_buffer, const size_t ap_offset,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztpmv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
+// Single-precision real overload: forwards to cblas_strsv. The x vector is passed through a
+// non-const pointer, so the library may modify it. The dimension n, the leading dimension a_ld
+// and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_strsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Double-precision real overload: forwards to cblas_dtrsv. The x vector is passed through a
+// non-const pointer, so the library may modify it. The dimension n, the leading dimension a_ld
+// and the increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtrsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// Single-precision complex overload: forwards to cblas_ctrsv. Both buffers are reinterpreted as
+// arrays of float starting at the element at their offset; x is passed through a non-const
+// pointer, so the library may modify it. The dimension n, the leading dimension a_ld and the
+// increment are converted to 'int' explicitly rather than narrowed implicitly.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctrsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// double2 overload: forwards to cblas_ztrsv. The double2 buffers are passed as double pointers
+// taken at their element offsets; n, a_ld and x_inc are explicitly narrowed from size_t to int.
+void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztrsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STBSV/DTBSV/CTBSV/ZTBSV
+// float overload: forwards to cblas_stbsv, reading a and x from their element offsets.
+// n, k, a_ld and x_inc are explicitly narrowed from size_t to int.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// double overload: forwards to cblas_dtbsv, reading a and x from their element offsets.
+// n, k, a_ld and x_inc are explicitly narrowed from size_t to int.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// float2 overload: forwards to cblas_ctbsv. The float2 buffers are passed as float pointers
+// taken at their element offsets; n, k, a_ld and x_inc are explicitly narrowed to int.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// double2 overload: forwards to cblas_ztbsv. The double2 buffers are passed as double pointers
+// taken at their element offsets; n, k, a_ld and x_inc are explicitly narrowed to int.
+void cblasXtbsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztbsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n), static_cast<int>(k),
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for STPSV/DTPSV/CTPSV/ZTPSV
+// float overload: forwards to cblas_stpsv, reading ap and x from their element offsets.
+// n and x_inc are explicitly narrowed from size_t to int.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float>& ap_buffer, const size_t ap_offset,
+                std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_stpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// double overload: forwards to cblas_dtpsv, reading ap and x from their element offsets.
+// n and x_inc are explicitly narrowed from size_t to int.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double>& ap_buffer, const size_t ap_offset,
+                std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_dtpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              &ap_buffer[ap_offset],
+              &x_buffer[x_offset], static_cast<int>(x_inc));
+}
+// float2 overload: forwards to cblas_ctpsv. The float2 buffers are passed as float pointers
+// taken at their element offsets; n and x_inc are explicitly narrowed from size_t to int.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<float2>& ap_buffer, const size_t ap_offset,
+                std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ctpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const float*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<float*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+// double2 overload: forwards to cblas_ztpsv. The double2 buffers are passed as double pointers
+// taken at their element offsets; n and x_inc are explicitly narrowed from size_t to int.
+void cblasXtpsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<double2>& ap_buffer, const size_t ap_offset,
+                std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  cblas_ztpsv(layout, triangle, a_transpose, diagonal,
+              static_cast<int>(n),
+              reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
+              reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
+}
+
+// Forwards the Netlib BLAS calls for SGER/DGER
+// float overload: forwards to cblas_sger, reading x, y and a from their element offsets.
+// m, n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXger(const CBLAS_ORDER layout,
+               const size_t m, const size_t n,
+               const float alpha,
+               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+               std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_sger(layout,
+             static_cast<int>(m), static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+// double overload: forwards to cblas_dger, reading x, y and a from their element offsets.
+// m, n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXger(const CBLAS_ORDER layout,
+               const size_t m, const size_t n,
+               const double alpha,
+               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+               std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_dger(layout,
+             static_cast<int>(m), static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &y_buffer[y_offset], static_cast<int>(y_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CGERU/ZGERU
+// float2 overload: forwards to cblas_cgeru. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the float2 buffers are passed
+// as float pointers. m, n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXgeru(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_cgeru(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// double2 overload: forwards to cblas_zgeru. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the double2 buffers are passed
+// as double pointers. m, n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXgeru(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zgeru(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CGERC/ZGERC
+// float2 overload: forwards to cblas_cgerc. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the float2 buffers are passed
+// as float pointers. m, n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXgerc(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_cgerc(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// double2 overload: forwards to cblas_zgerc. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the double2 buffers are passed
+// as double pointers. m, n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXgerc(const CBLAS_ORDER layout,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zgerc(layout,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHER/ZHER
+// float2 overload (real-valued float alpha): forwards to cblas_cher. The float2 buffers are
+// passed as float pointers; n, a_ld and x_inc are explicitly narrowed from size_t to int.
+void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_cher(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// double2 overload (real-valued double alpha): forwards to cblas_zher. The double2 buffers are
+// passed as double pointers; n, a_ld and x_inc are explicitly narrowed from size_t to int.
+void cblasXher(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_zher(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHPR/ZHPR
+// float2 overload (real-valued float alpha): forwards to cblas_chpr. The float2 buffers are
+// passed as float pointers; n and x_inc are explicitly narrowed from size_t to int.
+void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float2>& ap_buffer, const size_t ap_offset) {
+  cblas_chpr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<float*>(&ap_buffer[ap_offset]));
+}
+// double2 overload (real-valued double alpha): forwards to cblas_zhpr. The double2 buffers are
+// passed as double pointers; n and x_inc are explicitly narrowed from size_t to int.
+void cblasXhpr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double2>& ap_buffer, const size_t ap_offset) {
+  cblas_zhpr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+             reinterpret_cast<double*>(&ap_buffer[ap_offset]));
+}
+
+// Forwards the Netlib BLAS calls for CHER2/ZHER2
+// float2 overload: forwards to cblas_cher2. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the float2 buffers are passed
+// as float pointers. n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_cher2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+// double2 overload: forwards to cblas_zher2. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the double2 buffers are passed
+// as double pointers. n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXher2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zher2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&a_buffer[a_offset]), static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHPR2/ZHPR2
+// float2 overload: forwards to cblas_chpr2. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the float2 buffers are passed
+// as float pointers. n and both increments are explicitly narrowed from size_t to int.
+void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float2>& ap_buffer, const size_t ap_offset) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_chpr2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const float*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<float*>(&ap_buffer[ap_offset]));
+}
+// double2 overload: forwards to cblas_zhpr2. alpha is unpacked into a {real, imag} pair held in a
+// stack array (no heap allocation per call) and passed by pointer; the double2 buffers are passed
+// as double pointers. n and both increments are explicitly narrowed from size_t to int.
+void cblasXhpr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double2>& ap_buffer, const size_t ap_offset) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  cblas_zhpr2(layout, triangle,
+              static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
+              reinterpret_cast<const double*>(&y_buffer[y_offset]), static_cast<int>(y_inc),
+              reinterpret_cast<double*>(&ap_buffer[ap_offset]));
+}
+
+// Forwards the Netlib BLAS calls for SSYR/DSYR
+// float overload: forwards to cblas_ssyr, reading x and a from their element offsets.
+// n, a_ld and x_inc are explicitly narrowed from size_t to int.
+void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_ssyr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+// double overload: forwards to cblas_dsyr, reading x and a from their element offsets.
+// n, a_ld and x_inc are explicitly narrowed from size_t to int.
+void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_dsyr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSPR/DSPR
+// float overload: forwards to cblas_sspr, reading x and ap from their element offsets.
+// n and x_inc are explicitly narrowed from size_t to int.
+void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const float alpha,
+               const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<float>& ap_buffer, const size_t ap_offset) {
+  cblas_sspr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &ap_buffer[ap_offset]);
+}
+// double overload: forwards to cblas_dspr, reading x and ap from their element offsets.
+// n and x_inc are explicitly narrowed from size_t to int.
+void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const double alpha,
+               const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<double>& ap_buffer, const size_t ap_offset) {
+  cblas_dspr(layout, triangle,
+             static_cast<int>(n),
+             alpha,
+             &x_buffer[x_offset], static_cast<int>(x_inc),
+             &ap_buffer[ap_offset]);
+}
+
+// Forwards the Netlib BLAS calls for SSYR2/DSYR2
+// float overload: forwards to cblas_ssyr2, reading x, y and a from their element offsets.
+// n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_ssyr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+// double overload: forwards to cblas_dsyr2, reading x, y and a from their element offsets.
+// n, a_ld and both increments are explicitly narrowed from size_t to int.
+void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  cblas_dsyr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &a_buffer[a_offset], static_cast<int>(a_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSPR2/DSPR2
+// float overload: forwards to cblas_sspr2, reading x, y and ap from their element offsets.
+// n and both increments are explicitly narrowed from size_t to int.
+void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const float alpha,
+                const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<float>& ap_buffer, const size_t ap_offset) {
+  cblas_sspr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &ap_buffer[ap_offset]);
+}
+// double overload: forwards to cblas_dspr2, reading x, y and ap from their element offsets.
+// n and both increments are explicitly narrowed from size_t to int.
+void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const double alpha,
+                const std::vector<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<double>& ap_buffer, const size_t ap_offset) {
+  cblas_dspr2(layout, triangle,
+              static_cast<int>(n),
+              alpha,
+              &x_buffer[x_offset], static_cast<int>(x_inc),
+              &y_buffer[y_offset], static_cast<int>(y_inc),
+              &ap_buffer[ap_offset]);
+}
+
+// =================================================================================================
+// BLAS level-3 (matrix-matrix) routines
+// =================================================================================================
+
+// Forwards the Netlib BLAS calls for SGEMM/DGEMM/CGEMM/ZGEMM
+// float overload: forwards to cblas_sgemm, reading a, b and c from their element offsets.
+// m, n, k and the a_ld/b_ld/c_ld arguments are explicitly narrowed from size_t to int.
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float beta,
+                std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_sgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &b_buffer[b_offset], static_cast<int>(b_ld),
+              beta,
+              &c_buffer[c_offset], static_cast<int>(c_ld));
+}
+// double overload: forwards to cblas_dgemm, reading a, b and c from their element offsets.
+// m, n, k and the a_ld/b_ld/c_ld arguments are explicitly narrowed from size_t to int.
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double beta,
+                std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_dgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &b_buffer[b_offset], static_cast<int>(b_ld),
+              beta,
+              &c_buffer[c_offset], static_cast<int>(c_ld));
+}
+// float2 overload: forwards to cblas_cgemm. alpha and beta are each unpacked into a {real, imag}
+// pair held in a stack array (no heap allocation per call) and passed by pointer; the float2
+// buffers are passed as float pointers. m, n, k and a_ld/b_ld/c_ld are explicitly narrowed to int.
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_cgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
+}
+// double2 overload: forwards to cblas_zgemm. alpha and beta are each unpacked into a {real, imag}
+// pair held in a stack array (no heap allocation per call) and passed by pointer; the double2
+// buffers are passed as double pointers. m, n, k and a_ld/b_ld/c_ld are explicitly narrowed to int.
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zgemm(layout, a_transpose, b_transpose,
+              static_cast<int>(m), static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM
+// float overload: forwards to cblas_ssymm, reading a, b and c from their element offsets.
+// m, n and the a_ld/b_ld/c_ld arguments are explicitly narrowed from size_t to int.
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float beta,
+                std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_ssymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &b_buffer[b_offset], static_cast<int>(b_ld),
+              beta,
+              &c_buffer[c_offset], static_cast<int>(c_ld));
+}
+// double overload: forwards to cblas_dsymm, reading a, b and c from their element offsets.
+// m, n and the a_ld/b_ld/c_ld arguments are explicitly narrowed from size_t to int.
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double beta,
+                std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_dsymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              &b_buffer[b_offset], static_cast<int>(b_ld),
+              beta,
+              &c_buffer[c_offset], static_cast<int>(c_ld));
+}
+// float2 overload: forwards to cblas_csymm. alpha and beta are each unpacked into a {real, imag}
+// pair held in a stack array (no heap allocation per call) and passed by pointer; the float2
+// buffers are passed as float pointers. m, n and a_ld/b_ld/c_ld are explicitly narrowed to int.
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_csymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
+}
+// double2 overload: forwards to cblas_zsymm. alpha and beta are each unpacked into a {real, imag}
+// pair held in a stack array (no heap allocation per call) and passed by pointer; the double2
+// buffers are passed as double pointers. m, n and a_ld/b_ld/c_ld are explicitly narrowed to int.
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zsymm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for CHEMM/ZHEMM
+// float2 overload: forwards to cblas_chemm. alpha and beta are each unpacked into a {real, imag}
+// pair held in a stack array (no heap allocation per call) and passed by pointer; the float2
+// buffers are passed as float pointers. m, n and a_ld/b_ld/c_ld are explicitly narrowed to int.
+void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_chemm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const float*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
+}
+// double2 overload: forwards to cblas_zhemm. alpha and beta are each unpacked into a {real, imag}
+// pair held in a stack array (no heap allocation per call) and passed by pointer; the double2
+// buffers are passed as double pointers. m, n and a_ld/b_ld/c_ld are explicitly narrowed to int.
+void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const double alpha_array[2] = {alpha.real(), alpha.imag()};
+  const double beta_array[2] = {beta.real(), beta.imag()};
+  cblas_zhemm(layout, side, triangle,
+              static_cast<int>(m), static_cast<int>(n),
+              alpha_array,
+              reinterpret_cast<const double*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              reinterpret_cast<const double*>(&b_buffer[b_offset]), static_cast<int>(b_ld),
+              beta_array,
+              reinterpret_cast<double*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
+}
+
+// Forwards the Netlib BLAS calls for SSYRK/DSYRK/CSYRK/ZSYRK
+// float overload: forwards to cblas_ssyrk, reading a and c from their element offsets.
+// n, k, a_ld and c_ld are explicitly narrowed from size_t to int.
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const float beta,
+                std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_ssyrk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              beta,
+              &c_buffer[c_offset], static_cast<int>(c_ld));
+}
+// double overload: forwards to cblas_dsyrk, reading a and c from their element offsets.
+// n, k, a_ld and c_ld are explicitly narrowed from size_t to int.
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const double beta,
+                std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  cblas_dsyrk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha,
+              &a_buffer[a_offset], static_cast<int>(a_ld),
+              beta,
+              &c_buffer[c_offset], static_cast<int>(c_ld));
+}
+// float2 overload: forwards to cblas_csyrk. alpha and beta are each unpacked into a {real, imag}
+// pair held in a stack array (no heap allocation per call) and passed by pointer; the float2
+// buffers are passed as float pointers. n, k, a_ld and c_ld are explicitly narrowed to int.
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const float2 beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  const float alpha_array[2] = {alpha.real(), alpha.imag()};
+  const float beta_array[2] = {beta.real(), beta.imag()};
+  cblas_csyrk(layout, triangle, a_transpose,
+              static_cast<int>(n), static_cast<int>(k),
+              alpha_array,
+              reinterpret_cast<const float*>(&a_buffer[a_offset]), static_cast<int>(a_ld),
+              beta_array,
+              reinterpret_cast<float*>(&c_buffer[c_offset]), static_cast<int>(c_ld));
+}
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const double2 beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Double-precision complex case: scalars go to cblas_zsyrk as pointers to {real, imag} pairs
+  const std::vector<double> alpha_parts = {alpha.real(), alpha.imag()};
+  const std::vector<double> beta_parts = {beta.real(), beta.imag()};
+  const auto a_start = reinterpret_cast<const double*>(&a_buffer[a_offset]);
+  const auto c_start = reinterpret_cast<double*>(&c_buffer[c_offset]);
+  cblas_zsyrk(layout, triangle, a_transpose, n, k,
+              alpha_parts.data(), a_start, a_ld,
+              beta_parts.data(), c_start, c_ld);
+}
+
+// Forwards the Netlib BLAS calls for CHERK/ZHERK
+void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const float alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const float beta,
+                std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Single-precision case: real scalars are passed by value, complex buffers as float pointers
+  const auto a_start = reinterpret_cast<const float*>(&a_buffer[a_offset]);
+  const auto c_start = reinterpret_cast<float*>(&c_buffer[c_offset]);
+  cblas_cherk(layout, triangle, a_transpose, n, k,
+              alpha, a_start, a_ld,
+              beta, c_start, c_ld);
+}
+void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const double alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const double beta,
+                std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Double-precision case: real scalars are passed by value, complex buffers as double pointers
+  const auto a_start = reinterpret_cast<const double*>(&a_buffer[a_offset]);
+  const auto c_start = reinterpret_cast<double*>(&c_buffer[c_offset]);
+  cblas_zherk(layout, triangle, a_transpose, n, k,
+              alpha, a_start, a_ld,
+              beta, c_start, c_ld);
+}
+
+// Forwards the Netlib BLAS calls for SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const float alpha,
+                 const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const float beta,
+                 std::vector<float>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Single-precision real case: locate the offset elements of A, B and C, then call cblas_ssyr2k
+  const float* a_start = &a_buffer[a_offset];
+  const float* b_start = &b_buffer[b_offset];
+  float* c_start = &c_buffer[c_offset];
+  cblas_ssyr2k(layout, triangle, ab_transpose, n, k,
+               alpha, a_start, a_ld, b_start, b_ld,
+               beta, c_start, c_ld);
+}
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const double alpha,
+                 const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const double beta,
+                 std::vector<double>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Double-precision real case: locate the offset elements of A, B and C, then call cblas_dsyr2k
+  const double* a_start = &a_buffer[a_offset];
+  const double* b_start = &b_buffer[b_offset];
+  double* c_start = &c_buffer[c_offset];
+  cblas_dsyr2k(layout, triangle, ab_transpose, n, k,
+               alpha, a_start, a_ld, b_start, b_ld,
+               beta, c_start, c_ld);
+}
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const float2 alpha,
+                 const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const float2 beta,
+                 std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Single-precision complex case: scalars go to cblas_csyr2k as pointers to {real, imag} pairs
+  const std::vector<float> alpha_parts = {alpha.real(), alpha.imag()};
+  const std::vector<float> beta_parts = {beta.real(), beta.imag()};
+  const auto a_start = reinterpret_cast<const float*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<const float*>(&b_buffer[b_offset]);
+  const auto c_start = reinterpret_cast<float*>(&c_buffer[c_offset]);
+  cblas_csyr2k(layout, triangle, ab_transpose, n, k,
+               alpha_parts.data(), a_start, a_ld, b_start, b_ld,
+               beta_parts.data(), c_start, c_ld);
+}
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const double2 alpha,
+                 const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const double2 beta,
+                 std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Double-precision complex case: scalars go to cblas_zsyr2k as pointers to {real, imag} pairs
+  const std::vector<double> alpha_parts = {alpha.real(), alpha.imag()};
+  const std::vector<double> beta_parts = {beta.real(), beta.imag()};
+  const auto a_start = reinterpret_cast<const double*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<const double*>(&b_buffer[b_offset]);
+  const auto c_start = reinterpret_cast<double*>(&c_buffer[c_offset]);
+  cblas_zsyr2k(layout, triangle, ab_transpose, n, k,
+               alpha_parts.data(), a_start, a_ld, b_start, b_ld,
+               beta_parts.data(), c_start, c_ld);
+}
+
+// Forwards the Netlib BLAS calls for CHER2K/ZHER2K
+void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const float2 alpha,
+                 const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const float beta,
+                 std::vector<float2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Single-precision case: complex alpha is passed as a {real, imag} pair, real beta by value
+  const std::vector<float> alpha_parts = {alpha.real(), alpha.imag()};
+  const auto a_start = reinterpret_cast<const float*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<const float*>(&b_buffer[b_offset]);
+  const auto c_start = reinterpret_cast<float*>(&c_buffer[c_offset]);
+  cblas_cher2k(layout, triangle, ab_transpose, n, k,
+               alpha_parts.data(), a_start, a_ld, b_start, b_ld,
+               beta, c_start, c_ld);
+}
+void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const double2 alpha,
+                 const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const double beta,
+                 std::vector<double2>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Double-precision case: complex alpha is passed as a {real, imag} pair, real beta by value
+  const std::vector<double> alpha_parts = {alpha.real(), alpha.imag()};
+  const auto a_start = reinterpret_cast<const double*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<const double*>(&b_buffer[b_offset]);
+  const auto c_start = reinterpret_cast<double*>(&c_buffer[c_offset]);
+  cblas_zher2k(layout, triangle, ab_transpose, n, k,
+               alpha_parts.data(), a_start, a_ld, b_start, b_ld,
+               beta, c_start, c_ld);
+}
+
+// Forwards the Netlib BLAS calls for STRMM/DTRMM/CTRMM/ZTRMM
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Single-precision real case: B is the non-const buffer handed to cblas_strmm
+  const float* a_start = &a_buffer[a_offset];
+  float* b_start = &b_buffer[b_offset];
+  cblas_strmm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha, a_start, a_ld, b_start, b_ld);
+}
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Double-precision real case: B is the non-const buffer handed to cblas_dtrmm
+  const double* a_start = &a_buffer[a_offset];
+  double* b_start = &b_buffer[b_offset];
+  cblas_dtrmm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha, a_start, a_ld, b_start, b_ld);
+}
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Single-precision complex case: alpha goes to cblas_ctrmm as a pointer to a {real, imag} pair
+  const std::vector<float> alpha_parts = {alpha.real(), alpha.imag()};
+  const auto a_start = reinterpret_cast<const float*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<float*>(&b_buffer[b_offset]);
+  cblas_ctrmm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha_parts.data(), a_start, a_ld, b_start, b_ld);
+}
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Double-precision complex case: alpha goes to cblas_ztrmm as a pointer to a {real, imag} pair
+  const std::vector<double> alpha_parts = {alpha.real(), alpha.imag()};
+  const auto a_start = reinterpret_cast<const double*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<double*>(&b_buffer[b_offset]);
+  cblas_ztrmm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha_parts.data(), a_start, a_ld, b_start, b_ld);
+}
+
+// Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float alpha,
+                const std::vector<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Single-precision real case: B is the non-const buffer handed to cblas_strsm
+  const float* a_start = &a_buffer[a_offset];
+  float* b_start = &b_buffer[b_offset];
+  cblas_strsm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha, a_start, a_ld, b_start, b_ld);
+}
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double alpha,
+                const std::vector<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Double-precision real case: B is the non-const buffer handed to cblas_dtrsm
+  const double* a_start = &a_buffer[a_offset];
+  double* b_start = &b_buffer[b_offset];
+  cblas_dtrsm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha, a_start, a_ld, b_start, b_ld);
+}
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const float2 alpha,
+                const std::vector<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<float2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Single-precision complex case: alpha goes to cblas_ctrsm as a pointer to a {real, imag} pair
+  const std::vector<float> alpha_parts = {alpha.real(), alpha.imag()};
+  const auto a_start = reinterpret_cast<const float*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<float*>(&b_buffer[b_offset]);
+  cblas_ctrsm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha_parts.data(), a_start, a_ld, b_start, b_ld);
+}
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const double2 alpha,
+                const std::vector<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<double2>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Double-precision complex case: alpha goes to cblas_ztrsm as a pointer to a {real, imag} pair
+  const std::vector<double> alpha_parts = {alpha.real(), alpha.imag()};
+  const auto a_start = reinterpret_cast<const double*>(&a_buffer[a_offset]);
+  const auto b_start = reinterpret_cast<double*>(&b_buffer[b_offset]);
+  cblas_ztrsm(layout, side, triangle, a_transpose, diagonal, m, n,
+              alpha_parts.data(), a_start, a_ld, b_start, b_ld);
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_WRAPPER_CBLAS_H_
+#endif
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h
index fb6e83aa..89b708b8 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.h
@@ -65,7 +65,7 @@ template <typename T>
 clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
                           cl_mem sd2_buffer, const size_t sd2_offset,
                           cl_mem sx1_buffer, const size_t sx1_offset,
-                          cl_mem sy1_buffer, const size_t sy1_offset,
+                          const cl_mem sy1_buffer, const size_t sy1_offset,
                           cl_mem sparam_buffer, const size_t sparam_offset,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
@@ -73,7 +73,7 @@ template <>
 clblasStatus clblasXrotmg<float>(cl_mem sd1_buffer, const size_t sd1_offset,
                                  cl_mem sd2_buffer, const size_t sd2_offset,
                                  cl_mem sx1_buffer, const size_t sx1_offset,
-                                 cl_mem sy1_buffer, const size_t sy1_offset,
+                                 const cl_mem sy1_buffer, const size_t sy1_offset,
                                  cl_mem sparam_buffer, const size_t sparam_offset,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
@@ -88,7 +88,7 @@ template <>
 clblasStatus clblasXrotmg<double>(cl_mem sd1_buffer, const size_t sd1_offset,
                                   cl_mem sd2_buffer, const size_t sd2_offset,
                                   cl_mem sx1_buffer, const size_t sx1_offset,
-                                  cl_mem sy1_buffer, const size_t sy1_offset,
+                                  const cl_mem sy1_buffer, const size_t sy1_offset,
                                   cl_mem sparam_buffer, const size_t sparam_offset,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
author	cnugteren <web@cedricnugteren.nl>	2016-04-01 22:36:39 -0700
committer	cnugteren <web@cedricnugteren.nl>	2016-04-01 22:36:39 -0700
commit	5c83217cf256984573924e8f89c46f393a5fcfcd (patch)
tree	b260ec46e10e12ff63d465212652523c3cfa7bc3
parent	a2056f2216526989f423a74e4bcd016dac9424f4 (diff)