5 files changed, 90 insertions, 48 deletions
diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h
index 0a38abb2..b5577cfa 100644
--- a/include/clblast_netlib_c.h
+++ b/include/clblast_netlib_c.h
@@ -46,6 +46,24 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131,
                                 CLBlastDiagonalUnit = 132 } CLBlastDiagonal;
 typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide;
 
+// CBLAS-compatible aliases: map the CBLAS type and enumerator names onto the CLBlast ones
+typedef CLBlastLayout CBLAS_ORDER;
+typedef CLBlastTranspose CBLAS_TRANSPOSE;
+typedef CLBlastTriangle CBLAS_UPLO;
+typedef CLBlastDiagonal CBLAS_DIAG;
+typedef CLBlastSide CBLAS_SIDE;
+#define CblasRowMajor CLBlastLayoutRowMajor
+#define CblasColMajor CLBlastLayoutColMajor
+#define CblasNoTrans CLBlastTransposeNo
+#define CblasTrans CLBlastTransposeYes
+#define CblasConjTrans CLBlastTransposeConjugate
+#define CblasUpper CLBlastTriangleUpper
+#define CblasLower CLBlastTriangleLower
+#define CblasNonUnit CLBlastDiagonalNonUnit
+#define CblasUnit CLBlastDiagonalUnit
+#define CblasLeft CLBlastSideLeft
+#define CblasRight CLBlastSideRight
+
 // =================================================================================================
 // BLAS level-1 (vector-vector) routines
 // =================================================================================================
@@ -64,12 +82,12 @@ void PUBLIC_API cblas_drotg(double* sa,
 void PUBLIC_API cblas_srotmg(float* sd1,
                              float* sd2,
                              float* sx1,
-                             const float* sy1,
+                             const float sy1,
                              float* sparam);
 void PUBLIC_API cblas_drotmg(double* sd1,
                              double* sd2,
                              double* sx1,
-                             const double* sy1,
+                             const double sy1,
                              double* sparam);
 
 // Apply givens plane rotation: SROT/DROT
@@ -163,20 +181,24 @@ double PUBLIC_API cblas_ddot(const int n,
                              const double* y, const int y_inc);
 
 // Dot product of two complex vectors: CDOTU/ZDOTU
-float PUBLIC_API cblas_cdotu(const int n,
-                             const void* x, const int x_inc,
-                             const void* y, const int y_inc);
-double PUBLIC_API cblas_zdotu(const int n,
-                              const void* x, const int x_inc,
-                              const void* y, const int y_inc);
+void PUBLIC_API cblas_cdotu_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
+void PUBLIC_API cblas_zdotu_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
 
 // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
-float PUBLIC_API cblas_cdotc(const int n,
-                             const void* x, const int x_inc,
-                             const void* y, const int y_inc);
-double PUBLIC_API cblas_zdotc(const int n,
-                              const void* x, const int x_inc,
-                              const void* y, const int y_inc);
+void PUBLIC_API cblas_cdotc_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
+void PUBLIC_API cblas_zdotc_sub(const int n,
+                                const void* x, const int x_inc,
+                                const void* y, const int y_inc,
+                                void* dot);
 
 // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
 float PUBLIC_API cblas_snrm2(const int n,
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 1a467340..5f0bb0d4 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -41,7 +41,7 @@ FILES = [
     "/include/clblast_netlib_c.h",
     "/src/clblast_netlib_c.cpp",
 ]
-HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32]
+HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32]
 FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2]
 
 # Different possibilities for requirements
diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py
index 7b7ece22..6bb3080f 100644
--- a/scripts/generator/generator/cpp.py
+++ b/scripts/generator/generator/cpp.py
@@ -112,6 +112,7 @@ def clblast_netlib_c_cc(routine):
         # There is a version available in CBLAS
         if flavour.precision_name in ["S", "D", "C", "Z"]:
             template = "<" + flavour.template + ">" if routine.no_scalars() else ""
+            name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else ""
             indent = " " * (21 + routine.length() + len(template))
             result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL
 
@@ -129,6 +130,8 @@ def clblast_netlib_c_cc(routine):
             for i, name in enumerate(routine.inputs + routine.outputs):
                 buffer_type = routine.get_buffer_type(name, flavour)
                 result += "  " + routine.create_buffer(name, buffer_type) + NL
+                if name in routine.scalar_buffers_second_non_pointer():
+                    result += "  " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL
             for name in routine.inputs + routine.outputs:
                 if name not in routine.scalar_buffers_first():
                     prefix = "" if name in routine.outputs else "const "
@@ -148,14 +151,14 @@ def clblast_netlib_c_cc(routine):
 
             # Copy back and clean-up
             for name in routine.outputs:
-                if name in routine.scalar_buffers_first():
+                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                     buffer_type = routine.get_buffer_type(name, flavour)
                     result += "  " + buffer_type + " " + name + "[" + name + "_size];" + NL
             for name in routine.outputs:
                 buffer_type = routine.get_buffer_type(name, flavour)
                 result += "  " + routine.read_buffer(name, buffer_type) + NL
             for name in routine.outputs:
-                if name in routine.scalar_buffers_first():
+                if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return():
                     result += "  return " + name + "[0]"
                     if flavour.buffer_type in ["float2", "double2"]:
                         if name not in routine.index_buffers():
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 391cf3e0..6fcce23b 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -43,6 +43,11 @@ class Routine:
         return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"]
 
     @staticmethod
+    def scalar_buffers_second_non_pointer():
+        """Subset of the list above: scalar buffers that are passed by value rather than by pointer"""
+        return ["sy1"]
+
+    @staticmethod
     def other_scalars():
         """List of scalars other than alpha and beta"""
         return ["cos", "sin"]
@@ -68,6 +73,10 @@ class Routine:
         return ["a", "b", "c", "ap"]
 
     @staticmethod
+    def routines_scalar_no_return():
+        return ["dotu", "dotc"]  # Netlib versions get a "_sub" suffix and write the result through a pointer argument
+
+    @staticmethod
     def set_size(name, size):
         """Sets the size of a buffer"""
         return "const auto " + name + "_size = " + size + ";"
@@ -77,10 +86,12 @@ class Routine:
         """Creates a new CLCudaAPI buffer"""
         return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);"
 
-    @staticmethod
-    def write_buffer(name, template):
+    def write_buffer(self, name, template):
         """Writes to a CLCudaAPI buffer"""
-        data_structure = "reinterpret_cast<" + template + "*>(" + name + ")"
+        postfix = ""
+        if name in self.scalar_buffers_second_non_pointer():
+            postfix = "_vec"  # by-value scalars are read from the local "<name>_vec" array emitted by cpp.py
+        data_structure = "reinterpret_cast<" + template + "*>(" + name + postfix + ")"
         return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");"
 
     @staticmethod
@@ -206,7 +217,8 @@ class Routine:
         prefix = "const " if name in self.inputs else ""
         if name in self.inputs or name in self.outputs:
             data_type = "void" if flavour.is_non_standard() else flavour.buffer_type
-            a = [prefix + data_type + "* " + name + ""]
+            pointer = "" if name in self.scalar_buffers_second_non_pointer() else "*"
+            a = [prefix + data_type + pointer + " " + name + ""]
             c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else []
             return [", ".join(a + c)]
         return []
@@ -553,13 +565,16 @@ class Routine:
 
     def arguments_def_netlib(self, flavour):
         """As above, but for the Netlib CBLAS API"""
-        return (self.options_def_c() + self.sizes_def_netlib() +
+        result=(self.options_def_c() + self.sizes_def_netlib() +
                 self.scalar_def_void("alpha", flavour) +
                 list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) +
                 self.scalar_def_void("beta", flavour) +
                 list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) +
                 list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) +
                 list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()])))
+        if self.name in self.routines_scalar_no_return():  # these take the result buffer as a trailing pointer argument
+            result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()]))
+        return result
 
     def arguments_def_c(self, flavour):
         """As above, but for the C API"""
@@ -654,11 +669,15 @@ class Routine:
             if output in self.index_buffers():
                 return_type = "int"
                 break
-            if output in self.scalar_buffers_first():
+            if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return():
                 return_type = flavour.buffer_type.replace("2", "")
                 break
         indent = " " * (spaces + len(return_type) + self.length())
-        result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "("
+        routine_name = self.name
+        if self.name in self.routines_scalar_no_return():
+            routine_name += "_sub"
+            indent += "    "
+        result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "("
         result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")"
         return result
 
diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp
index efff1712..66852e31 100644
--- a/src/clblast_netlib_c.cpp
+++ b/src/clblast_netlib_c.cpp
@@ -107,7 +107,7 @@ void cblas_drotg(double* sa,
 void cblas_srotmg(float* sd1,
                   float* sd2,
                   float* sx1,
-                  const float* sy1,
+                  const float sy1,
                   float* sparam) {
   auto device = get_device();
   auto context = clblast::Context(device);
@@ -118,11 +118,12 @@ void cblas_srotmg(float* sd1,
   const auto sx1_size = 1;
   const auto sparam_size = 1;
   auto sy1_buffer = clblast::Buffer<float>(context, sy1_size);
+  float sy1_vec[1]; sy1_vec[0] = sy1;
   auto sd1_buffer = clblast::Buffer<float>(context, sd1_size);
   auto sd2_buffer = clblast::Buffer<float>(context, sd2_size);
   auto sx1_buffer = clblast::Buffer<float>(context, sx1_size);
   auto sparam_buffer = clblast::Buffer<float>(context, sparam_size);
-  sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const float*>(sy1));
+  sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const float*>(sy1_vec));
   sd1_buffer.Write(queue, sd1_size, reinterpret_cast<float*>(sd1));
   sd2_buffer.Write(queue, sd2_size, reinterpret_cast<float*>(sd2));
   sx1_buffer.Write(queue, sx1_size, reinterpret_cast<float*>(sx1));
@@ -145,7 +146,7 @@ void cblas_srotmg(float* sd1,
 void cblas_drotmg(double* sd1,
                   double* sd2,
                   double* sx1,
-                  const double* sy1,
+                  const double sy1,
                   double* sparam) {
   auto device = get_device();
   auto context = clblast::Context(device);
@@ -156,11 +157,12 @@ void cblas_drotmg(double* sd1,
   const auto sx1_size = 1;
   const auto sparam_size = 1;
   auto sy1_buffer = clblast::Buffer<double>(context, sy1_size);
+  double sy1_vec[1]; sy1_vec[0] = sy1;
   auto sd1_buffer = clblast::Buffer<double>(context, sd1_size);
   auto sd2_buffer = clblast::Buffer<double>(context, sd2_size);
   auto sx1_buffer = clblast::Buffer<double>(context, sx1_size);
   auto sparam_buffer = clblast::Buffer<double>(context, sparam_size);
-  sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const double*>(sy1));
+  sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const double*>(sy1_vec));
   sd1_buffer.Write(queue, sd1_size, reinterpret_cast<double*>(sd1));
   sd2_buffer.Write(queue, sd2_size, reinterpret_cast<double*>(sd2));
   sx1_buffer.Write(queue, sx1_size, reinterpret_cast<double*>(sx1));
@@ -722,9 +724,10 @@ double cblas_ddot(const int n,
 }
 
 // DOTU
-float cblas_cdotu(const int n,
-                  const void* x, const int x_inc,
-                  const void* y, const int y_inc) {
+void cblas_cdotu_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
   auto device = get_device();
   auto context = clblast::Context(device);
   auto queue = clblast::Queue(context, device);
@@ -745,13 +748,12 @@ float cblas_cdotu(const int n,
   if (s != clblast::StatusCode::kSuccess) {
     throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
   }
-  float2 dot[dot_size];
   dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot));
-  return dot[0].real();
 }
-double cblas_zdotu(const int n,
-                   const void* x, const int x_inc,
-                   const void* y, const int y_inc) {
+void cblas_zdotu_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
   auto device = get_device();
   auto context = clblast::Context(device);
   auto queue = clblast::Queue(context, device);
@@ -772,15 +774,14 @@ double cblas_zdotu(const int n,
   if (s != clblast::StatusCode::kSuccess) {
     throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
   }
-  double2 dot[dot_size];
   dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot));
-  return dot[0].real();
 }
 
 // DOTC
-float cblas_cdotc(const int n,
-                  const void* x, const int x_inc,
-                  const void* y, const int y_inc) {
+void cblas_cdotc_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
   auto device = get_device();
   auto context = clblast::Context(device);
   auto queue = clblast::Queue(context, device);
@@ -801,13 +802,12 @@ float cblas_cdotc(const int n,
   if (s != clblast::StatusCode::kSuccess) {
     throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
   }
-  float2 dot[dot_size];
   dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot));
-  return dot[0].real();
 }
-double cblas_zdotc(const int n,
-                   const void* x, const int x_inc,
-                   const void* y, const int y_inc) {
+void cblas_zdotc_sub(const int n,
+                     const void* x, const int x_inc,
+                     const void* y, const int y_inc,
+                     void* dot) {
   auto device = get_device();
   auto context = clblast::Context(device);
   auto queue = clblast::Queue(context, device);
@@ -828,9 +828,7 @@ double cblas_zdotc(const int n,
   if (s != clblast::StatusCode::kSuccess) {
     throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s));
   }
-  double2 dot[dot_size];
   dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot));
-  return dot[0].real();
 }
 
 // NRM2