diff options
-rw-r--r-- | include/clblast_netlib_c.h | 50 | ||||
-rwxr-xr-x | scripts/generator/generator.py | 2 | ||||
-rw-r--r-- | scripts/generator/generator/cpp.py | 7 | ||||
-rw-r--r-- | scripts/generator/generator/routine.py | 33 | ||||
-rw-r--r-- | src/clblast_netlib_c.cpp | 46 |
5 files changed, 90 insertions, 48 deletions
diff --git a/include/clblast_netlib_c.h b/include/clblast_netlib_c.h index 0a38abb2..b5577cfa 100644 --- a/include/clblast_netlib_c.h +++ b/include/clblast_netlib_c.h @@ -46,6 +46,24 @@ typedef enum CLBlastDiagonal_ { CLBlastDiagonalNonUnit = 131, CLBlastDiagonalUnit = 132 } CLBlastDiagonal; typedef enum CLBlastSide_ { CLBlastSideLeft = 141, CLBlastSideRight = 142 } CLBlastSide; +// For full compatibility with CBLAS +typedef CLBlastLayout CBLAS_ORDER; +typedef CLBlastTranspose CBLAS_TRANSPOSE; +typedef CLBlastTriangle CBLAS_UPLO; +typedef CLBlastDiagonal CBLAS_DIAG; +typedef CLBlastSide CBLAS_SIDE; +#define CblasRowMajor CLBlastLayoutRowMajor +#define CblasColMajor CLBlastLayoutColMajor +#define CblasNoTrans CLBlastTransposeNo +#define CblasTrans CLBlastTransposeYes +#define CblasConjTrans CLBlastTransposeConjugate +#define CblasUpper CLBlastTriangleUpper +#define CblasLower CLBlastTriangleLower +#define CblasNonUnit CLBlastDiagonalNonUnit +#define CblasUnit CLBlastDiagonalUnit +#define CblasLeft CLBlastSideLeft +#define CblasRight CLBlastSideRight + // ================================================================================================= // BLAS level-1 (vector-vector) routines // ================================================================================================= @@ -64,12 +82,12 @@ void PUBLIC_API cblas_drotg(double* sa, void PUBLIC_API cblas_srotmg(float* sd1, float* sd2, float* sx1, - const float* sy1, + const float sy1, float* sparam); void PUBLIC_API cblas_drotmg(double* sd1, double* sd2, double* sx1, - const double* sy1, + const double sy1, double* sparam); // Apply givens plane rotation: SROT/DROT @@ -163,20 +181,24 @@ double PUBLIC_API cblas_ddot(const int n, const double* y, const int y_inc); // Dot product of two complex vectors: CDOTU/ZDOTU -float PUBLIC_API cblas_cdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); -double PUBLIC_API cblas_zdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); +void PUBLIC_API cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); // Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC -float PUBLIC_API cblas_cdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); -double PUBLIC_API cblas_zdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc); +void PUBLIC_API cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); +void PUBLIC_API cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot); // Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2 float PUBLIC_API cblas_snrm2(const int n, diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 1a467340..5f0bb0d4 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -41,7 +41,7 @@ FILES = [ "/include/clblast_netlib_c.h", "/src/clblast_netlib_c.cpp", ] -HEADER_LINES = [117, 73, 118, 22, 29, 41, 47, 32] +HEADER_LINES = [117, 73, 118, 22, 29, 41, 65, 32] FOOTER_LINES = [17, 80, 19, 18, 6, 6, 9, 2] # Different possibilities for requirements diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py index 7b7ece22..6bb3080f 100644 --- a/scripts/generator/generator/cpp.py +++ b/scripts/generator/generator/cpp.py @@ -112,6 +112,7 @@ def clblast_netlib_c_cc(routine): # There is a version available in CBLAS if flavour.precision_name in ["S", "D", "C", "Z"]: template = "<" + flavour.template + ">" if routine.no_scalars() else "" + name_postfix = "_sub" if routine.name in routine.routines_scalar_no_return() else "" indent = " " * (21 + routine.length() + len(template)) result += routine.routine_header_netlib(flavour, 9, "") + " {" + NL @@ -129,6 +130,8 @@ def clblast_netlib_c_cc(routine): for i, name in enumerate(routine.inputs + routine.outputs): buffer_type = routine.get_buffer_type(name, flavour) result += " " + routine.create_buffer(name, buffer_type) + NL + if name in routine.scalar_buffers_second_non_pointer(): + result += " " + buffer_type + " " + name + "_vec[1]; " + name + "_vec[0] = " + name + ";" + NL for name in routine.inputs + routine.outputs: if name not in routine.scalar_buffers_first(): prefix = "" if name in routine.outputs else "const " @@ -148,14 +151,14 @@ def clblast_netlib_c_cc(routine): # Copy back and clean-up for name in routine.outputs: - if name in routine.scalar_buffers_first(): + if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return(): buffer_type = routine.get_buffer_type(name, flavour) result += " " + buffer_type + " " + name + "[" + name + "_size];" + NL for name in routine.outputs: buffer_type = routine.get_buffer_type(name, flavour) result += " " + routine.read_buffer(name, buffer_type) + NL for name in routine.outputs: - if name in routine.scalar_buffers_first(): + if name in routine.scalar_buffers_first() and routine.name not in routine.routines_scalar_no_return(): result += " return " + name + "[0]" if flavour.buffer_type in ["float2", "double2"]: if name not in routine.index_buffers(): diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py index 391cf3e0..6fcce23b 100644 --- a/scripts/generator/generator/routine.py +++ b/scripts/generator/generator/routine.py @@ -43,6 +43,11 @@ class Routine: return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"] @staticmethod + def scalar_buffers_second_non_pointer(): + """As above, but these ones are not passed as pointers but as scalars instead""" + return ["sy1"] + + @staticmethod def other_scalars(): """List of scalars other than alpha and beta""" return ["cos", "sin"] @@ -68,6 +73,10 @@ class Routine: return ["a", "b", "c", "ap"] @staticmethod + def routines_scalar_no_return(): + return ["dotu", "dotc"] + + @staticmethod def set_size(name, size): """Sets the size of a buffer""" return "const auto " + name + "_size = " + size + ";" @@ -77,10 +86,12 @@ class Routine: """Creates a new CLCudaAPI buffer""" return "auto " + name + "_buffer = clblast::Buffer<" + template + ">(context, " + name + "_size);" - @staticmethod - def write_buffer(name, template): + def write_buffer(self, name, template): """Writes to a CLCudaAPI buffer""" - data_structure = "reinterpret_cast<" + template + "*>(" + name + ")" + postfix = "" + if name in self.scalar_buffers_second_non_pointer(): + postfix = "_vec" + data_structure = "reinterpret_cast<" + template + "*>(" + name + postfix + ")" return name + "_buffer.Write(queue, " + name + "_size, " + data_structure + ");" @staticmethod @@ -206,7 +217,8 @@ class Routine: prefix = "const " if name in self.inputs else "" if name in self.inputs or name in self.outputs: data_type = "void" if flavour.is_non_standard() else flavour.buffer_type - a = [prefix + data_type + "* " + name + ""] + pointer = "" if name in self.scalar_buffers_second_non_pointer() else "*" + a = [prefix + data_type + pointer + " " + name + ""] c = ["const int " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] return [", ".join(a + c)] return [] @@ -553,13 +565,16 @@ class Routine: def arguments_def_netlib(self, flavour): """As above, but for the Netlib CBLAS API""" - return (self.options_def_c() + self.sizes_def_netlib() + + result=(self.options_def_c() + self.sizes_def_netlib() + self.scalar_def_void("alpha", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_first()])) + self.scalar_def_void("beta", flavour) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.buffers_second()])) + list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_second()])) + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + if self.name in self.routines_scalar_no_return(): + result += list(chain(*[self.buffer_def_pointer(b, flavour) for b in self.scalar_buffers_first()])) + return result def arguments_def_c(self, flavour): """As above, but for the C API""" @@ -654,11 +669,15 @@ class Routine: if output in self.index_buffers(): return_type = "int" break - if output in self.scalar_buffers_first(): + if output in self.scalar_buffers_first() and self.name not in self.routines_scalar_no_return(): return_type = flavour.buffer_type.replace("2", "") break indent = " " * (spaces + len(return_type) + self.length()) - result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + self.name + "(" + routine_name = self.name + if self.name in self.routines_scalar_no_return(): + routine_name += "_sub" + indent += " " + result = return_type + extra_qualifier + " cblas_" + flavour.name.lower() + routine_name + "(" result += (",\n" + indent).join([a for a in self.arguments_def_netlib(flavour)]) + ")" return result diff --git a/src/clblast_netlib_c.cpp b/src/clblast_netlib_c.cpp index efff1712..66852e31 100644 --- a/src/clblast_netlib_c.cpp +++ b/src/clblast_netlib_c.cpp @@ -107,7 +107,7 @@ void cblas_drotg(double* sa, void cblas_srotmg(float* sd1, float* sd2, float* sx1, - const float* sy1, + const float sy1, float* sparam) { auto device = get_device(); auto context = clblast::Context(device); @@ -118,11 +118,12 @@ void cblas_srotmg(float* sd1, const auto sx1_size = 1; const auto sparam_size = 1; auto sy1_buffer = clblast::Buffer<float>(context, sy1_size); + float sy1_vec[1]; sy1_vec[0] = sy1; auto sd1_buffer = clblast::Buffer<float>(context, sd1_size); auto sd2_buffer = clblast::Buffer<float>(context, sd2_size); auto sx1_buffer = clblast::Buffer<float>(context, sx1_size); auto sparam_buffer = clblast::Buffer<float>(context, sparam_size); - sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const float*>(sy1)); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const float*>(sy1_vec)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast<float*>(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast<float*>(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast<float*>(sx1)); @@ -145,7 +146,7 @@ void cblas_srotmg(float* sd1, void cblas_drotmg(double* sd1, double* sd2, double* sx1, - const double* sy1, + const double sy1, double* sparam) { auto device = get_device(); auto context = clblast::Context(device); @@ -156,11 +157,12 @@ void cblas_drotmg(double* sd1, const auto sx1_size = 1; const auto sparam_size = 1; auto sy1_buffer = clblast::Buffer<double>(context, sy1_size); + double sy1_vec[1]; sy1_vec[0] = sy1; auto sd1_buffer = clblast::Buffer<double>(context, sd1_size); auto sd2_buffer = clblast::Buffer<double>(context, sd2_size); auto sx1_buffer = clblast::Buffer<double>(context, sx1_size); auto sparam_buffer = clblast::Buffer<double>(context, sparam_size); - sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const double*>(sy1)); + sy1_buffer.Write(queue, sy1_size, reinterpret_cast<const double*>(sy1_vec)); sd1_buffer.Write(queue, sd1_size, reinterpret_cast<double*>(sd1)); sd2_buffer.Write(queue, sd2_size, reinterpret_cast<double*>(sd2)); sx1_buffer.Write(queue, sx1_size, reinterpret_cast<double*>(sx1)); @@ -722,9 +724,10 @@ double cblas_ddot(const int n, } // DOTU -float cblas_cdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_cdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -745,13 +748,12 @@ float cblas_cdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot)); - return dot[0].real(); } -double cblas_zdotu(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_zdotu_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -772,15 +774,14 @@ double cblas_zdotu(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot)); - return dot[0].real(); } // DOTC -float cblas_cdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_cdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -801,13 +802,12 @@ float cblas_cdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - float2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast<float2*>(dot)); - return dot[0].real(); } -double cblas_zdotc(const int n, - const void* x, const int x_inc, - const void* y, const int y_inc) { +void cblas_zdotc_sub(const int n, + const void* x, const int x_inc, + const void* y, const int y_inc, + void* dot) { auto device = get_device(); auto context = clblast::Context(device); auto queue = clblast::Queue(context, device); @@ -828,9 +828,7 @@ double cblas_zdotc(const int n, if (s != clblast::StatusCode::kSuccess) { throw std::runtime_error("CLBlast returned with error code " + clblast::ToString(s)); } - double2 dot[dot_size]; dot_buffer.Read(queue, dot_size, reinterpret_cast<double2*>(dot)); - return dot[0].real(); } // NRM2 |