scripts/generator/generator/pyclblast.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
# PEP8 Python style guide and uses a max-width of 120 characters per line.
#
# Author(s):
#   Cedric Nugteren <www.cedricnugteren.nl>

import os


# Line terminator appended to every line of generated output (the platform's os.linesep).
NL = os.linesep
# Banner line of '#' characters emitted directly above and below each routine's header comment by generate_pyx.
SEPARATOR = "####################################################################################################"


def to_np_dtype(flavour):
    """Return the NumPy dtype name that corresponds to ``flavour.precision_name``.

    Only the precision letters S, D, C, Z and H are known; any other value raises ``KeyError``.
    """
    precision_letters = ("S", "D", "C", "Z", "H")
    dtype_names = ("float32", "float64", "complex64", "complex128", "float16")
    dtype_by_precision = dict(zip(precision_letters, dtype_names))
    return dtype_by_precision[flavour.precision_name]


def cl_type(flavour):
    """Return the OpenCL scalar type name that corresponds to ``flavour.precision_name``.

    Only the precision letters S, D, C, Z and H are known; any other value raises ``KeyError``.
    """
    opencl_type_by_precision = dict(
        S="cl_float",
        D="cl_double",
        C="cl_float2",
        Z="cl_double2",
        H="cl_half",
    )
    precision = flavour.precision_name
    return opencl_type_by_precision[precision]


def scalar_cython_conversion(scalar, flavour):
    """Return a Cython expression that casts the Python scalar expression ``scalar`` to its OpenCL type.

    Args:
        scalar: Source text of the scalar expression, e.g. ``"alpha"``, ``"beta"`` or an indexed form such as
            ``"alphas[i]"``. Expressions starting with ``"alpha"`` use ``flavour.alpha_cl``; everything else uses
            ``flavour.beta_cl``.
        flavour: Object providing ``alpha_cl``, ``beta_cl`` and ``precision_name``.

    Returns:
        A string such as ``"<cl_float>alpha"`` or ``"<cl_float2>cl_float2(x=alpha.real,y=alpha.imag)"``.

    Raises:
        RuntimeError: If the selected scalar type is not a recognised float, double, half or complex type.
    """
    # The batched code path passes indexed expressions such as "alphas[i]". An exact comparison with "alpha"
    # would select the beta type for those, so the alpha/beta decision is made on the prefix instead.
    scalar_type = flavour.alpha_cl if scalar.startswith("alpha") else flavour.beta_cl
    if scalar_type == "float":
        return "<cl_float>" + scalar
    if scalar_type == "double":
        return "<cl_double>" + scalar
    # Complex scalars are passed as a two-component struct built from the Python complex's real/imag parts
    if scalar_type in ["cl_float2", "float2"]:
        return "<cl_float2>cl_float2(x=" + scalar + ".real,y=" + scalar + ".imag)"
    if scalar_type in ["cl_double2", "double2"]:
        return "<cl_double2>cl_double2(x=" + scalar + ".real,y=" + scalar + ".imag)"
    if scalar_type in ["cl_half", "half"]:
        return "<cl_half>" + scalar
    raise RuntimeError("Could not convert flavour '%s:%s'" % (flavour.precision_name, scalar_type))


def generate_pyx(routine):
    """Build the Cython (.pyx) wrapper source text for a single routine.

    The returned text consists of, in order: a banner comment, a ``cdef extern from "clblast_c.h"`` declaration with
    one C prototype per supported flavour, and a Python-level ``def`` that checks its buffer arguments, converts
    options and (for batched routines) Python lists into C values, dispatches on the buffers' dtype to the matching
    ``CLBlast<flavour><name>`` call, and finally returns ``cl.Event.from_int_ptr(<size_t>event)``.

    Args:
        routine: Routine description. This function reads its ``implemented``, ``level``, ``batched``,
            ``description``, ``flavours``, ``inputs``, ``outputs``, ``scalars`` and ``options`` attributes and calls
            ``plain_name()``, ``short_names()``, ``upper_name()``, ``arguments()``, ``arguments_def_c(flavour)``,
            ``arguments_python()`` and ``buffers_vector()``.

    Returns:
        The generated source as a string. An empty string is returned when the routine is skipped: not implemented,
        a falsy plain name, a level other than "1", "2a", "2b", "3" or "x", or a level-"x" routine with
        ``batched == 0``.

    Raises:
        RuntimeError: Propagated from ``scalar_cython_conversion`` when a flavour's scalar type is not recognised.
    """
    result = ""
    if routine.implemented and routine.plain_name() and routine.level in ["1", "2a", "2b", "3", "x"]:
        if routine.level == "x" and routine.batched == 0:
            return result # level-X routines that are non-batched are not supported at the moment
        indent = "    "

        # Banner: separator, "# <description>: <short names>", separator, blank line
        result += SEPARATOR + NL
        result += "# " + routine.description + ": " + routine.short_names() + NL
        result += SEPARATOR + NL
        result += NL

        # Reference C definition
        result += "cdef extern from \"clblast_c.h\":" + NL
        # Collects the NumPy dtype name of every emitted flavour; passed to the generated check_dtype() call below
        np_dtypes = []
        for flavour in routine.flavours:
            # Only the five precisions known to to_np_dtype()/cl_type() are wrapped; other flavours are skipped
            if flavour.precision_name in ["S", "D", "C", "Z", "H"]:
                result += indent + "CLBlastStatusCode CLBlast" + flavour.name + routine.plain_name() + "("
                result += ", ".join(routine.arguments_def_c(flavour)) + ","
                result += "cl_command_queue* queue, cl_event* event)" + NL
                np_dtypes.append(to_np_dtype(flavour))
        result += "" + NL

        # Function definition
        # Copies of the input and output buffer names, inputs first; reused by every per-buffer section below
        buffers = routine.inputs[:] + routine.outputs[:]
        result += "def " + routine.plain_name() + "(queue, "
        result += ", ".join(routine.arguments_python()) + "):" + NL

        # Documentation
        result += indent + "\"\"\"" + NL
        result += indent + "x" + routine.upper_name() + ": " + routine.description + NL
        result += indent + "\"\"\"" + NL
        result += NL

        # Data types and checks
        result += indent + "dtype = check_dtype([" + ", ".join(buffers) + "], "
        result += "[" + ", ".join(['"%s"' % d for d in np_dtypes]) + "])" + NL
        for buf in buffers:
            # Buffers listed by routine.buffers_vector() get a vector check, all others a matrix check
            if buf in routine.buffers_vector():
                result += indent + "check_vector("
            else:
                result += indent + "check_matrix("
            result += buf + ", \"" + buf + "\")" + NL
        result += NL

        # Batched checks
        if routine.batched == 1:  # batched but not strided-batched
            # Names of the per-batch Python lists: "<buffer>_offsets" for each buffer and "<scalar>s" for each scalar
            lists = [b + "_offsets" for b in buffers] + [s + "s" for s in routine.scalars]
            # Generated check is a chained comparison "len(a) != len(b) != ..." over all batch-sized lists
            result += indent + "if " + " != ".join(["len(" + l + ")" for l in lists]) + ":" + NL
            result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: length of batch-sized arguments " + ", ".join(lists) + " should be equal\")" + NL
            result += indent + "batch_count = len(" + lists[0] + ")" + NL
            result += NL

            # Batched list to pointer conversions
            for buf in buffers:
                # Emits a PyMem_Malloc'ed size_t array "<buf>_offsets_c" filled element-wise from "<buf>_offsets"
                result += indent + "cdef size_t *" + buf + "_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))" + NL
                result += indent + "for i in range(batch_count):" + NL
                result += indent + indent + "" + buf + "_offsets_c[i] = " + buf + "_offsets[i]" + NL
            for scalar in routine.scalars:
                # Emits an untyped "<scalar>s_c" allocation; the fill loop casts it per dtype before storing
                result += indent + "cdef void *" + scalar + "s_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))" + NL
                result += indent + "for i in range(batch_count):" + NL
                # Empty for the first emitted branch ("if"), then "el" so the following branches become "elif"
                if_prefix = ""
                for flavour in routine.flavours:
                    if flavour.precision_name in ["S", "D", "C", "Z", "H"]:
                        np_dtype = to_np_dtype(flavour)
                        result += indent + indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL
                        scalar_converted = scalar_cython_conversion(scalar + "s[i]", flavour)
                        result += indent + indent + indent + "(<" + cl_type(flavour) + "*>" + scalar + "s_c)[i] = " + scalar_converted + NL
                        if_prefix = "el"

            result += NL

        # Buffer transformation
        for buf in buffers:
            # Generated code reinterprets each buffer's "base_data.int_ptr" as a cl_mem handle
            result += indent + "cdef cl_mem " + buf + "_buffer = <cl_mem><size_t>" + buf + ".base_data.int_ptr" + NL
        result += NL

        result += indent + "cdef cl_command_queue command_queue = <cl_command_queue><size_t>queue.int_ptr" + NL
        result += indent + "cdef cl_event event = NULL" + NL

        # Map the Python-level boolean arguments onto the CLBlast enum constants, one line per option present
        for option in routine.options:
            if option == "a_transpose":
                result += indent + "a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo" + NL
            if option == "b_transpose":
                result += indent + "b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo" + NL
            if option == "ab_transpose":
                result += indent + "ab_transpose = CLBlastTransposeYes if ab_transp else CLBlastTransposeNo" + NL
            if option == "side":
                result += indent + "side = CLBlastSideRight if right_side else CLBlastSideLeft" + NL
            if option == "triangle":
                result += indent + "triangle = CLBlastTriangleLower if lower_triangle else CLBlastTriangleUpper" + NL
            if option == "diagonal":
                result += indent + "diagonal = CLBlastDiagonalUnit if unit_diagonal else CLBlastDiagonalNonUnit" + NL

        result += "" + NL
        result += indent + "cdef CLBlastStatusCode err" + NL
        # One generated "if"/"elif" branch per supported flavour, each calling that flavour's C entry point
        if_prefix = ""
        for flavour in routine.flavours:
            if flavour.precision_name in ["S", "D", "C", "Z", "H"]:
                np_dtype = to_np_dtype(flavour)
                # The replacements below are plain substring substitutions on each name from routine.arguments():
                # "layout" becomes the row-major constant and scalars become their Cython cast expressions.
                if routine.batched != 1:  # regular or strided-batched
                    argument_names = [x.
                                      replace("layout", "CLBlastLayoutRowMajor").
                                      replace("alpha", scalar_cython_conversion("alpha", flavour)).
                                      replace("beta", scalar_cython_conversion("beta", flavour))
                                      for x in routine.arguments()]
                # Batched: arguments are redirected to the "_c" arrays allocated earlier and the untyped scalar
                # arrays are cast to the flavour's pointer type. Presumably routine.arguments() yields names with a
                # "_cpp" suffix for the batch-sized lists - TODO confirm against the routine class.
                else:  # batched but not strided-batched
                    argument_names = [x.
                                      replace("layout", "CLBlastLayoutRowMajor").
                                      replace("_cpp", "_c").
                                      replace("_offsets", "_offsets_c").
                                      replace("alphas_c", "<" + cl_type(flavour) + "*>alphas_c").
                                      replace("betas_c", "<" + cl_type(flavour) + "*>betas_c")
                                      for x in routine.arguments()]
                # Both batched variants (any batched value above 0) take a trailing batch_count argument
                if routine.batched > 0:
                    argument_names.append("batch_count")
                result += indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL
                result += indent + indent + "err = CLBlast" + flavour.name + routine.plain_name()
                result += "(" + ", ".join(argument_names) + ", &command_queue, &event)" + NL
                if_prefix = "el"

        # Closes the generated if/elif chain: an unmatched dtype raises ValueError in the generated code.
        # NOTE(review): this generated raise is emitted before the PyMem_Free lines below, so in the batched case
        # the arrays allocated with PyMem_Malloc are not released on that path.
        result += indent + "else:" + NL
        result += indent + indent + "raise ValueError(\"PyCLBlast: Unrecognized data-type '%s'\" % dtype)" + NL
        result += NL

        # Cleaning up
        if routine.batched == 1:  # batched but not strided-batched
            # "<buf>_offset" + "s_c" and "<scalar>" + "s_c" rebuild the names of the arrays allocated above
            for array in [b + "_offset" for b in buffers] + routine.scalars:
                result += indent + "PyMem_Free(" + array + "s_c)" + NL
            result += NL

        # Generated epilogue: turn a non-success status into RuntimeError, otherwise wrap and return the event
        result += indent + "if err != CLBlastSuccess:" + NL
        result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: %s\" % get_status_message(err))" + NL
        result += indent + "return cl.Event.from_int_ptr(<size_t>event)" + NL
        result += NL

    return result