4 files changed, 598 insertions, 0 deletions
diff --git a/scripts/benchmark/benchmark.py b/scripts/benchmark/benchmark.py
new file mode 100644
index 00000000..1574fdc4
--- /dev/null
+++ b/scripts/benchmark/benchmark.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
+# PEP8 Python style guide and uses a max-width of 120 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+
+import argparse
+import json
+import os
+import sys
+
+import settings
+import plot
+import utils
+
+# Maps each name accepted by the '--benchmark' option to its experiment description in the settings module. Every
+# settings constant is named after the upper-cased command-line name, so the table is built from the names alone.
+EXPERIMENTS = dict(
+    (experiment_name, getattr(settings, experiment_name.upper()))
+    for experiment_name in (
+        "axpy", "gemv", "gemm", "gemm_small",
+        "symm", "syrk", "summary",
+    )
+)
+
+
+def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
+    """Runs the client binary of routine 'name' once per entry of 'arguments_list' and returns all result rows.
+
+    Each entry of 'arguments_list' is a dictionary of additional client options. For half-precision (16) the binary
+    is run a second time in single-precision and those numbers are merged into the half-precision rows.
+    """
+    binary = "./clblast_client_x" + name
+
+    # Loops over sub-benchmarks per benchmark
+    results = []
+    for arguments in arguments_list:
+
+        # Sets the arguments: the fixed options first, followed by the options of this sub-benchmark
+        all_arguments = ["-platform %s" % platform, "-device %s" % device,
+                         "-precision %d" % precision, "-runs %d" % num_runs,
+                         "-warm_up", "-q", "-no_abbrv", "-cblas 0"]
+        all_arguments.extend("-" + key + " " + str(value) for key, value in arguments.items())
+
+        # Calls the binary and parses the results
+        result = utils.parse_results(utils.run_binary(binary, all_arguments))
+
+        # For half-precision: also runs single-precision for comparison. The '_2' entries are overwritten by the
+        # single-precision numbers, whereas the single-precision '_1' entries are stored under new '_FP32' keys.
+        if precision == 16:
+            all_arguments = [arg if arg != "-precision 16" else "-precision 32" for arg in all_arguments]
+            result_extra = utils.parse_results(utils.run_binary(binary, all_arguments))
+            for row, row_extra in zip(result, result_extra):  # pairs up rows, stops at the shortest of both lists
+                row["GBs_1_FP32"], row["GBs_2"] = row_extra["GBs_1"], row_extra["GBs_2"]
+                row["GFLOPS_1_FP32"], row["GFLOPS_2"] = row_extra["GFLOPS_1"], row_extra["GFLOPS_2"]
+
+        results.extend(result)
+    return results
+
+
+def main(argv):
+    """Runs the selected experiment (or loads its stored results) and plots it; 'argv' excludes the script name."""
+
+    # Parses the command-line arguments
+    benchmark_names = ", ".join(sorted(EXPERIMENTS))
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-b", "--benchmark", help="The benchmark to perform (choose from %s)" % benchmark_names)
+    parser.add_argument("-p", "--platform", help="The ID of the OpenCL platform to test on")
+    parser.add_argument("-d", "--device", help="The ID of the OpenCL device to test on")
+    parser.add_argument("-n", "--num_runs", type=int, default=10, help="The number of benchmark repeats for averaging")
+    parser.add_argument("-x", "--precision", type=int, default=32,
+                        help="The precision to test for (choose from 16, 32, 64, 3232, 6464)")
+    parser.add_argument("-l", "--load_from_disk", action="store_true", help="Load old results from disk if available")
+    parser.add_argument("-t", "--plot_title", default=None, help="The title for the plots, defaults to benchmark name")
+    parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
+    cl_args = parser.parse_args(argv)
+
+    # Retrieves the benchmark settings, validating the name before it is used (it is None when '-b' is omitted)
+    if cl_args.benchmark not in EXPERIMENTS:
+        print("[benchmark] Invalid benchmark '%s', choose from %s" % (cl_args.benchmark, benchmark_names))
+        return
+    experiment = EXPERIMENTS[cl_args.benchmark]
+    benchmarks = experiment["benchmarks"]
+
+    # The benchmark name and plot title
+    benchmark_name = utils.precision_to_letter(cl_args.precision) + cl_args.benchmark.upper()
+    if cl_args.plot_title is None:
+        cl_args.plot_title = benchmark_name
+
+    # Either run the benchmarks for this experiment or load old results from disk
+    json_file_name = benchmark_name.lower() + "_benchmarks.json"
+    if cl_args.load_from_disk and os.path.isfile(json_file_name):
+        print("[benchmark] Loading previous benchmark results from '" + json_file_name + "'")
+        with open(json_file_name) as f:
+            results = json.load(f)
+    else:
+        # Runs all the individual benchmarks
+        print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), cl_args.benchmark))
+        results = {"label_names": experiment["label_names"], "num_rows": experiment["num_rows"],
+                   "num_cols": experiment["num_cols"], "benchmarks": []}
+        for benchmark in benchmarks:
+            result = run_benchmark(benchmark["name"], benchmark["arguments"], cl_args.precision, cl_args.num_runs,
+                                   cl_args.platform, cl_args.device)
+            results["benchmarks"].append(result)
+
+        # Stores the results to disk, in text mode because json.dump writes 'str' objects
+        print("[benchmark] Saving benchmark results to '" + json_file_name + "'")
+        with open(json_file_name, "w") as f:
+            json.dump(results, f, sort_keys=True, indent=4)
+
+    # Retrieves the data from the benchmark settings
+    pdf_file_name = benchmark_name.lower() + "_plot.pdf"
+    titles = [utils.precision_to_letter(cl_args.precision) + b["name"].upper() + " " + b["title"] for b in benchmarks]
+    x_keys = [b["x_keys"] for b in benchmarks]
+    y_keys = [b["y_keys"] for b in benchmarks]
+    x_labels = [b["x_label"] for b in benchmarks]
+    y_labels = [b["y_label"] for b in benchmarks]
+    label_names = results["label_names"]
+
+    # For half-precision: also adds single-precision results for comparison
+    if cl_args.precision == 16:
+        label_names = ["CLBlast FP16", "clBLAS FP32", "CLBlast FP32"]
+        y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys]
+
+    # Plots the graphs
+    plot.plot_graphs(results["benchmarks"], pdf_file_name, results["num_rows"], results["num_cols"],
+                     x_keys, y_keys, titles, x_labels, y_labels, label_names, cl_args.plot_title, cl_args.verbose)
+
+    print("[benchmark] All done")
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])  # the script name (first entry) is not passed on to the argument parser
diff --git a/scripts/benchmark/plot.py b/scripts/benchmark/plot.py
new file mode 100644
index 00000000..dc4800fe
--- /dev/null
+++ b/scripts/benchmark/plot.py
@@ -0,0 +1,76 @@
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
+# PEP8 Python style guide and uses a max-width of 120 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+
+import utils
+
+import matplotlib.pyplot as plt
+
+
+BLUEISH = [71 / 255.0, 101 / 255.0, 177 / 255.0]  # #4765b1
+REDISH = [214 / 255.0, 117 / 255.0, 104 / 255.0]  # #d67568
+PURPLISH = [85 / 255.0, 0 / 255.0, 119 / 255.0]  # #550077
+COLORS = [BLUEISH, REDISH, PURPLISH]  # line color per plotted y-key, in order
+MARKERS = ["o-", "x-", ".-"]  # plot format string per plotted y-key, in order
+
+
+def plot_graphs(results, file_name, num_rows, num_cols,
+                x_keys, y_keys, titles, x_labels, y_labels,
+                label_names, title, verbose):
+    """Plots one subplot per entry of 'results' on a grid, saves the figure to 'file_name' and shows it.
+
+    The lists 'x_keys', 'y_keys', 'titles', 'x_labels' and 'y_labels' hold one item per subplot, 'label_names' holds
+    one legend label per y-key and 'title' is the title of the whole figure. 'verbose' is currently unused.
+    """
+    assert len(results) == num_rows * num_cols
+    for per_subplot in (x_keys, y_keys, titles, x_labels, y_labels):
+        assert len(per_subplot) == len(results)
+
+    # Initializes the plot. Without squeezing 'axes' is always a 2D array, such that a 1x1 grid works as well
+    size = (6 * num_cols, 6 * num_rows)
+    fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=size, squeeze=False, facecolor='w', edgecolor='k')
+    fig.text(.5, .93, title, horizontalalignment="center", fontsize=18)
+
+    # Loops over each subplot
+    for index, result in enumerate(results):
+        ax = axes.flat[index]
+        plt.sca(ax)
+        print("[plot] Plotting subplot %d" % index)
+
+        # Sets the x-axis labels: one tick per result row, made of the comma-separated values of all x-keys
+        x_list = [[r[x_key] for r in result] for x_key in x_keys[index]]
+        x_ticks = [",".join([utils.float_to_kilo_mega(v) for v in values]) for values in zip(*x_list)]
+        x_location = range(len(x_ticks))
+
+        # Sets the y-data
+        y_list = [[r[y_key] for r in result] for y_key in y_keys[index]]
+        y_max = max([max(y) for y in y_list])
+
+        # Sets the axes: the upper limit is 20% above the maximum, raised to the next multiple of 'y_rounding'
+        y_rounding = 10 if y_max < 80 else 50 if y_max < 400 else 200
+        y_axis_limit = (y_max * 1.2) - ((y_max * 1.2) % y_rounding) + y_rounding
+        plt.ylim(ymin=0, ymax=y_axis_limit)
+        plt.xticks(x_location, x_ticks, rotation='vertical')
+
+        # Sets the labels
+        ax.set_title(titles[index], fontsize=14, y=0.93)
+        ax.set_ylabel(y_labels[index], fontsize=14)
+        ax.set_xlabel(x_labels[index], fontsize=14)
+        ax.xaxis.set_label_coords(0.5, 0.06)
+
+        # Plots the graph
+        assert len(COLORS) >= len(y_keys[index])
+        assert len(MARKERS) >= len(y_keys[index])
+        assert len(label_names) == len(y_keys[index])
+        for i in range(len(y_keys[index])):
+            ax.plot(x_location, y_list[i], MARKERS[i], label=label_names[i], color=COLORS[i])
+
+        # Sets the legend
+        leg = ax.legend(loc=(0.02, 0.88 - 0.05 * len(y_keys[index])))
+        leg.draw_frame(False)
+
+    # Saves the plot to disk
+    fig.savefig(file_name, bbox_inches='tight')
+    plt.show()
diff --git a/scripts/benchmark/settings.py b/scripts/benchmark/settings.py
new file mode 100644
index 00000000..0243832f
--- /dev/null
+++ b/scripts/benchmark/settings.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python
+
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
+# PEP8 Python style guide and uses a max-width of 120 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+
+import utils
+
+
+AXPY = {  # Experiment with six AXPY sub-benchmarks, each of which becomes one subplot
+    "label_names": ["CLBlast", "clBLAS"],  # legend labels, one per entry of "y_keys"
+    "num_rows": 2, "num_cols": 3,  # subplot grid, must hold exactly as many plots as there are "benchmarks"
+    "benchmarks": [
+        {
+            "name": "axpy",  # suffix of the client binary: ./clblast_client_x<name>
+            "title": "multiples of 256K",
+            "x_label": "vector sizes (n)", "x_keys": ["n"],  # result fields used for the x-axis ticks
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],  # result fields that are plotted
+            "arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}],
+        },  # each dictionary in "arguments" is one run of the binary, its entries are passed as '-key value'
+        {  # NOTE(review): "step"/"num_steps" presumably make the client itself sweep over sizes - confirm
+            "name": "axpy",
+            "title": "multiples of 256K+1",
+            "x_label": "vector sizes (n)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}],
+        },
+        {
+            "name": "axpy",
+            "title": "around n=1M",
+            "x_label": "vector sizes (n)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "axpy",
+            "title": "around n=16M",
+            "x_label": "vector sizes (n)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "axpy",
+            "title": "strides (n=8M)",
+            "x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": utils.m(8), "incx": inc_x, "incy": inc_y, "step": 0, "num_steps": 1}
+                          for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
+        },
+        {
+            "name": "axpy",
+            "title": "powers of 2",
+            "x_label": "vector sizes (n)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
+                          for n in utils.powers_of_2(utils.k(32), utils.m(64))],
+        }
+    ]
+}
+
+GEMV = {  # Experiment with six GEMV sub-benchmarks, each of which becomes one subplot
+    "label_names": ["CLBlast", "clBLAS"],  # legend labels, one per entry of "y_keys"
+    "num_rows": 2, "num_cols": 3,  # subplot grid, must hold exactly as many plots as there are "benchmarks"
+    "benchmarks": [
+        {
+            "name": "gemv",  # suffix of the client binary: ./clblast_client_x<name>
+            "title": "multiples of 256",
+            "x_label": "matrix/vector sizes (n=m)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}],
+        },  # NOTE(review): "layout" 101/102 presumably are the CBLAS row-/column-major values - confirm
+        {
+            "name": "gemv",
+            "title": "multiples of 257",
+            "x_label": "matrix/vector sizes (n=m)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}],
+        },
+        {
+            "name": "gemv",
+            "title": "around n=m=4K",
+            "x_label": "matrix/vector sizes (n=m)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "gemv",
+            "title": "multiples of 256 rotated",  # same sweep as the first sub-benchmark, with "layout" 101
+            "x_label": "matrix/vector sizes (n=m)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}],
+        },
+        {
+            "name": "gemv",
+            "title": "multiples of 257 rotated",  # same sweep as the second sub-benchmark, with "layout" 101
+            "x_label": "matrix/vector sizes (n=m)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}],
+        },
+        {
+            "name": "gemv",
+            "title": "strides (n=m=4K)",
+            "x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"n": 4096, "m": 4096, "incx": inc_x, "incy": inc_y, "layout": 102, "step": 0, "num_steps": 1}
+                          for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
+        }
+    ]
+}
+
+GEMM = {  # Experiment with six GEMM sub-benchmarks, each of which becomes one subplot
+    "label_names": ["CLBlast", "clBLAS"],  # legend labels, one per entry of "y_keys"
+    "num_rows": 2, "num_cols": 3,  # subplot grid, must hold exactly as many plots as there are "benchmarks"
+    "benchmarks": [
+        {
+            "name": "gemm",  # suffix of the client binary: ./clblast_client_x<name>
+            "title": "multiples of 128",
+            "x_label": "matrix sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 128, "num_steps": 20}],
+        },  # NOTE(review): 101/102 and 111/112 presumably are CBLAS layout and transpose values - confirm
+        {
+            "name": "gemm",
+            "title": "multiples of 129",
+            "x_label": "matrix sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 129, "n": 129, "k": 129, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 129, "num_steps": 20}],
+        },
+        {
+            "name": "gemm",
+            "title": "around m=n=k=512",
+            "x_label": "matrix sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 512, "n": 512, "k": 512, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "gemm",
+            "title": "around m=n=k=2048",
+            "x_label": "matrix sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 2048, "n": 2048, "k": 2048, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "gemm",
+            "title": "layouts/transposing (m=n=k=1K)",  # one run per combination of the three varied options
+            "x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 1024, "n": 1024, "k": 1024, "layout": layout,
+                           "transA": transA, "transB": transB, "step": 0, "num_steps": 1}
+                          for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]],
+        },
+        {
+            "name": "gemm",
+            "title": "powers of 2",
+            "x_label": "matrix sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": n, "n": n, "k": n, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 0, "num_steps": 1}
+                          for n in utils.powers_of_2(8, utils.k(4))],
+        }
+    ]
+}
+
+GEMM_SMALL = {  # Experiment with two GEMM sweeps that both start at m=n=k=128, plotted below each other
+    "label_names": ["CLBlast", "clBLAS"],  # legend labels, one per entry of "y_keys"
+    "num_rows": 2, "num_cols": 1,  # subplot grid, must hold exactly as many plots as there are "benchmarks"
+    "benchmarks": [
+        {
+            "name": "gemm",  # suffix of the client binary: ./clblast_client_x<name>
+            "title": "small matrices in steps of 16",
+            "x_label": "matrix sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 16, "num_steps": 57}],
+        },
+        {
+            "name": "gemm",
+            "title": "small matrices in steps of 1",
+            "x_label": "matrix sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 1, "num_steps": 385}],
+        },
+
+    ]
+}
+
+SYMM = {  # Experiment with six SYMM sub-benchmarks, each of which becomes one subplot
+    "label_names": ["CLBlast", "clBLAS"],  # legend labels, one per entry of "y_keys"
+    "num_rows": 2, "num_cols": 3,  # subplot grid, must hold exactly as many plots as there are "benchmarks"
+    "benchmarks": [
+        {
+            "name": "symm",  # suffix of the client binary: ./clblast_client_x<name>
+            "title": "multiples of 128",
+            "x_label": "matrix sizes (m=n)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 128, "n": 128, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
+        },  # NOTE(review): 141/142 and 121/122 presumably are CBLAS side and triangle values - confirm
+        {
+            "name": "symm",
+            "title": "multiples of 129",
+            "x_label": "matrix sizes (m=n)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 129, "n": 129, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
+        },
+        {
+            "name": "symm",
+            "title": "around m=n=512",
+            "x_label": "matrix sizes (m=n)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 512, "n": 512, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "symm",
+            "title": "around m=n=2048",
+            "x_label": "matrix sizes (m=n)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 2048, "n": 2048, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "symm",
+            "title": "layouts/sides/triangles (m=n=1K)",  # one run per combination of the three varied options
+            "x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": 1024, "n": 1024, "layout": layout,
+                           "side": side, "triangle": triangle, "step": 0, "num_steps": 1}
+                          for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]],
+        },
+        {
+            "name": "symm",
+            "title": "powers of 2",
+            "x_label": "matrix sizes (m=n)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"m": n, "n": n, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 0, "num_steps": 1}
+                          for n in utils.powers_of_2(8, utils.k(4))],
+        }
+    ]
+}
+
+SYRK = {  # Experiment with six SYRK sub-benchmarks, each of which becomes one subplot
+    "label_names": ["CLBlast", "clBLAS"],  # legend labels, one per entry of "y_keys"
+    "num_rows": 2, "num_cols": 3,  # subplot grid, must hold exactly as many plots as there are "benchmarks"
+    "benchmarks": [
+        {
+            "name": "syrk",  # suffix of the client binary: ./clblast_client_x<name>
+            "title": "multiples of 128",
+            "x_label": "matrix sizes (n=k)", "x_keys": ["n"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"n": 128, "k": 128, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
+        },  # NOTE(review): "side" looks copied from SYMM, the option sweep below varies "transA" instead - confirm
+        {
+            "name": "syrk",
+            "title": "multiples of 129",
+            "x_label": "matrix sizes (n=k)", "x_keys": ["n"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"n": 129, "k": 129, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
+        },
+        {
+            "name": "syrk",
+            "title": "around n=k=512",
+            "x_label": "matrix sizes (n=k)", "x_keys": ["n"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"n": 512, "k": 512, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "syrk",
+            "title": "around n=k=2048",
+            "x_label": "matrix sizes (n=k)", "x_keys": ["n"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"n": 2048, "k": 2048, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
+        },
+        {
+            "name": "syrk",
+            "title": "layouts/triangles/transposing (n=k=1K)",  # names the three options that are varied below
+            "x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"n": 1024, "k": 1024, "layout": layout,
+                           "triangle": triangle, "transA": transA, "step": 0, "num_steps": 1}
+                          for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]],
+        },
+        {
+            "name": "syrk",
+            "title": "powers of 2",
+            "x_label": "matrix sizes (n=k)", "x_keys": ["n"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"n": n, "k": n, "layout": 102,
+                           "side": 141, "triangle": 121, "step": 0, "num_steps": 1}
+                          for n in utils.powers_of_2(8, utils.k(4))],
+        }
+    ]
+}
+
+# Overview experiment made of the first two sub-benchmarks of the AXPY, GEMV, GEMM and SYMM experiments, in that
+# order. The list holds the very same dictionaries as those experiments, not copies. The grid has two columns, so
+# there are as many rows as there are source experiments.
+SUMMARY = {
+    "label_names": ["CLBlast", "clBLAS"],
+    "num_rows": 4,
+    "num_cols": 2,
+    "benchmarks": (
+        AXPY["benchmarks"][:2] +
+        GEMV["benchmarks"][:2] +
+        GEMM["benchmarks"][:2] +
+        SYMM["benchmarks"][:2]
+    ),
+}
diff --git a/scripts/benchmark/utils.py b/scripts/benchmark/utils.py
new file mode 100644
index 00000000..62e18de2
--- /dev/null
+++ b/scripts/benchmark/utils.py
@@ -0,0 +1,66 @@
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
+# PEP8 Python style guide and uses a max-width of 120 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+
+import csv
+import subprocess
+
+
+def k(value):  # Scales a count given in units of 1024 ('kilo') to a plain number, e.g. k(256) == 262144
+    return 1024 * value
+
+
+def m(value):  # Scales a count given in units of 1024 * 1024 ('mega') to a plain number, e.g. m(1) == 1048576
+    return value * 1024 ** 2
+
+
+def float_to_kilo_mega(value):  # Tick label: exact multiples above 1024 / 1024^2 get a K / M suffix, e.g. "2K"
+    kilo, mega = 1024, 1024 * 1024
+    if value > mega and value % mega == 0:
+        return "%.0fM" % (value / float(mega))
+    if value > kilo and value % kilo == 0:
+        return "%.0fK" % (value / float(kilo))
+    return "%.0f" % value
+
+
+def powers_of_2(start, stop):  # Yields start, 2 * start, 4 * start, ... for as long as the value is at most stop
+    while not start > stop:
+        yield start
+        start += start
+
+
+def precision_to_letter(precision):
+    """Returns the single-letter name of a numeric precision code, used as prefix of routine names.
+
+    The known codes are 16 (H), 32 (S), 64 (D), 3232 (C) and 6464 (Z); any other value is mapped to "X".
+    """
+    letters = {
+        16: "H",
+        32: "S",
+        64: "D",
+        3232: "C",
+        6464: "Z",
+    }
+    return letters.get(precision, "X")
+
+
+def run_binary(command, arguments):  # Runs the command in a shell; returns its stdout as text, or False on failure
+    call = command + " " + " ".join(arguments)
+    print("[benchmark] Calling binary: %s" % str(call))
+    try:  # text mode avoids bytes on Python 3; communicate() reads all output and waits for the process to exit
+        return subprocess.Popen(call, shell=True, stdout=subprocess.PIPE, universal_newlines=True).communicate()[0]
+    except OSError as e:
+        print("[benchmark] Error while running the binary, got exception: %s" % str(e))
+        return False
+
+
+def parse_results(csv_data):  # Parses the ';'-separated output of the client into one dictionary per result row
+    if not csv_data:  # e.g. the False that run_binary returns upon failure: there is nothing to parse
+        return []
+    results = list(csv.DictReader(csv_data.split("\n"), delimiter=";", skipinitialspace=True))
+    for result in results:
+        for key in result:  # fields with a decimal point become floats, all others integers
+            result[key] = float(result[key]) if "." in result[key] else int(result[key])
+    return results