8 files changed, 219 insertions, 74 deletions
diff --git a/scripts/benchmark/benchmark.py b/scripts/benchmark/benchmark.py
index 31aa8c4f..c4054669 100644
--- a/scripts/benchmark/benchmark.py
+++ b/scripts/benchmark/benchmark.py
@@ -17,9 +17,11 @@ import utils
 
 EXPERIMENTS = {
     "axpy": settings.AXPY,
+    "axpybatched": settings.AXPYBATCHED,
     "gemv": settings.GEMV,
     "gemm": settings.GEMM,
     "gemm_small": settings.GEMM_SMALL,
+    "gemmbatched": settings.GEMMBATCHED,
     "symm": settings.SYMM,
     "syrk": settings.SYRK,
     "summary": settings.SUMMARY,
@@ -60,51 +62,59 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
     return results
 
 
-def main(argv):
-
-    # Parses the command-line arguments
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % EXPERIMENTS.keys())
+def parse_arguments(argv):
+    parser = argparse.ArgumentParser(description="Runs a full benchmark for a specific routine on a specific device")
+    parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % sorted(EXPERIMENTS.keys()))
     parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
     parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
-    parser.add_argument("-n", "--num_runs", type=int, default=10, help="The number of benchmark repeats for averaging")
-    parser.add_argument("-x", "--precision", type=int, default=32,
-                        help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
+    parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging")
+    parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464)")
     parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script")
-    parser.add_argument("-t", "--plot_title", default=None, help="The title for the plots, defaults to benchmark name")
+    parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name")
     parser.add_argument("-z", "--tight_plot", action="store_true", help="Enables tight plot layout for in paper or presentation")
+    parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
     parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
     cl_args = parser.parse_args(argv)
+    return vars(cl_args)
+
+
+def benchmark_single(benchmark, platform, device, num_runs, precision, load_from_disk,
+                     plot_title, tight_plot, output_folder, verbose):
+
+    # Sanity check
+    if not os.path.isdir(output_folder):
+        print("[benchmark] Error: folder '%s' doesn't exist" % output_folder)
+        return
 
     # The benchmark name and plot title
-    benchmark_name = utils.precision_to_letter(cl_args.precision) + cl_args.benchmark.upper()
-    if cl_args.plot_title is None:
-        cl_args.plot_title = benchmark_name
+    benchmark_name = utils.precision_to_letter(precision) + benchmark.upper()
+    plot_title = benchmark_name if not plot_title else benchmark_name + ": " + plot_title  # "" or None
 
     # Retrieves the benchmark settings
-    if cl_args.benchmark not in EXPERIMENTS.keys():
-        print("[benchmark] Invalid benchmark '%s', choose from %s" % (cl_args.benchmark, EXPERIMENTS.keys()))
+    if benchmark not in EXPERIMENTS:
+        print("[benchmark] Invalid benchmark '%s', choose from %s" % (benchmark, sorted(EXPERIMENTS.keys())))
         return
-    experiment = EXPERIMENTS[cl_args.benchmark]
+    experiment = EXPERIMENTS[benchmark]
     benchmarks = experiment["benchmarks"]
 
     # Either run the benchmarks for this experiment or load old results from disk
-    json_file_name = benchmark_name.lower() + "_benchmarks.json"
-    if cl_args.load_from_disk and os.path.isfile(json_file_name):
+    json_file_name = os.path.join(output_folder, benchmark_name.lower() + "_benchmarks.json")
+    if load_from_disk and os.path.isfile(json_file_name):
         print("[benchmark] Loading previous benchmark results from '" + json_file_name + "'")
         with open(json_file_name) as f:
             results = json.load(f)
     else:
 
         # Runs all the individual benchmarks
-        print("[benchmark] Running on platform %d, device %d" % (cl_args.platform, cl_args.device))
-        print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), cl_args.benchmark))
+        print("[benchmark] Running on platform %d, device %d" % (platform, device))
+        print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), benchmark))
         results = {"label_names": experiment["label_names"], "num_rows": experiment["num_rows"],
                    "num_cols": experiment["num_cols"], "benchmarks": []}
-        for benchmark in benchmarks:
-            print("[benchmark] Running benchmark '%s:%s'" % (benchmark["name"], benchmark["title"]))
-            result = run_benchmark(benchmark["name"], benchmark["arguments"], cl_args.precision, cl_args.num_runs,
-                                   cl_args.platform, cl_args.device)
+        for bench in benchmarks:
+            num_runs_benchmark = bench["num_runs"] if num_runs is None else num_runs
+            print("[benchmark] Running benchmark '%s:%s'" % (bench["name"], bench["title"]))
+            result = run_benchmark(bench["name"], bench["arguments"], precision, num_runs_benchmark,
+                                   platform, device)
             results["benchmarks"].append(result)
 
         # Stores the results to disk
@@ -113,8 +123,9 @@ def main(argv):
             json.dump(results, f, sort_keys=True, indent=4)
 
     # Retrieves the data from the benchmark settings
-    pdf_file_name = benchmark_name.lower() + "_plot.pdf"
-    titles = [utils.precision_to_letter(cl_args.precision) + b["name"].upper() + " " + b["title"] for b in benchmarks]
+    file_name_suffix = "_tight" if tight_plot else ""
+    pdf_file_name = os.path.join(output_folder, benchmark_name.lower() + "_plot" + file_name_suffix + ".pdf")
+    titles = [utils.precision_to_letter(precision) + b["name"].upper() + " " + b["title"] for b in benchmarks]
     x_keys = [b["x_keys"] for b in benchmarks]
     y_keys = [b["y_keys"] for b in benchmarks]
     x_labels = [b["x_label"] for b in benchmarks]
@@ -122,17 +133,18 @@ def main(argv):
     label_names = results["label_names"]
 
     # For half-precision: also adds single-precision results for comparison
-    if cl_args.precision == 16:
+    if precision == 16:
         label_names = ["CLBlast FP16", "clBLAS FP32", "CLBlast FP32"]
         y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys]
 
     # Plots the graphs
     plot.plot_graphs(results["benchmarks"], pdf_file_name, results["num_rows"], results["num_cols"],
                      x_keys, y_keys, titles, x_labels, y_labels,
-                     label_names, cl_args.plot_title, cl_args.tight_plot, cl_args.verbose)
+                     label_names, plot_title, tight_plot, verbose)
 
     print("[benchmark] All done")
 
 
 if __name__ == '__main__':
-    main(sys.argv[1:])
+    parsed_arguments = parse_arguments(sys.argv[1:])
+    benchmark_single(**parsed_arguments)
diff --git a/scripts/benchmark/benchmark_all.py b/scripts/benchmark/benchmark_all.py
new file mode 100644
index 00000000..42cda1d7
--- /dev/null
+++ b/scripts/benchmark/benchmark_all.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the
+# PEP8 Python style guide and uses a max-width of 120 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+
+import argparse
+import os
+import sys
+
+from benchmark import benchmark_single
+
+
+BENCHMARKS = ["axpy", "gemv", "gemm", "summary"]
+
+
+def parse_arguments(argv):  # returns the parsed command-line options as a dict (see 'vars' below)
+    parser = argparse.ArgumentParser(description="Runs all (main) benchmarks in one go for a given device")
+    parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
+    parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
+    parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464)")
+    parser.add_argument("-l", "--load_from_disk", action="store_true", help="Loads previous benchmark results from disk if available")
+    parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name")
+    parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
+    parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script")
+    cl_args = parser.parse_args(argv)
+    return vars(cl_args)
+
+
+def benchmark_all(platform, device, precision, load_from_disk,
+                  plot_title, output_folder, verbose):
+    for bench in BENCHMARKS:  # runs every benchmark listed in BENCHMARKS, one after the other
+        from_disk = load_from_disk  # reset per benchmark: the user's flag applies to the first plot only
+        for tight_plot in [True, False]:  # two plots for a single benchmark
+            benchmark_single(bench, platform, device, None, precision, from_disk,  # None is passed as num_runs
+                             plot_title, tight_plot, output_folder, verbose)
+            from_disk = True  # for the next plot of the same data
+
+
+if __name__ == '__main__':
+    parsed_arguments = parse_arguments(sys.argv[1:])
+    benchmark_all(**parsed_arguments)
diff --git a/scripts/benchmark/plot.py b/scripts/benchmark/plot.py
index 275a3ba8..0cb6d8c5 100644
--- a/scripts/benchmark/plot.py
+++ b/scripts/benchmark/plot.py
@@ -6,6 +6,8 @@
 
 import utils
 
+import matplotlib
+matplotlib.use('Agg')
 from matplotlib import rcParams
 import matplotlib.pyplot as plt
 
@@ -112,5 +114,5 @@ def plot_graphs(results, file_name, num_rows, num_cols,
             leg.draw_frame(False)
 
     # Saves the plot to disk
+    print("[benchmark] Saving plot to '" + file_name + "'")
     fig.savefig(file_name, bbox_inches=bounding_box)
-    # plt.show()
diff --git a/scripts/benchmark/settings.py b/scripts/benchmark/settings.py
index cc7220e1..13b7c359 100644
--- a/scripts/benchmark/settings.py
+++ b/scripts/benchmark/settings.py
@@ -14,35 +14,35 @@ AXPY = {
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
-            "name": "axpy",
+            "name": "axpy", "num_runs": 40,
             "title": "multiples of 256K",
             "x_label": "sizes (n)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}],
         },
         {
-            "name": "axpy",
+            "name": "axpy", "num_runs": 40,
             "title": "multiples of 256K+1",
             "x_label": "sizes (n)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}],
         },
         {
-            "name": "axpy",
+            "name": "axpy", "num_runs": 40,
             "title": "around 1M",
             "x_label": "sizes (n)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "axpy",
+            "name": "axpy", "num_runs": 20,
             "title": "around 16M",
             "x_label": "sizes (n)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "axpy",
+            "name": "axpy", "num_runs": 20,
             "title": "strides n=8M",
             "x_label": "increments for x,y", "x_keys": ["incx", "incy"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
@@ -50,7 +50,7 @@ AXPY = {
                           for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
         },
         {
-            "name": "axpy",
+            "name": "axpy", "num_runs": 40,
             "title": "powers of 2",
             "x_label": "sizes (n)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
@@ -60,47 +60,76 @@ AXPY = {
     ]
 }
 
+AXPYBATCHED = {
+    "label_names": ["CLBlast", "clBLAS (non batched)"],
+    "num_rows": 1, "num_cols": 3,
+    "benchmarks": [
+        {
+            "name": "axpybatched", "num_runs": 40,
+            "title": "10 batches",
+            "x_label": "sizes (n)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"batch_num": 10, "n": utils.k(32), "incx": 1, "incy": 1, "step": utils.k(16), "num_steps": 16}],
+        },
+        {
+            "name": "axpybatched", "num_runs": 20,
+            "title": "50 batches",
+            "x_label": "sizes (n)", "x_keys": ["n"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"batch_num": 50, "n": utils.k(32), "incx": 1, "incy": 1, "step": utils.k(16), "num_steps": 16}],
+        },
+        {
+            "name": "axpybatched", "num_runs": 40,
+            "title": "n=128K",
+            "x_label": "number of batches", "x_keys": ["batch_num"],
+            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "arguments": [{"batch_num": b, "n": utils.k(128), "incx": 1, "incy": 1, "step": 1, "num_steps": 1}
+                          for b in utils.powers_of_2(1, 128)],
+        }
+    ]
+}
+
 GEMV = {
     "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
-            "name": "gemv",
+            "name": "gemv", "num_runs": 40,
             "title": "multiples of 256",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}],
         },
         {
-            "name": "gemv",
+            "name": "gemv", "num_runs": 40,
             "title": "multiples of 257",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}],
         },
         {
-            "name": "gemv",
+            "name": "gemv", "num_runs": 20,
             "title": "around 4K",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "gemv",
+            "name": "gemv", "num_runs": 40,
             "title": "multiples of 256 rotated",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}],
         },
         {
-            "name": "gemv",
+            "name": "gemv", "num_runs": 40,
             "title": "multiples of 257 rotated",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
             "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}],
         },
         {
-            "name": "gemv",
+            "name": "gemv", "num_runs": 20,
             "title": "strides n=m=4K",
             "x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"],
             "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
@@ -115,7 +144,7 @@ GEMM = {
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 20,
             "title": "multiples of 128",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -123,7 +152,7 @@ GEMM = {
                            "transA": 111, "transB": 111, "step": 128, "num_steps": 20}],
         },
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 20,
             "title": "multiples of 129",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -131,7 +160,7 @@ GEMM = {
                            "transA": 111, "transB": 111, "step": 129, "num_steps": 20}],
         },
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 20,
             "title": "around 512",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -139,7 +168,7 @@ GEMM = {
                            "transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 10,
             "title": "around 2048",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -147,7 +176,7 @@ GEMM = {
                            "transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 10,
             "title": "layouts/transpose",
             "x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -156,7 +185,7 @@ GEMM = {
                           for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]],
         },
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 10,
             "title": "powers of 2",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -172,7 +201,7 @@ GEMM_SMALL = {
     "num_rows": 2, "num_cols": 1,
     "benchmarks": [
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 10,
             "title": "small matrices in steps of 16",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -180,7 +209,7 @@ GEMM_SMALL = {
                            "transA": 111, "transB": 111, "step": 16, "num_steps": 57}],
         },
         {
-            "name": "gemm",
+            "name": "gemm", "num_runs": 10,
             "title": "small matrices in steps of 1",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -191,12 +220,43 @@ GEMM_SMALL = {
     ]
 }
 
+GEMMBATCHED = {
+    "label_names": ["CLBlast", "clBLAS (non batched)"],
+    "num_rows": 1, "num_cols": 3,
+    "benchmarks": [
+        {
+            "name": "gemmbatched", "num_runs": 40,
+            "title": "10 batches",
+            "x_label": "sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"batch_num": 10, "m": 16, "n": 16, "k": 16, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 16, "num_steps": 16}],
+        },
+        {
+            "name": "gemmbatched", "num_runs": 20,
+            "title": "50 batches",
+            "x_label": "sizes (m=n=k)", "x_keys": ["m"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"batch_num": 50, "m": 16, "n": 16, "k": 16, "layout": 102,
+                           "transA": 111, "transB": 111, "step": 16, "num_steps": 16}],
+        },
+        {
+            "name": "gemmbatched", "num_runs": 40,
+            "title": "m=n=k=32",
+            "x_label": "number of batches", "x_keys": ["batch_num"],
+            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "arguments": [{"batch_num": b, "m": 32, "n": 32, "k": 32, "layout": 102,
+                           "transA": 111, "transB": 111} for b in utils.powers_of_2(1, 128)],
+        }
+    ]
+}
+
 SYMM = {
     "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
-            "name": "symm",
+            "name": "symm", "num_runs": 10,
             "title": "multiples of 128",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -204,7 +264,7 @@ SYMM = {
                            "side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
         },
         {
-            "name": "symm",
+            "name": "symm", "num_runs": 10,
             "title": "multiples of 129",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -212,7 +272,7 @@ SYMM = {
                            "side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
         },
         {
-            "name": "symm",
+            "name": "symm", "num_runs": 10,
             "title": "around 512",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -220,7 +280,7 @@ SYMM = {
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "symm",
+            "name": "symm", "num_runs": 10,
             "title": "around 2048",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -228,7 +288,7 @@ SYMM = {
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "symm",
+            "name": "symm", "num_runs": 10,
             "title": "layouts/sides/triangles",
             "x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -237,7 +297,7 @@ SYMM = {
                           for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]],
         },
         {
-            "name": "symm",
+            "name": "symm", "num_runs": 10,
             "title": "powers of 2",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -253,7 +313,7 @@ SYRK = {
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
-            "name": "syrk",
+            "name": "syrk", "num_runs": 10,
             "title": "multiples of 128",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -261,7 +321,7 @@ SYRK = {
                            "side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
         },
         {
-            "name": "syrk",
+            "name": "syrk", "num_runs": 10,
             "title": "multiples of 129",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -269,7 +329,7 @@ SYRK = {
                            "side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
         },
         {
-            "name": "syrk",
+            "name": "syrk", "num_runs": 10,
             "title": "around 512",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -277,7 +337,7 @@ SYRK = {
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "syrk",
+            "name": "syrk", "num_runs": 10,
             "title": "around 2048",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -285,7 +345,7 @@ SYRK = {
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
         {
-            "name": "syrk",
+            "name": "syrk", "num_runs": 10,
             "title": "layouts/sides/triangles",
             "x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
@@ -294,7 +354,7 @@ SYRK = {
                           for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]],
         },
         {
-            "name": "syrk",
+            "name": "syrk", "num_runs": 10,
             "title": "powers of 2",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
             "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index f44bbce0..d30d4e55 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -36,12 +36,31 @@ void Xaxpy(const int n, const real_arg arg_alpha,
   }
 }
 
+// Faster version of the kernel without offsets and strided accesses but with if-statement. Also
+// assumes that 'n' is dividable by 'VW' and 'WPT'.
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFaster(const int n, const real_arg arg_alpha,
+                 const __global realV* restrict xgm,
+                 __global realV* ygm) {
+  const real alpha = GetRealArg(arg_alpha);
+  const int num_useful_threads = n / (VW * WPT); // the host pads the global size to a WGS multiple
+  if (get_global_id(0) < num_useful_threads) {
+    #pragma unroll
+    for (int w=0; w<WPT; ++w) {
+      const int id = w*num_useful_threads + get_global_id(0); // always below n/VW, so in bounds
+      realV xvalue = xgm[id];
+      realV yvalue = ygm[id];
+      ygm[id] = MultiplyAddVector(yvalue, alpha, xvalue);
+    }
+  }
+}
+
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
 __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
-void XaxpyFast(const int n, const real_arg arg_alpha,
-               const __global realV* restrict xgm,
-               __global realV* ygm) {
+void XaxpyFastest(const int n, const real_arg arg_alpha,
+                  const __global realV* restrict xgm,
+                  __global realV* ygm) {
   const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 310562a0..0e588d99 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -44,18 +44,21 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
   TestVectorY(n, y_buffer, y_offset, y_inc);
 
   // Determines whether or not the fast-version can be used
-  const auto use_fast_kernel = (x_offset == 0) && (x_inc == 1) &&
-                               (y_offset == 0) && (y_inc == 1) &&
-                               IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
+  const auto use_faster_kernel = (x_offset == 0) && (x_inc == 1) &&
+                                 (y_offset == 0) && (y_inc == 1) &&
+                                 IsMultiple(n, db_["WPT"]*db_["VW"]);
+  const auto use_fastest_kernel = use_faster_kernel &&
+                                  IsMultiple(n, db_["WGS"]*db_["WPT"]*db_["VW"]);
 
   // If possible, run the fast-version of the kernel
-  const auto kernel_name = (use_fast_kernel) ? "XaxpyFast" : "Xaxpy";
+  const auto kernel_name = (use_fastest_kernel) ? "XaxpyFastest" :
+                           (use_faster_kernel) ? "XaxpyFaster" : "Xaxpy";
 
   // Retrieves the Xaxpy kernel from the compiled binary
   auto kernel = Kernel(program_, kernel_name);
 
   // Sets the kernel arguments
-  if (use_fast_kernel) {
+  if (use_faster_kernel || use_fastest_kernel) {
     kernel.SetArgument(0, static_cast<int>(n));
     kernel.SetArgument(1, GetRealArg(alpha));
     kernel.SetArgument(2, x_buffer());
@@ -73,11 +76,16 @@ void Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
   }
 
   // Launches the kernel
-  if (use_fast_kernel) {
+  if (use_fastest_kernel) {
     auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
     auto local = std::vector<size_t>{db_["WGS"]};
     RunKernel(kernel, queue_, device_, global, local, event_);
   }
+  else if (use_faster_kernel) {
+    auto global = std::vector<size_t>{Ceil(CeilDiv(n, db_["WPT"]*db_["VW"]), db_["WGS"])};
+    auto local = std::vector<size_t>{db_["WGS"]};
+    RunKernel(kernel, queue_, device_, global, local, event_);
+  }
   else {
     const auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
     auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 23132c51..7984e184 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -27,7 +27,7 @@ class TuneXaxpy {
 
   // The representative kernel and the source code
   static std::string KernelFamily() { return "xaxpy"; }
-  static std::string KernelName() { return "XaxpyFast"; }
+  static std::string KernelName() { return "XaxpyFastest"; }
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
@@ -42,7 +42,7 @@ class TuneXaxpy {
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &args) {
     if (!IsMultiple(args.n, 64)) {
-      throw std::runtime_error("'XaxpyFast' requires 'n' to be a multiple of WGS*WPT*VW");
+      throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW");
     }
   }
 
@@ -52,7 +52,7 @@ class TuneXaxpy {
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
-  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
+  static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index 1bfcb623..7bc9c869 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -24,7 +24,7 @@ namespace clblast {
 template <typename T, typename U> const int TestBlas<T,U>::kSeed = 42; // fixed seed for reproducibility
 
 // Test settings for the regular test. Append to these lists in case more tests are required.
-template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 4096 };
+template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kVectorDims = { 7, 93, 144, 4096 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kIncrements = { 1, 2, 7 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixDims = { 7, 64 };
 template <typename T, typename U> const std::vector<size_t> TestBlas<T,U>::kMatrixVectorDims = { 61, 256 };