6 files changed, 95 insertions, 68 deletions
diff --git a/CHANGELOG b/CHANGELOG
index e2f0d872..d49cb3f5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -5,6 +5,7 @@ Development (next version)
 - Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers
   which don't do this themselves (ARM Mali) - greatly improves performance on these platforms
 - Added first tuners for the TRSV (block size) and TRSM (invert kernel) routines
+- Fixed an issue where the AMD APP compiler crashed or hung on the TRSM routine (invert kernel)
 - Improved compilation time by splitting the tuning database into multiple compilation units
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see README)
diff --git a/README.md b/README.md
index fe69e873..db7e16e8 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ CLBlast: The tuned OpenCL BLAS library
 | Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/) |
 | Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/) |
 | OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/) | N/A | N/A |
+
 (*Note*: Automated correctness tests currently not running, servers are offline)
 
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news.
diff --git a/scripts/benchmark/benchmark.py b/scripts/benchmark/benchmark.py
index 3239d385..d0a9d80f 100644
--- a/scripts/benchmark/benchmark.py
+++ b/scripts/benchmark/benchmark.py
@@ -27,8 +27,12 @@ EXPERIMENTS = {
     "summary": settings.SUMMARY,
 }
 
+COMPARISONS = ["clBLAS", "CPU-BLAS", "cuBLAS"]
+COMPARISON_ARGS = ["-clblas", "-cblas", "-cublas"]
+COMPARISON_IDS = [2, 3, 4]
 
-def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
+
+def run_benchmark(name, arguments_list, precision, num_runs, platform, device, comparisons):
     binary = "./clblast_client_x" + name
 
     # Loops over sub-benchmarks per benchmark
@@ -36,10 +40,16 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
     for arguments in arguments_list:
 
         # Sets the arguments
-        constant_arguments = ["-warm_up", "-q", "-no_abbrv", "-cblas 0", "-cublas 0"]
+        constant_arguments = ["-warm_up", "-q", "-no_abbrv"]
         common_arguments = ["-precision %d" % precision, "-runs %d" % num_runs]
         opencl_arguments = ["-platform %d" % platform, "-device %d" % device]
-        all_arguments = opencl_arguments + common_arguments + constant_arguments
+        comparison_arguments = []
+        for name, arg in zip(COMPARISONS, COMPARISON_ARGS):
+            if name in comparisons:
+                comparison_arguments.append(arg + " 1")
+            else:
+                comparison_arguments.append(arg + " 0")
+        all_arguments = opencl_arguments + common_arguments + constant_arguments + comparison_arguments
         for name, value in arguments.items():
             all_arguments.append("-" + name + " " + str(value))
 
@@ -54,9 +64,11 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
             result_extra = utils.parse_results(benchmark_output)
             for index in range(len(min(result, result_extra))):
                 result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"]
-                result[index]["GBs_2"] = result_extra[index]["GBs_2"]
                 result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"]
-                result[index]["GFLOPS_2"] = result_extra[index]["GFLOPS_2"]
+                for id in COMPARISON_IDS:
+                    if "GBs_%d" % id in result_extra[index].keys():
+                        result[index]["GBs_%d" % id] = result_extra[index]["GBs_%d" % id]
+                        result[index]["GFLOPS_%d" % id] = result_extra[index]["GFLOPS_%d" % id]
 
         results.extend(result)
     return results
@@ -65,6 +77,7 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device):
 def parse_arguments(argv):
     parser = argparse.ArgumentParser(description="Runs a full benchmark for a specific routine on a specific device")
     parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % sorted(EXPERIMENTS.keys()))
+    parser.add_argument("-c", "--comparisons", default=[], nargs='+', help="The library(s) to compare against (choose from %s)" % COMPARISONS)
     parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
     parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
     parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging")
@@ -78,7 +91,7 @@ def parse_arguments(argv):
     return vars(cl_args)
 
 
-def benchmark_single(benchmark, platform, device, num_runs, precision, load_from_disk,
+def benchmark_single(benchmark, comparisons, platform, device, num_runs, precision, load_from_disk,
                      plot_title, tight_plot, output_folder, verbose):
 
     # Sanity check
@@ -91,6 +104,14 @@ def benchmark_single(benchmark, platform, device, num_runs, precision, load_from
     if benchmark.upper() != "SUMMARY":
         plot_title = benchmark_name if plot_title is "" else benchmark_name + ": " + plot_title
 
+    # Retrieves the comparison settings
+    library_ids = [1]
+    for comparison in comparisons:
+        if comparison not in COMPARISONS:
+            print("[benchmark] Invalid comparison library '%s', choose from %s" % (comparison, COMPARISONS))
+            return
+        library_ids.append(COMPARISON_IDS[COMPARISONS.index(comparison)])
+
     # Retrieves the benchmark settings
     if benchmark not in EXPERIMENTS.keys():
         print("[benchmark] Invalid benchmark '%s', choose from %s" % (benchmark, EXPERIMENTS.keys()))
@@ -109,13 +130,13 @@ def benchmark_single(benchmark, platform, device, num_runs, precision, load_from
         # Runs all the individual benchmarks
         print("[benchmark] Running on platform %d, device %d" % (platform, device))
         print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), benchmark))
-        results = {"label_names": experiment["label_names"], "num_rows": experiment["num_rows"],
+        results = {"label_names": ["CLBlast"] + comparisons, "num_rows": experiment["num_rows"],
                    "num_cols": experiment["num_cols"], "benchmarks": []}
         for bench in benchmarks:
             num_runs_benchmark = bench["num_runs"] if num_runs is None else num_runs
             print("[benchmark] Running benchmark '%s:%s'" % (bench["name"], bench["title"]))
             result = run_benchmark(bench["name"], bench["arguments"], precision, num_runs_benchmark,
-                                   platform, device)
+                                   platform, device, comparisons)
             results["benchmarks"].append(result)
 
         # Stores the results to disk
@@ -128,14 +149,17 @@ def benchmark_single(benchmark, platform, device, num_runs, precision, load_from
     pdf_file_name = os.path.join(output_folder, benchmark_name.lower() + "_plot" + file_name_suffix + ".pdf")
     titles = [utils.precision_to_letter(precision) + b["name"].upper() + " " + b["title"] for b in benchmarks]
     x_keys = [b["x_keys"] for b in benchmarks]
-    y_keys = [b["y_keys"] for b in benchmarks]
+    y_keys = [["%s_%d" % (b["y_key"], i) for i in library_ids] for b in benchmarks]
     x_labels = [b["x_label"] for b in benchmarks]
     y_labels = [b["y_label"] for b in benchmarks]
     label_names = results["label_names"]
 
     # For half-precision: also adds single-precision results for comparison
     if precision == 16:
-        label_names = ["CLBlast FP16", "clBLAS FP32", "CLBlast FP32"]
+        label_names[0] += " FP16"
+        for index in range(1, len(label_names)):
+            label_names[index] += " FP32"
+        label_names.append("CLBlast FP32")
         y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys]
 
     # Plots the graphs
diff --git a/scripts/benchmark/benchmark_all.py b/scripts/benchmark/benchmark_all.py
index 9bf09190..2a7f6c9a 100644
--- a/scripts/benchmark/benchmark_all.py
+++ b/scripts/benchmark/benchmark_all.py
@@ -10,7 +10,7 @@ import argparse
 import os
 import sys
 
-from benchmark import benchmark_single
+from benchmark import benchmark_single, COMPARISONS
 
 
 BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched"]
@@ -18,6 +18,7 @@ BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched"]
 
 def parse_arguments(argv):
     parser = argparse.ArgumentParser(description="Runs all (main) benchmarks in one go for a given device")
+    parser.add_argument("-c", "--comparisons", default=[], nargs='+', help="The library(s) to compare against (choose from %s)" % COMPARISONS)
     parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on")
     parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
     parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
@@ -29,12 +30,12 @@ def parse_arguments(argv):
     return vars(cl_args)
 
 
-def benchmark_all(platform, device, precision, load_from_disk,
+def benchmark_all(comparisons, platform, device, precision, load_from_disk,
                   plot_title, output_folder, verbose):
     for bench in BENCHMARKS:
         from_disk = load_from_disk
         for tight_plot in [True, False]:  # two plots for a single benchmark
-            benchmark_single(bench, platform, device, None, precision, from_disk,
+            benchmark_single(bench, comparisons, platform, device, None, precision, from_disk,
                              plot_title, tight_plot, output_folder, verbose)
             from_disk = True  # for the next plot of the same data
 
diff --git a/scripts/benchmark/settings.py b/scripts/benchmark/settings.py
index dae1854f..d0d17178 100644
--- a/scripts/benchmark/settings.py
+++ b/scripts/benchmark/settings.py
@@ -10,42 +10,41 @@ import utils
 
 
 AXPY = {
-    "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
             "name": "axpy", "num_runs": 40,
             "title": "multiples of 256K",
             "x_label": "sizes (n)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}],
         },
         {
             "name": "axpy", "num_runs": 40,
             "title": "multiples of 256K+1",
             "x_label": "sizes (n)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}],
         },
         {
             "name": "axpy", "num_runs": 40,
             "title": "around 1M",
             "x_label": "sizes (n)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
         },
         {
             "name": "axpy", "num_runs": 20,
             "title": "around 16M",
             "x_label": "sizes (n)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}],
         },
         {
             "name": "axpy", "num_runs": 20,
             "title": "strides n=8M",
             "x_label": "increments for x,y", "x_keys": ["incx", "incy"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": utils.m(8), "incx": inc_x, "incy": inc_y, "step": 0, "num_steps": 1}
                           for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
         },
@@ -53,7 +52,7 @@ AXPY = {
             "name": "axpy", "num_runs": 40,
             "title": "powers of 2",
             "x_label": "sizes (n)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
                           for n in utils.powers_of_2(utils.k(32), utils.m(64))],
         }
@@ -61,14 +60,13 @@ AXPY = {
 }
 
 AXPYBATCHED = {
-    "label_names": ["CLBlast", "clBLAS (non batched)"],
     "num_rows": 1, "num_cols": 3,
     "benchmarks": [
         {
             "name": "axpybatched", "num_runs": 30,
             "title": "8 AXPYs",
             "x_label": "sizes (n)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"batch_num": 8, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
                           for n in utils.powers_of_2(utils.k(8), utils.m(4))],
         },
@@ -76,7 +74,7 @@ AXPYBATCHED = {
             "name": "axpybatched", "num_runs": 20,
             "title": "64 AXPYs",
             "x_label": "sizes (n)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"batch_num": 64, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1}
                           for n in utils.powers_of_2(utils.k(8), utils.m(4))],
         },
@@ -84,7 +82,7 @@ AXPYBATCHED = {
             "name": "axpybatched", "num_runs": 40,
             "title": "n=512K",
             "x_label": "batch size", "x_keys": ["batch_num"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"batch_num": b, "n": utils.k(512), "incx": 1, "incy": 1, "step": 1, "num_steps": 1}
                           for b in utils.powers_of_2(1, 512)],
         }
@@ -92,49 +90,48 @@ AXPYBATCHED = {
 }
 
 GEMV = {
-    "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
             "name": "gemv", "num_runs": 40,
             "title": "multiples of 256",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}],
         },
         {
             "name": "gemv", "num_runs": 40,
             "title": "multiples of 257",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}],
         },
         {
             "name": "gemv", "num_runs": 20,
             "title": "around 4K",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}],
         },
         {
             "name": "gemv", "num_runs": 40,
             "title": "multiples of 256 rotated",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}],
         },
         {
             "name": "gemv", "num_runs": 40,
             "title": "multiples of 257 rotated",
             "x_label": "sizes (n=m)", "x_keys": ["n"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}],
         },
         {
             "name": "gemv", "num_runs": 20,
             "title": "strides n=m=4K",
             "x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"],
-            "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"],
+            "y_label": "GB/s (higher is better)", "y_key": "GBs",
             "arguments": [{"n": 4096, "m": 4096, "incx": inc_x, "incy": inc_y, "layout": 102, "step": 0, "num_steps": 1}
                           for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]],
         }
@@ -142,14 +139,13 @@ GEMV = {
 }
 
 GEMM = {
-    "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
             "name": "gemm", "num_runs": 20,
             "title": "multiples of 128",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
                            "transA": 111, "transB": 111, "step": 128, "num_steps": 20}],
         },
@@ -157,7 +153,7 @@ GEMM = {
             "name": "gemm", "num_runs": 20,
             "title": "multiples of 129",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 129, "n": 129, "k": 129, "layout": 102,
                            "transA": 111, "transB": 111, "step": 129, "num_steps": 20}],
         },
@@ -165,7 +161,7 @@ GEMM = {
             "name": "gemm", "num_runs": 20,
             "title": "around 512",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 512, "n": 512, "k": 512, "layout": 102,
                            "transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
         },
@@ -173,7 +169,7 @@ GEMM = {
             "name": "gemm", "num_runs": 10,
             "title": "around 2048",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 2048, "n": 2048, "k": 2048, "layout": 102,
                            "transA": 111, "transB": 111, "step": 1, "num_steps": 16}],
         },
@@ -181,7 +177,7 @@ GEMM = {
             "name": "gemm", "num_runs": 10,
             "title": "layouts/transpose",
             "x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 1024, "n": 1024, "k": 1024, "layout": layout,
                            "transA": transA, "transB": transB, "step": 0, "num_steps": 1}
                           for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]],
@@ -190,7 +186,7 @@ GEMM = {
             "name": "gemm", "num_runs": 10,
             "title": "powers of 2",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": n, "n": n, "k": n, "layout": 102,
                            "transA": 111, "transB": 111, "step": 0, "num_steps": 1}
                           for n in utils.powers_of_2(8, utils.k(4))],
@@ -199,14 +195,13 @@ GEMM = {
 }
 
 GEMM_SMALL = {
-    "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 1,
     "benchmarks": [
         {
             "name": "gemm", "num_runs": 10,
             "title": "small matrices in steps of 16",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
                            "transA": 111, "transB": 111, "step": 16, "num_steps": 57}],
         },
@@ -214,7 +209,7 @@ GEMM_SMALL = {
             "name": "gemm", "num_runs": 10,
             "title": "small matrices in steps of 1",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102,
                            "transA": 111, "transB": 111, "step": 1, "num_steps": 385}],
         },
@@ -223,14 +218,13 @@ GEMM_SMALL = {
 }
 
 GEMMBATCHED = {
-    "label_names": ["CLBlast", "clBLAS (non batched)"],
     "num_rows": 1, "num_cols": 3,
     "benchmarks": [
         {
             "name": "gemmbatched", "num_runs": 40,
             "title": "8 GEMMs",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"batch_num": 8, "m": 32, "n": 32, "k": 32, "layout": 102,
                            "transA": 111, "transB": 111, "step": 32, "num_steps": 20}],
         },
@@ -238,7 +232,7 @@ GEMMBATCHED = {
             "name": "gemmbatched", "num_runs": 20,
             "title": "64 GEMMs",
             "x_label": "sizes (m=n=k)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"batch_num": 64, "m": 32, "n": 32, "k": 32, "layout": 102,
                            "transA": 111, "transB": 111, "step": 32, "num_steps": 20}],
         },
@@ -246,7 +240,7 @@ GEMMBATCHED = {
             "name": "gemmbatched", "num_runs": 30,
             "title": "m=n=k=128",
             "x_label": "batch size", "x_keys": ["batch_num"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"batch_num": b, "m": 128, "n": 128, "k": 128, "layout": 102,
                            "transA": 111, "transB": 111} for b in utils.powers_of_2(1, utils.k(16))],
         }
@@ -254,14 +248,13 @@ GEMMBATCHED = {
 }
 
 SYMM = {
-    "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
             "name": "symm", "num_runs": 10,
             "title": "multiples of 128",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 128, "n": 128, "layout": 102,
                            "side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
         },
@@ -269,7 +262,7 @@ SYMM = {
             "name": "symm", "num_runs": 10,
             "title": "multiples of 129",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 129, "n": 129, "layout": 102,
                            "side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
         },
@@ -277,7 +270,7 @@ SYMM = {
             "name": "symm", "num_runs": 10,
             "title": "around 512",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 512, "n": 512, "layout": 102,
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
@@ -285,7 +278,7 @@ SYMM = {
             "name": "symm", "num_runs": 10,
             "title": "around 2048",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 2048, "n": 2048, "layout": 102,
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
@@ -293,7 +286,7 @@ SYMM = {
             "name": "symm", "num_runs": 10,
             "title": "layouts/sides/triangles",
             "x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": 1024, "n": 1024, "layout": layout,
                            "side": side, "triangle": triangle, "step": 0, "num_steps": 1}
                           for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]],
@@ -302,7 +295,7 @@ SYMM = {
             "name": "symm", "num_runs": 10,
             "title": "powers of 2",
             "x_label": "sizes (m=n)", "x_keys": ["m"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"m": n, "n": n, "layout": 102,
                            "side": 141, "triangle": 121, "step": 0, "num_steps": 1}
                           for n in utils.powers_of_2(8, utils.k(4))],
@@ -311,14 +304,13 @@ SYMM = {
 }
 
 SYRK = {
-    "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 2, "num_cols": 3,
     "benchmarks": [
         {
             "name": "syrk", "num_runs": 10,
             "title": "multiples of 128",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"n": 128, "k": 128, "layout": 102,
                            "side": 141, "triangle": 121, "step": 128, "num_steps": 20}],
         },
@@ -326,7 +318,7 @@ SYRK = {
             "name": "syrk", "num_runs": 10,
             "title": "multiples of 129",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"n": 129, "k": 129, "layout": 102,
                            "side": 141, "triangle": 121, "step": 129, "num_steps": 20}],
         },
@@ -334,7 +326,7 @@ SYRK = {
             "name": "syrk", "num_runs": 10,
             "title": "around 512",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"n": 512, "k": 512, "layout": 102,
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
@@ -342,7 +334,7 @@ SYRK = {
             "name": "syrk", "num_runs": 10,
             "title": "around 2048",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"n": 2048, "k": 2048, "layout": 102,
                            "side": 141, "triangle": 121, "step": 1, "num_steps": 16}],
         },
@@ -350,7 +342,7 @@ SYRK = {
             "name": "syrk", "num_runs": 10,
             "title": "layouts/sides/triangles",
             "x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"n": 1024, "k": 1024, "layout": layout,
                            "triangle": triangle, "transA": transA, "step": 0, "num_steps": 1}
                           for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]],
@@ -359,7 +351,7 @@ SYRK = {
             "name": "syrk", "num_runs": 10,
             "title": "powers of 2",
             "x_label": "sizes (n=k)", "x_keys": ["n"],
-            "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"],
+            "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS",
             "arguments": [{"n": n, "k": n, "layout": 102,
                            "side": 141, "triangle": 121, "step": 0, "num_steps": 1}
                           for n in utils.powers_of_2(8, utils.k(4))],
@@ -368,7 +360,6 @@ SYRK = {
 }
 
 SUMMARY = {
-    "label_names": ["CLBlast", "clBLAS"],
     "num_rows": 3, "num_cols": 2,
     "benchmarks": [
         AXPY["benchmarks"][0],
diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
index 040fcc83..c1f96bd7 100644
--- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl
@@ -83,7 +83,7 @@ R"(
 
 // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix
 __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1)))
-void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld,
+void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld,
                          __global real* restrict dest, const int outer_block_size,
                          const int unit_diagonal, const int is_upper)
 {
@@ -91,29 +91,38 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
   const int block_index = get_group_id(0);
 
   // Sets the offset for this particular block in the source and destination matrices
+  const int block_index_per_block = block_index * INTERNAL_BLOCK_SIZE;
   const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset;
   const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE;
-  const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block,
-                                (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
+  const int block_index_div = block_index / num_inner_blocks;
+  const int block_index_mod = block_index % num_inner_blocks;
+  const int offset_part1 = block_index_div * outer_block_size * outer_block_size; // go to the block_index_div outer outer_block_size*outer_block_size block
+  const int offset_part2 = block_index_mod * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the block_index_mod inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that
+  const int dest_block_offset = offset_part1 + offset_part2;
 
   // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE
   __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
 
   // Loads the source lower triangle into local memory. Any values in the upper triangle or
   // outside of the matrix are set to zero
-  #pragma unroll
   for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) {
-    const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) :
-                                        (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n);
+    bool condition = false;
+    if (is_upper) {
+      condition = (thread_index <= _j) && (block_index_per_block + _j < n);
+    }
+    else {
+      condition = (thread_index >= _j) && (block_index_per_block + thread_index < n);
+    }
     if (condition) {
-      lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset];
+      const int src_index = _j*src_ld + thread_index + src_block_offset;
+      lm[thread_index][_j] = src[src_index];
     }
     else {
       SetToZero(lm[thread_index][_j]);
     }
   }
   barrier(CLK_LOCAL_MEM_FENCE);
-  
+
   // Inverts the diagonal
   real inverted_diagonal;
   SetToOne(inverted_diagonal);