diff options
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | scripts/benchmark/benchmark.py | 44 | ||||
-rw-r--r-- | scripts/benchmark/benchmark_all.py | 7 | ||||
-rw-r--r-- | scripts/benchmark/settings.py | 85 | ||||
-rw-r--r-- | src/kernels/level3/invert_diagonal_blocks_part1.opencl | 25 |
6 files changed, 95 insertions, 68 deletions
@@ -5,6 +5,7 @@ Development (next version) - Added OpenCL pre-processor to unroll loops and perform array-to-register promotions for compilers which don't do this themselves (ARM Mali) - greatly improves performance on these platforms - Added first tuners for the TRSV (block size) and TRSM (invert kernel) routines +- Fixed an issue with a crashing/hanging AMD APP compiler with the TRSM routine (invert kernel) - Improved compilation time by splitting the tuning database into multiple compilation units - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) @@ -7,6 +7,7 @@ CLBlast: The tuned OpenCL BLAS library | Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/) | | Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/) | | OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/) | N/A | N/A | + (*Note*: Automated correctness tests currently not running, servers are offline) CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news. diff --git a/scripts/benchmark/benchmark.py b/scripts/benchmark/benchmark.py index 3239d385..d0a9d80f 100644 --- a/scripts/benchmark/benchmark.py +++ b/scripts/benchmark/benchmark.py @@ -27,8 +27,12 @@ EXPERIMENTS = { "summary": settings.SUMMARY, } +COMPARISONS = ["clBLAS", "CPU-BLAS", "cuBLAS"] +COMPARISON_ARGS = ["-clblas", "-cblas", "-cublas"] +COMPARISON_IDS = [2, 3, 4] -def run_benchmark(name, arguments_list, precision, num_runs, platform, device): + +def run_benchmark(name, arguments_list, precision, num_runs, platform, device, comparisons): binary = "./clblast_client_x" + name # Loops over sub-benchmarks per benchmark @@ -36,10 +40,16 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device): for arguments in arguments_list: # Sets the arguments - constant_arguments = ["-warm_up", "-q", "-no_abbrv", "-cblas 0", "-cublas 0"] + constant_arguments = ["-warm_up", "-q", "-no_abbrv"] common_arguments = ["-precision %d" % precision, "-runs %d" % num_runs] opencl_arguments = ["-platform %d" % platform, "-device %d" % device] - all_arguments = opencl_arguments + common_arguments + constant_arguments + comparison_arguments = [] + for name, arg in zip(COMPARISONS, COMPARISON_ARGS): + if name in comparisons: + comparison_arguments.append(arg + " 1") + else: + comparison_arguments.append(arg + " 0") + all_arguments = opencl_arguments + common_arguments + constant_arguments + comparison_arguments for name, value in arguments.items(): all_arguments.append("-" + name + " " + str(value)) @@ -54,9 +64,11 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device): result_extra = utils.parse_results(benchmark_output) for index in range(len(min(result, result_extra))): result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"] - result[index]["GBs_2"] = result_extra[index]["GBs_2"] result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"] - result[index]["GFLOPS_2"] = result_extra[index]["GFLOPS_2"] + for id in COMPARISON_IDS: + if "GBs_%d" % id in result_extra[index].keys(): + result[index]["GBs_%d" % id] = result_extra[index]["GBs_%d" % id] + result[index]["GFLOPS_%d" % id] = result_extra[index]["GFLOPS_%d" % id] results.extend(result) return results @@ -65,6 +77,7 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device): def parse_arguments(argv): parser = argparse.ArgumentParser(description="Runs a full benchmark for a specific routine on a specific device") parser.add_argument("-b", "--benchmark", required=True, help="The benchmark to perform (choose from %s)" % sorted(EXPERIMENTS.keys())) + parser.add_argument("-c", "--comparisons", default=[], nargs='+', help="The library(s) to compare against (choose from %s)" % COMPARISONS) parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on") parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on") parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging") @@ -78,7 +91,7 @@ def parse_arguments(argv): return vars(cl_args) -def benchmark_single(benchmark, platform, device, num_runs, precision, load_from_disk, +def benchmark_single(benchmark, comparisons, platform, device, num_runs, precision, load_from_disk, plot_title, tight_plot, output_folder, verbose): # Sanity check @@ -91,6 +104,14 @@ def benchmark_single(benchmark, platform, device, num_runs, precision, load_from if benchmark.upper() != "SUMMARY": plot_title = benchmark_name if plot_title is "" else benchmark_name + ": " + plot_title + # Retrieves the comparison settings + library_ids = [1] + for comparison in comparisons: + if comparison not in COMPARISONS: + print("[benchmark] Invalid comparison library '%s', choose from %s" % (comparison, COMPARISONS)) + return + library_ids.append(COMPARISON_IDS[COMPARISONS.index(comparison)]) + # Retrieves the benchmark settings if benchmark not in EXPERIMENTS.keys(): print("[benchmark] Invalid benchmark '%s', choose from %s" % (benchmark, EXPERIMENTS.keys())) @@ -109,13 +130,13 @@ def benchmark_single(benchmark, platform, device, num_runs, precision, load_from # Runs all the individual benchmarks print("[benchmark] Running on platform %d, device %d" % (platform, device)) print("[benchmark] Running %d benchmarks for settings '%s'" % (len(benchmarks), benchmark)) - results = {"label_names": experiment["label_names"], "num_rows": experiment["num_rows"], + results = {"label_names": ["CLBlast"] + comparisons, "num_rows": experiment["num_rows"], "num_cols": experiment["num_cols"], "benchmarks": []} for bench in benchmarks: num_runs_benchmark = bench["num_runs"] if num_runs is None else num_runs print("[benchmark] Running benchmark '%s:%s'" % (bench["name"], bench["title"])) result = run_benchmark(bench["name"], bench["arguments"], precision, num_runs_benchmark, - platform, device) + platform, device, comparisons) results["benchmarks"].append(result) # Stores the results to disk @@ -128,14 +149,17 @@ def benchmark_single(benchmark, platform, device, num_runs, precision, load_from pdf_file_name = os.path.join(output_folder, benchmark_name.lower() + "_plot" + file_name_suffix + ".pdf") titles = [utils.precision_to_letter(precision) + b["name"].upper() + " " + b["title"] for b in benchmarks] x_keys = [b["x_keys"] for b in benchmarks] - y_keys = [b["y_keys"] for b in benchmarks] + y_keys = [["%s_%d" % (b["y_key"], i) for i in library_ids] for b in benchmarks] x_labels = [b["x_label"] for b in benchmarks] y_labels = [b["y_label"] for b in benchmarks] label_names = results["label_names"] # For half-precision: also adds single-precision results for comparison if precision == 16: - label_names = ["CLBlast FP16", "clBLAS FP32", "CLBlast FP32"] + label_names[0] += " FP16" + for index in range(1, len(label_names)): + label_names[index] += " FP32" + label_names.append("CLBlast FP32") y_keys = [y_key + [y_key[0] + "_FP32"] for y_key in y_keys] # Plots the graphs diff --git a/scripts/benchmark/benchmark_all.py b/scripts/benchmark/benchmark_all.py index 9bf09190..2a7f6c9a 100644 --- a/scripts/benchmark/benchmark_all.py +++ b/scripts/benchmark/benchmark_all.py @@ -10,7 +10,7 @@ import argparse import os import sys -from benchmark import benchmark_single +from benchmark import benchmark_single, COMPARISONS BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched"] @@ -18,6 +18,7 @@ BENCHMARKS = ["axpy", "gemv", "gemm", "summary", "axpybatched", "gemmbatched"] def parse_arguments(argv): parser = argparse.ArgumentParser(description="Runs all (main) benchmarks in one go for a given device") + parser.add_argument("-c", "--comparisons", default=[], nargs='+', help="The library(s) to compare against (choose from %s)" % COMPARISONS) parser.add_argument("-p", "--platform", required=True, type=int, help="The ID of the OpenCL platform to test on") parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on") parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464") @@ -29,12 +30,12 @@ def parse_arguments(argv): return vars(cl_args) -def benchmark_all(platform, device, precision, load_from_disk, +def benchmark_all(comparisons, platform, device, precision, load_from_disk, plot_title, output_folder, verbose): for bench in BENCHMARKS: from_disk = load_from_disk for tight_plot in [True, False]: # two plots for a single benchmark - benchmark_single(bench, platform, device, None, precision, from_disk, + benchmark_single(bench, comparisons, platform, device, None, precision, from_disk, plot_title, tight_plot, output_folder, verbose) from_disk = True # for the next plot of the same data diff --git a/scripts/benchmark/settings.py b/scripts/benchmark/settings.py index dae1854f..d0d17178 100644 --- a/scripts/benchmark/settings.py +++ b/scripts/benchmark/settings.py @@ -10,42 +10,41 @@ import utils AXPY = { - "label_names": ["CLBlast", "clBLAS"], "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "axpy", "num_runs": 40, "title": "multiples of 256K", "x_label": "sizes (n)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.k(256), "incx": 1, "incy": 1, "step": utils.k(256), "num_steps": 16}], }, { "name": "axpy", "num_runs": 40, "title": "multiples of 256K+1", "x_label": "sizes (n)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.k(256) + 1, "incx": 1, "incy": 1, "step": utils.k(256) + 1, "num_steps": 16}], }, { "name": "axpy", "num_runs": 40, "title": "around 1M", "x_label": "sizes (n)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.m(1), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}], }, { "name": "axpy", "num_runs": 20, "title": "around 16M", "x_label": "sizes (n)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.m(16), "incx": 1, "incy": 1, "step": 1, "num_steps": 16}], }, { "name": "axpy", "num_runs": 20, "title": "strides n=8M", "x_label": "increments for x,y", "x_keys": ["incx", "incy"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": utils.m(8), "incx": inc_x, "incy": inc_y, "step": 0, "num_steps": 1} for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]], }, @@ -53,7 +52,7 @@ AXPY = { "name": "axpy", "num_runs": 40, "title": "powers of 2", "x_label": "sizes (n)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} for n in utils.powers_of_2(utils.k(32), utils.m(64))], } @@ -61,14 +60,13 @@ AXPY = { } AXPYBATCHED = { - "label_names": ["CLBlast", "clBLAS (non batched)"], "num_rows": 1, "num_cols": 3, "benchmarks": [ { "name": "axpybatched", "num_runs": 30, "title": "8 AXPYs", "x_label": "sizes (n)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"batch_num": 8, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} for n in utils.powers_of_2(utils.k(8), utils.m(4))], }, @@ -76,7 +74,7 @@ AXPYBATCHED = { "name": "axpybatched", "num_runs": 20, "title": "64 AXPYs", "x_label": "sizes (n)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"batch_num": 64, "n": n, "incx": 1, "incy": 1, "step": 0, "num_steps": 1} for n in utils.powers_of_2(utils.k(8), utils.m(4))], }, @@ -84,7 +82,7 @@ AXPYBATCHED = { "name": "axpybatched", "num_runs": 40, "title": "n=512K", "x_label": "batch size", "x_keys": ["batch_num"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"batch_num": b, "n": utils.k(512), "incx": 1, "incy": 1, "step": 1, "num_steps": 1} for b in utils.powers_of_2(1, 512)], } @@ -92,49 +90,48 @@ AXPYBATCHED = { } GEMV = { - "label_names": ["CLBlast", "clBLAS"], "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "gemv", "num_runs": 40, "title": "multiples of 256", "x_label": "sizes (n=m)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 102, "step": 256, "num_steps": 20}], }, { "name": "gemv", "num_runs": 40, "title": "multiples of 257", "x_label": "sizes (n=m)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 102, "step": 257, "num_steps": 20}], }, { "name": "gemv", "num_runs": 20, "title": "around 4K", "x_label": "sizes (n=m)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 4096, "m": 4096, "incx": 1, "incy": 1, "layout": 102, "step": 1, "num_steps": 16}], }, { "name": "gemv", "num_runs": 40, "title": "multiples of 256 rotated", "x_label": "sizes (n=m)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 256, "m": 256, "incx": 1, "incy": 1, "layout": 101, "step": 256, "num_steps": 20}], }, { "name": "gemv", "num_runs": 40, "title": "multiples of 257 rotated", "x_label": "sizes (n=m)", "x_keys": ["n"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 257, "m": 257, "incx": 1, "incy": 1, "layout": 101, "step": 257, "num_steps": 20}], }, { "name": "gemv", "num_runs": 20, "title": "strides n=m=4K", "x_label": "increments/strides for x,y", "x_keys": ["incx", "incy"], - "y_label": "GB/s (higher is better)", "y_keys": ["GBs_1", "GBs_2"], + "y_label": "GB/s (higher is better)", "y_key": "GBs", "arguments": [{"n": 4096, "m": 4096, "incx": inc_x, "incy": inc_y, "layout": 102, "step": 0, "num_steps": 1} for inc_x in [1, 2, 4] for inc_y in [1, 2, 4]], } @@ -142,14 +139,13 @@ GEMV = { } GEMM = { - "label_names": ["CLBlast", "clBLAS"], "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "gemm", "num_runs": 20, "title": "multiples of 128", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111, "step": 128, "num_steps": 20}], }, @@ -157,7 +153,7 @@ GEMM = { "name": "gemm", "num_runs": 20, "title": "multiples of 129", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 129, "n": 129, "k": 129, "layout": 102, "transA": 111, "transB": 111, "step": 129, "num_steps": 20}], }, @@ -165,7 +161,7 @@ GEMM = { "name": "gemm", "num_runs": 20, "title": "around 512", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 512, "n": 512, "k": 512, "layout": 102, "transA": 111, "transB": 111, "step": 1, "num_steps": 16}], }, @@ -173,7 +169,7 @@ GEMM = { "name": "gemm", "num_runs": 10, "title": "around 2048", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 2048, "n": 2048, "k": 2048, "layout": 102, "transA": 111, "transB": 111, "step": 1, "num_steps": 16}], }, @@ -181,7 +177,7 @@ GEMM = { "name": "gemm", "num_runs": 10, "title": "layouts/transpose", "x_label": "layout, transA, transB", "x_keys": ["layout", "transA", "transB"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 1024, "n": 1024, "k": 1024, "layout": layout, "transA": transA, "transB": transB, "step": 0, "num_steps": 1} for layout in [101, 102] for transA in [111, 112] for transB in [111, 112]], @@ -190,7 +186,7 @@ GEMM = { "name": "gemm", "num_runs": 10, "title": "powers of 2", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": n, "n": n, "k": n, "layout": 102, "transA": 111, "transB": 111, "step": 0, "num_steps": 1} for n in utils.powers_of_2(8, utils.k(4))], @@ -199,14 +195,13 @@ GEMM = { } GEMM_SMALL = { - "label_names": ["CLBlast", "clBLAS"], "num_rows": 2, "num_cols": 1, "benchmarks": [ { "name": "gemm", "num_runs": 10, "title": "small matrices in steps of 16", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111, "step": 16, "num_steps": 57}], }, @@ -214,7 +209,7 @@ GEMM_SMALL = { "name": "gemm", "num_runs": 10, "title": "small matrices in steps of 1", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111, "step": 1, "num_steps": 385}], }, @@ -223,14 +218,13 @@ GEMM_SMALL = { } GEMMBATCHED = { - "label_names": ["CLBlast", "clBLAS (non batched)"], "num_rows": 1, "num_cols": 3, "benchmarks": [ { "name": "gemmbatched", "num_runs": 40, "title": "8 GEMMs", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": 8, "m": 32, "n": 32, "k": 32, "layout": 102, "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], }, @@ -238,7 +232,7 @@ GEMMBATCHED = { "name": "gemmbatched", "num_runs": 20, "title": "64 GEMMs", "x_label": "sizes (m=n=k)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": 64, "m": 32, "n": 32, "k": 32, "layout": 102, "transA": 111, "transB": 111, "step": 32, "num_steps": 20}], }, @@ -246,7 +240,7 @@ GEMMBATCHED = { "name": "gemmbatched", "num_runs": 30, "title": "m=n=k=128", "x_label": "batch size", "x_keys": ["batch_num"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"batch_num": b, "m": 128, "n": 128, "k": 128, "layout": 102, "transA": 111, "transB": 111} for b in utils.powers_of_2(1, utils.k(16))], } @@ -254,14 +248,13 @@ GEMMBATCHED = { } SYMM = { - "label_names": ["CLBlast", "clBLAS"], "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "symm", "num_runs": 10, "title": "multiples of 128", "x_label": "sizes (m=n)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 128, "n": 128, "layout": 102, "side": 141, "triangle": 121, "step": 128, "num_steps": 20}], }, @@ -269,7 +262,7 @@ SYMM = { "name": "symm", "num_runs": 10, "title": "multiples of 129", "x_label": "sizes (m=n)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 129, "n": 129, "layout": 102, "side": 141, "triangle": 121, "step": 129, "num_steps": 20}], }, @@ -277,7 +270,7 @@ SYMM = { "name": "symm", "num_runs": 10, "title": "around 512", "x_label": "sizes (m=n)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 512, "n": 512, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, @@ -285,7 +278,7 @@ SYMM = { "name": "symm", "num_runs": 10, "title": "around 2048", "x_label": "sizes (m=n)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 2048, "n": 2048, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, @@ -293,7 +286,7 @@ SYMM = { "name": "symm", "num_runs": 10, "title": "layouts/sides/triangles", "x_label": "layout, side, triangle", "x_keys": ["layout", "side", "triangle"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": 1024, "n": 1024, "layout": layout, "side": side, "triangle": triangle, "step": 0, "num_steps": 1} for layout in [101, 102] for side in [141, 142] for triangle in [121, 122]], @@ -302,7 +295,7 @@ SYMM = { "name": "symm", "num_runs": 10, "title": "powers of 2", "x_label": "sizes (m=n)", "x_keys": ["m"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"m": n, "n": n, "layout": 102, "side": 141, "triangle": 121, "step": 0, "num_steps": 1} for n in utils.powers_of_2(8, utils.k(4))], @@ -311,14 +304,13 @@ SYMM = { } SYRK = { - "label_names": ["CLBlast", "clBLAS"], "num_rows": 2, "num_cols": 3, "benchmarks": [ { "name": "syrk", "num_runs": 10, "title": "multiples of 128", "x_label": "sizes (n=k)", "x_keys": ["n"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 128, "k": 128, "layout": 102, "side": 141, "triangle": 121, "step": 128, "num_steps": 20}], }, @@ -326,7 +318,7 @@ SYRK = { "name": "syrk", "num_runs": 10, "title": "multiples of 129", "x_label": "sizes (n=k)", "x_keys": ["n"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 129, "k": 129, "layout": 102, "side": 141, "triangle": 121, "step": 129, "num_steps": 20}], }, @@ -334,7 +326,7 @@ SYRK = { "name": "syrk", "num_runs": 10, "title": "around 512", "x_label": "sizes (n=k)", "x_keys": ["n"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 512, "k": 512, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, @@ -342,7 +334,7 @@ SYRK = { "name": "syrk", "num_runs": 10, "title": "around 2048", "x_label": "sizes (n=k)", "x_keys": ["n"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 2048, "k": 2048, "layout": 102, "side": 141, "triangle": 121, "step": 1, "num_steps": 16}], }, @@ -350,7 +342,7 @@ SYRK = { "name": "syrk", "num_runs": 10, "title": "layouts/sides/triangles", "x_label": "layout, triangle, transA", "x_keys": ["layout", "triangle", "transA"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": 1024, "k": 1024, "layout": layout, "triangle": triangle, "transA": transA, "step": 0, "num_steps": 1} for layout in [101, 102] for triangle in [121, 122] for transA in [111, 112]], @@ -359,7 +351,7 @@ SYRK = { "name": "syrk", "num_runs": 10, "title": "powers of 2", "x_label": "sizes (n=k)", "x_keys": ["n"], - "y_label": "GFLOPS (higher is better)", "y_keys": ["GFLOPS_1", "GFLOPS_2"], + "y_label": "GFLOPS (higher is better)", "y_key": "GFLOPS", "arguments": [{"n": n, "k": n, "layout": 102, "side": 141, "triangle": 121, "step": 0, "num_steps": 1} for n in utils.powers_of_2(8, utils.k(4))], @@ -368,7 +360,6 @@ SYRK = { } SUMMARY = { - "label_names": ["CLBlast", "clBLAS"], "num_rows": 3, "num_cols": 2, "benchmarks": [ AXPY["benchmarks"][0], diff --git a/src/kernels/level3/invert_diagonal_blocks_part1.opencl b/src/kernels/level3/invert_diagonal_blocks_part1.opencl index 040fcc83..c1f96bd7 100644 --- a/src/kernels/level3/invert_diagonal_blocks_part1.opencl +++ b/src/kernels/level3/invert_diagonal_blocks_part1.opencl @@ -83,7 +83,7 @@ R"( // Inverts a diagonal block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE elements in a larger matrix __kernel __attribute__((reqd_work_group_size(INTERNAL_BLOCK_SIZE, 1, 1))) -void InvertDiagonalBlock(int n, __global const real* restrict src, const int src_offset, const int src_ld, +void InvertDiagonalBlock(const int n, __global const real* restrict src, const int src_offset, const int src_ld, __global real* restrict dest, const int outer_block_size, const int unit_diagonal, const int is_upper) { @@ -91,29 +91,38 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src const int block_index = get_group_id(0); // Sets the offset for this particular block in the source and destination matrices + const int block_index_per_block = block_index * INTERNAL_BLOCK_SIZE; const int src_block_offset = block_index * (INTERNAL_BLOCK_SIZE + src_ld * INTERNAL_BLOCK_SIZE) + src_offset; const int num_inner_blocks = outer_block_size / INTERNAL_BLOCK_SIZE; - const int dest_block_offset = (block_index / num_inner_blocks) * outer_block_size * outer_block_size + // go to the (block_index / num_inner_blocks) outer outer_block_size*outer_block_size block, - (block_index % num_inner_blocks) * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the (block_index % num_inner_blocks) inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + const int block_index_div = block_index / num_inner_blocks; + const int block_index_mod = block_index % num_inner_blocks; + const int offset_part1 = block_index_div * outer_block_size * outer_block_size; // go to the block_index_div outer outer_block_size*outer_block_size block + const int offset_part2 = block_index_mod * (outer_block_size*INTERNAL_BLOCK_SIZE + INTERNAL_BLOCK_SIZE); // then to the block_index_mod inner INTERNAL_BLOCK_SIZE*INTERNAL_BLOCK_SIZE block inside that + const int dest_block_offset = offset_part1 + offset_part2; // Local memory to store the inverted block of INTERNAL_BLOCK_SIZE by INTERNAL_BLOCK_SIZE __local real lm[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; // Loads the source lower triangle into local memory. Any values in the upper triangle or // outside of the matrix are set to zero - #pragma unroll for (int _j = 0; _j < INTERNAL_BLOCK_SIZE; _j += 1) { - const bool condition = (is_upper) ? (thread_index <= _j && block_index*INTERNAL_BLOCK_SIZE + _j < n) : - (thread_index >= _j && block_index*INTERNAL_BLOCK_SIZE + thread_index < n); + bool condition = false; + if (is_upper) { + condition = (thread_index <= _j) && (block_index_per_block + _j < n); + } + else { + condition = (thread_index >= _j) && (block_index_per_block + thread_index < n); + } if (condition) { - lm[thread_index][_j] = src[_j*src_ld + thread_index + src_block_offset]; + const int src_index = _j*src_ld + thread_index + src_block_offset; + lm[thread_index][_j] = src[src_index]; } else { SetToZero(lm[thread_index][_j]); } } barrier(CLK_LOCAL_MEM_FENCE); - + // Inverts the diagonal real inverted_diagonal; SetToOne(inverted_diagonal); |