diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-27 21:03:24 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-27 21:03:24 +0200 |
commit | d59e5c570b0bbdb8348d2f9ee6fc5850e606db27 (patch) | |
tree | 603b7f7631c46ec55ede97fd3205affdbc45a169 | |
parent | db5772e521a03602c8f66be95a4dc4d07b83cd84 (diff) |
Added an option to run tuned kernels multiple times to average execution times; requires CLTune 2.5.0
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | src/tuning/tuning.hpp | 5 |
3 files changed, 6 insertions, 2 deletions
```diff
@@ -3,6 +3,7 @@ Development version (next release)
 - It is now possible to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
 - Fixed a bug in the tests and samples related to waiting for an invalid event
 - Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
+- Added an option to run tuned kernels multiple times to average execution times
 - Various minor fixes and enhancements

 Version 0.9.0
@@ -136,7 +136,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s

     cmake -DTUNERS=ON ..

-Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
+Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.5.0 or higher).

 Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.

```
```diff
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 13bae5a6..8fa93efc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -46,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
     if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
     if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
   }
+  const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{1});
+
   fprintf(stdout, "%s\n", help.c_str());

   // Tests validity of the given arguments
@@ -127,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
   C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);

   // Starts the tuning process
+  tuner.SetNumRuns(num_runs);
   tuner.Tune();

   // Prints the results to screen
@@ -135,7 +138,7 @@ void Tuner(int argc, char* argv[]) {

   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
   if (time_ms != 0.0) {
-    printf("[ -------> ] %.1lf ms", time_ms);
+    printf("[ -------> ] %.2lf ms", time_ms);
     printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
   }

```