summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2016-09-27 21:03:24 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2016-09-27 21:03:24 +0200
commit d59e5c570b0bbdb8348d2f9ee6fc5850e606db27 (patch)
tree 603b7f7631c46ec55ede97fd3205affdbc45a169
parent db5772e521a03602c8f66be95a4dc4d07b83cd84 (diff)
Added an option to run tuned kernels multiple times to average execution times; requires CLTune 2.5.0
-rw-r--r-- CHANGELOG 1
-rw-r--r-- README.md 2
-rw-r--r-- src/tuning/tuning.hpp 5
3 files changed, 6 insertions, 2 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 6f6732f0..9adb6e64 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,7 @@ Development version (next release)
- It is now possible to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
- Fixed a bug in the tests and samples related to waiting for an invalid event
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
+- Added an option to run tuned kernels multiple times to average execution times
- Various minor fixes and enhancements
Version 0.9.0
diff --git a/README.md b/README.md
index 025052a6..f53b4dda 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
cmake -DTUNERS=ON ..
-Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
+Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.5.0 or higher).
Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables before running CMake.
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 13bae5a6..8fa93efc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -46,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
if (o == kArgBeta) { args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
}
+ const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, size_t{1});
+
fprintf(stdout, "%s\n", help.c_str());
// Tests validity of the given arguments
@@ -127,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
// Starts the tuning process
+ tuner.SetNumRuns(num_runs);
tuner.Tune();
// Prints the results to screen
@@ -135,7 +138,7 @@ void Tuner(int argc, char* argv[]) {
// Also prints the performance of the best-case in terms of GB/s or GFLOPS
if (time_ms != 0.0) {
- printf("[ -------> ] %.1lf ms", time_ms);
+ printf("[ -------> ] %.2lf ms", time_ms);
printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
}