summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG1
-rw-r--r--CMakeLists.txt8
-rw-r--r--doc/tuning.md4
-rw-r--r--src/tuning/routines/xgemm.cpp18
4 files changed, 28 insertions, 3 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 1d2f3d9d..c86ab70b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,7 @@ Development (next version)
- Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling
- Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs
- Re-added a local memory size constraint to the tuners
+- The routine tuners now automatically pick up the kernel tuners' results from disk
- Updated and reorganised the CLBlast documentation
- Added a 'canary' region to check for overflows in the tuner and tests (inspired by clARMOR)
- Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0715b866..7f3d258a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -388,6 +388,7 @@ if(TUNERS)
src/utilities/clblast_exceptions.cpp
src/utilities/timing.cpp
src/utilities/utilities.cpp
+ test/test_utilities.cpp
src/tuning/configurations.cpp
src/tuning/tuning.cpp
src/kernel_preprocessor.cpp)
@@ -396,6 +397,7 @@ if(TUNERS)
src/utilities/clblast_exceptions.hpp
src/utilities/timing.hpp
src/utilities/utilities.hpp
+ test/test_utilities.hpp
src/tuning/configurations.hpp
src/tuning/tuning.hpp
src/tuning/routines/routine_tuner.hpp
@@ -439,6 +441,12 @@ if(TUNERS)
endforeach()
set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
endforeach()
+ foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
+ foreach(PRECISION ${PRECISIONS})
+ set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
+ endforeach()
+ set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
+ endforeach()
add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})
endif()
diff --git a/doc/tuning.md b/doc/tuning.md
index 60ad2cc7..b5186ac6 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -82,7 +82,7 @@ Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clbla
The kernels `gemm` and `gemm_direct` have too many parameters to explore. Therefore, they will run in two stages: a first stage with a fixed limited number of parameter combinations, and a second stage with a random selection from a much larger search space. The random fraction is determined by the `fraction` argument on the command-line.
-There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
+There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
Using the tuning results
@@ -100,8 +100,6 @@ In summary, tuning the entire library for your device can be done as follows (st
python ../scripts/database/database.py . ..
make
-After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. selecting which method to use: the direct kernel or the in-direct kernel.
-
Tuning using the API (advanced users only)
-------------
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 0721ad7c..ceb91f4d 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -15,8 +15,10 @@
#include <exception>
#include <string>
#include <vector>
+#include <iostream>
#include "utilities/utilities.hpp"
+#include "../test/test_utilities.hpp"
#include "tuning/routines/routine_tuner.hpp"
namespace clblast {
@@ -101,6 +103,22 @@ void TuneXgemm(int argc, char* argv[]) {
const auto context = Context(device);
auto queue = Queue(context, device);
+ // Pre-load GEMM kernel tuning results if they exist
+ printf("* The GEMM routine tuner requires already tuned kernels\n");
+ printf(" Applying tuning results from disk if they exist...\n\n");
+ const auto kernel_names = {"xgemm_1", "xgemm_direct_1", "copy", "pad", "transpose", "padtranspose"};
+ for (const auto& kernel_name : kernel_names) {
+ const auto tuner_file_name = "clblast_" + std::string{kernel_name} + "_" +
+ ToString(static_cast<int>(precision)) + ".json";
+ printf("* Looking for tuning results in the current folder: '%s'\n", tuner_file_name.c_str());
+ if (std::ifstream(tuner_file_name)) { // Checks if the file exists on disk
+ OverrideParametersFromJSONFiles({tuner_file_name}, device(), precision);
+ }
+ else {
+ printf(" Not found: assuming the kernel '%s' is already tuned\n\n", kernel_name);
+ }
+ }
+
// Run the tuners for the XGEMM routines
TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
64, 2048, 64, 1, num_runs,