diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-11-02 21:47:14 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-11-02 21:47:14 +0100 |
commit | 9b0a435fb00b845b875590be90acffcd4f3bb009 (patch) | |
tree | 754b523789ef717619b540925c97e7167ba28f06 | |
parent | 73272ab97dbd5abe757f6558c9b89665c5ac99d0 (diff) |
Integrated the GEMM routine tuner for kernel selection; added first tuning results
-rw-r--r-- | CHANGELOG | 1 | ||||
-rw-r--r-- | CMakeLists.txt | 6 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | scripts/database/database/clblast.py | 3 | ||||
-rw-r--r-- | src/database/database.cpp | 5 | ||||
-rw-r--r-- | src/database/kernel_selection.hpp | 136 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine.hpp | 14 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_16.hpp | 26 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_32.hpp | 34 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_3232.hpp | 34 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_64.hpp | 26 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_6464.hpp | 26 | ||||
-rw-r--r-- | src/routine.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cpp | 6 | ||||
-rw-r--r-- | src/routines/levelx/xgemmbatched.cpp | 2 | ||||
-rw-r--r-- | src/tuning/routines/xgemm.cpp | 13 | ||||
-rw-r--r-- | src/utilities/timing.hpp | 9 | ||||
-rw-r--r-- | test/routines/level3/xgemm.hpp | 2 |
18 files changed, 193 insertions, 154 deletions
@@ -8,6 +8,7 @@ Development (next version) * All correctness tests and performance clients work on CUDA like they did for OpenCL - Kernels are now cached based on their tuning parameters: fits the use-case of 'OverrideParameters' - Improved performance for small GEMM problems by going from 3 to 1 optional temporary buffers +- GEMM kernel selection (direct vs in-direct) is now done automatically using a new tuner - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) diff --git a/CMakeLists.txt b/CMakeLists.txt index 73b47637..a982d87d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,7 +192,9 @@ endif() set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemm_direct xgemv) set(DATABASES copy pad padtranspose transpose xaxpy xdot - xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger) + xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger + gemm_routine) +set(ROUTINE_TUNERS xgemm) set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax) set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv xtrsv xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2) @@ -231,7 +233,6 @@ set(HEADERS # such that they can be discovered by IDEs such as CLion and Visual src/database/apple_cpu_fallback.hpp src/database/database.hpp src/database/database_structure.hpp - src/database/kernel_selection.hpp src/routines/level1/xamin.hpp src/routines/level1/xmax.hpp src/routines/level1/xmin.hpp @@ -377,7 +378,6 @@ if(TUNERS) target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() - set(ROUTINE_TUNERS xgemm) foreach(ROUTINE_TUNER ${ROUTINE_TUNERS}) add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp) target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast ${CLTUNE_LIBRARIES} ${API_LIBRARIES}) @@ -196,6 +196,8 @@ In summary, tuning the entire library for your device can be done as follows (st Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. At the first next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on. Until `OverrideParameters` is called again of course. See the [API documentation](doc/clblast.md#overrideparameters-override-tuning-parameters-auxiliary-function) for more details. +After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. selecting which method to use: the direct kernel or the in-direct kernel. + Compiling the correctness tests (optional) ------------- diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py index 428bfdda..2b4f734c 100644 --- a/scripts/database/database/clblast.py +++ b/scripts/database/database/clblast.py @@ -23,7 +23,8 @@ DEVICE_TYPE_ATTRIBUTES = ["clblast_device_vendor", "clblast_device_type"] DEVICE_ATTRIBUTES = ["clblast_device_name", "clblast_device_architecture", "device_core_clock", "device_compute_units"] KERNEL_ATTRIBUTES = ["precision", "kernel_family"] -ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] +ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta", + "arg_from", "arg_to", "arg_step"] ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES diff --git a/src/database/database.cpp b/src/database/database.cpp index 836c8803..2fa86151 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -30,10 +30,11 @@ #include "database/kernels/transpose/transpose.hpp" #include "database/kernels/padtranspose/padtranspose.hpp" +#include "database/kernels/gemm_routine/gemm_routine.hpp" + #include "database/kernels/xtrsv.hpp" #include "database/kernels/invert.hpp" #include "database/apple_cpu_fallback.hpp" -#include "database/kernel_selection.hpp" namespace clblast { // ================================================================================================= @@ -54,7 +55,7 @@ const std::vector<database::DatabaseEntry> Database::database = std::vector<data database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble, database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble, database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble, - database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble + database::GemmRoutineHalf, database::GemmRoutineSingle, database::GemmRoutineDouble, database::GemmRoutineComplexSingle, database::GemmRoutineComplexDouble }; const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<database::DatabaseEntry>{ database::XaxpyApple, database::XdotApple, diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp deleted file mode 100644 index 6d74b9f9..00000000 --- a/src/database/kernel_selection.hpp +++ /dev/null @@ -1,136 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel -// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the -// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use -// some common default values. -// -// ================================================================================================= - -namespace clblast { -namespace database { -// ================================================================================================= - -const DatabaseEntry KernelSelectionHalf = { - "KernelSelection", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionSingle = { - "KernelSelection", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { - kDeviceTypeGPU, "ARM", { - { "default", { { kDeviceNameDefault, Params{ 128*128*128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionComplexSingle = { - "KernelSelection", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionDouble = { - "KernelSelection", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionComplexDouble = { - "KernelSelection", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace database -} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine.hpp b/src/database/kernels/gemm_routine/gemm_routine.hpp new file mode 100644 index 00000000..f1470252 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine.hpp @@ -0,0 +1,14 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine' kernels. +// +// ================================================================================================= + +#include "database/kernels/gemm_routine/gemm_routine_16.hpp" +#include "database/kernels/gemm_routine/gemm_routine_32.hpp" +#include "database/kernels/gemm_routine/gemm_routine_3232.hpp" +#include "database/kernels/gemm_routine/gemm_routine_64.hpp" +#include "database/kernels/gemm_routine/gemm_routine_6464.hpp" diff --git a/src/database/kernels/gemm_routine/gemm_routine_16.hpp b/src/database/kernels/gemm_routine/gemm_routine_16.hpp new file mode 100644 index 00000000..e17afe4b --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_16.hpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine16' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineHalf = { + "GemmRoutine", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp new file mode 100644 index 00000000..624de564 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine32' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineSingle = { + "GemmRoutine", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { + { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp new file mode 100644 index 00000000..689ae8d8 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine3232' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineComplexSingle = { + "GemmRoutine", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { + { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp new file mode 100644 index 00000000..7fd29128 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine64' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineDouble = { + "GemmRoutine", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp new file mode 100644 index 00000000..85d2c8f1 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine6464' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineComplexDouble = { + "GemmRoutine", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/routine.cpp b/src/routine.cpp index 0f9fe360..48273eac 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -43,7 +43,7 @@ const std::unordered_map<std::string, const std::vector<std::string>> Routine::r {"Padtranspose", routines_gemm_syrk}, {"Xgemm", routines_gemm_syrk}, {"XgemmDirect", routines_gemm}, - {"KernelSelection", routines_gemm}, + {"GemmRoutine", routines_gemm}, {"Invert", routines_trsm}, }; // ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index a0063ee2..94392dd0 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -23,7 +23,7 @@ namespace clblast { template <typename T> Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, - {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, + {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" @@ -104,7 +104,9 @@ void Xgemm<T>::DoGemm(const Layout layout, // Selects which version of GEMM to run const auto m_n_k = static_cast<unsigned long long>(m) * static_cast<unsigned long long>(n) * static_cast<unsigned long long>(k); - const auto do_gemm_direct = (m_n_k < static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"])); + const auto database_value = static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]); + const auto min_indirect_size = database_value * database_value * database_value; + const auto do_gemm_direct = (m_n_k < min_indirect_size); if (do_gemm_direct) { // for small sizes (single kernel) GemmDirect(m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 8a015e97..152e7194 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -23,7 +23,7 @@ namespace clblast { template <typename T> XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, - {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, + {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp index 1ccaa0ca..f45e8635 100644 --- a/src/tuning/routines/xgemm.cpp +++ b/src/tuning/routines/xgemm.cpp @@ -42,7 +42,7 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu template <typename T> void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) { - const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(), + const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(), {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}}); if (override_status != StatusCode::kSuccess) { throw RuntimeError("OverrideParameters failed with status " + ToString(override_status)); @@ -61,7 +61,7 @@ void TuneXgemm(int argc, char* argv[]) { // Values for m, n, and k const auto from = size_t{64}; - const auto to = size_t{1024}; + const auto to = size_t{2048}; const auto step = size_t{64}; // OpenCL initialisation @@ -106,7 +106,10 @@ void TuneXgemm(int argc, char* argv[]) { scores[i] = TuningResult{ "gemm_kernel_selection", static_cast<double>(score) / static_cast<double>(scores.size() - 1) + epsilon, - TuningParameters{TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}} + TuningParameters{ + TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}, + TuningParameter{"PRECISION", static_cast<size_t>(precision)} + } }; } @@ -126,11 +129,15 @@ void TuneXgemm(int argc, char* argv[]) { const auto precision_string = std::to_string(static_cast<size_t>(precision)); auto metadata = std::vector<std::pair<std::string,std::string>>{ {"kernel_family", "gemm_routine"}, + {"arg_from", ToString(from)}, + {"arg_to", ToString(to)}, + {"arg_step", ToString(step)}, {"precision", precision_string}, }; PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json", device, platform, metadata, scores); + printf("[ STATUS ] All done\n"); } // ================================================================================================= diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp index 423e6e2b..bfad6147 100644 --- a/src/utilities/timing.hpp +++ b/src/utilities/timing.hpp @@ -73,16 +73,17 @@ void PrintTimingsToFileAsJSON(const std::string &filename, const Device& device, const Platform& platform, const std::vector<std::pair<std::string,std::string>> &metadata, const std::vector<TuningResult>& tuning_results) { + printf("[ STATUS ] Writing results to '%s'\n", filename.c_str()); auto file = fopen(filename.c_str(), "w"); fprintf(file, "{\n"); for (auto &datum: metadata) { fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str()); } fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str()); - fprintf(file, " \"device_name\": \"%s\",\n", GetDeviceName(device).c_str()); - fprintf(file, " \"device_vendor\": \"%s\",\n", platform.Vendor().c_str()); - fprintf(file, " \"device_type\": \"%s\",\n", device.Type().c_str()); - fprintf(file, " \"device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str()); + fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str()); + fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str()); + fprintf(file, " \"clblast_device_type\": \"%s\",\n", device.Type().c_str()); + fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str()); fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock()); fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits()); fprintf(file, " \"results\": [\n"); diff --git a/test/routines/level3/xgemm.hpp b/test/routines/level3/xgemm.hpp index 8444c1c3..fe8cf7b9 100644 --- a/test/routines/level3/xgemm.hpp +++ b/test/routines/level3/xgemm.hpp @@ -86,7 +86,7 @@ class TestXgemm { if (V != 0) { const auto device = queue.GetDevice(); const auto switch_threshold = (V == 1) ? size_t{0} : size_t{1024 * 1024 * 1024}; // large enough for tests - const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(), + const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(), {{"XGEMM_MIN_INDIRECT_SIZE", switch_threshold}}); if (override_status != StatusCode::kSuccess) { return override_status; } } |