diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/database/database.cpp | 5 | ||||
-rw-r--r-- | src/database/kernel_selection.hpp | 136 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine.hpp | 14 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_16.hpp | 26 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_32.hpp | 34 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_3232.hpp | 34 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_64.hpp | 26 | ||||
-rw-r--r-- | src/database/kernels/gemm_routine/gemm_routine_6464.hpp | 26 | ||||
-rw-r--r-- | src/routine.cpp | 2 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cpp | 6 | ||||
-rw-r--r-- | src/routines/levelx/xgemmbatched.cpp | 2 | ||||
-rw-r--r-- | src/tuning/routines/xgemm.cpp | 13 | ||||
-rw-r--r-- | src/utilities/timing.hpp | 9 |
13 files changed, 184 insertions, 149 deletions
diff --git a/src/database/database.cpp b/src/database/database.cpp index 836c8803..2fa86151 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -30,10 +30,11 @@ #include "database/kernels/transpose/transpose.hpp" #include "database/kernels/padtranspose/padtranspose.hpp" +#include "database/kernels/gemm_routine/gemm_routine.hpp" + #include "database/kernels/xtrsv.hpp" #include "database/kernels/invert.hpp" #include "database/apple_cpu_fallback.hpp" -#include "database/kernel_selection.hpp" namespace clblast { // ================================================================================================= @@ -54,7 +55,7 @@ const std::vector<database::DatabaseEntry> Database::database = std::vector<data database::TransposeHalf, database::TransposeSingle, database::TransposeDouble, database::TransposeComplexSingle, database::TransposeComplexDouble, database::PadtransposeHalf, database::PadtransposeSingle, database::PadtransposeDouble, database::PadtransposeComplexSingle, database::PadtransposeComplexDouble, database::InvertHalf, database::InvertSingle, database::InvertDouble, database::InvertComplexSingle, database::InvertComplexDouble, - database::KernelSelectionHalf, database::KernelSelectionSingle, database::KernelSelectionDouble, database::KernelSelectionComplexSingle, database::KernelSelectionComplexDouble + database::GemmRoutineHalf, database::GemmRoutineSingle, database::GemmRoutineDouble, database::GemmRoutineComplexSingle, database::GemmRoutineComplexDouble }; const std::vector<database::DatabaseEntry> Database::apple_cpu_fallback = std::vector<database::DatabaseEntry>{ database::XaxpyApple, database::XdotApple, diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp deleted file mode 100644 index 6d74b9f9..00000000 --- a/src/database/kernel_selection.hpp +++ /dev/null @@ -1,136 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren <www.cedricnugteren.nl> -// -// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel -// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the -// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use -// some common default values. -// -// ================================================================================================= - -namespace clblast { -namespace database { -// ================================================================================================= - -const DatabaseEntry KernelSelectionHalf = { - "KernelSelection", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionSingle = { - "KernelSelection", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { - kDeviceTypeGPU, "ARM", { - { "default", { { kDeviceNameDefault, Params{ 128*128*128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionComplexSingle = { - "KernelSelection", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionDouble = { - "KernelSelection", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= - -const DatabaseEntry KernelSelectionComplexDouble = { - "KernelSelection", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "default", { { kDeviceNameDefault, Params{ 1*1*1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "default", { { kDeviceNameDefault, Params{ 1280*1280*1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { { kDeviceNameDefault, Params{ 512*512*512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } } }, - } - }, - } -}; - -// ================================================================================================= -} // namespace database -} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine.hpp b/src/database/kernels/gemm_routine/gemm_routine.hpp new file mode 100644 index 00000000..f1470252 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine.hpp @@ -0,0 +1,14 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine' kernels. +// +// ================================================================================================= + +#include "database/kernels/gemm_routine/gemm_routine_16.hpp" +#include "database/kernels/gemm_routine/gemm_routine_32.hpp" +#include "database/kernels/gemm_routine/gemm_routine_3232.hpp" +#include "database/kernels/gemm_routine/gemm_routine_64.hpp" +#include "database/kernels/gemm_routine/gemm_routine_6464.hpp" diff --git a/src/database/kernels/gemm_routine/gemm_routine_16.hpp b/src/database/kernels/gemm_routine/gemm_routine_16.hpp new file mode 100644 index 00000000..e17afe4b --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_16.hpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine16' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineHalf = { + "GemmRoutine", Precision::kHalf, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp new file mode 100644 index 00000000..624de564 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine32' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineSingle = { + "GemmRoutine", Precision::kSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { + { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp new file mode 100644 index 00000000..689ae8d8 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp @@ -0,0 +1,34 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine3232' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineComplexSingle = { + "GemmRoutine", Precision::kComplexSingle, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "default", { + { Name{"Intel(R) HD Graphics Skylake ULT GT2 "}, Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp new file mode 100644 index 00000000..7fd29128 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine64' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineDouble = { + "GemmRoutine", Precision::kDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp new file mode 100644 index 00000000..85d2c8f1 --- /dev/null +++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp @@ -0,0 +1,26 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. It +// is auto-generated by the 'scripts/database/database.py' Python script. +// +// This file populates the database with best-found tuning parameters for the 'Gemm_Routine6464' kernels. +// +// ================================================================================================= + +namespace clblast { +namespace database { + +const DatabaseEntry GemmRoutineComplexDouble = { + "GemmRoutine", Precision::kComplexDouble, {"XGEMM_MIN_INDIRECT_SIZE"}, { + { // Default + kDeviceTypeAll, "default", { + { "default", { + { kDeviceNameDefault , Params{ 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + } }, + } + }, + } +}; + +} // namespace database +} // namespace clblast diff --git a/src/routine.cpp b/src/routine.cpp index 0f9fe360..48273eac 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -43,7 +43,7 @@ const std::unordered_map<std::string, const std::vector<std::string>> Routine::r {"Padtranspose", routines_gemm_syrk}, {"Xgemm", routines_gemm_syrk}, {"XgemmDirect", routines_gemm}, - {"KernelSelection", routines_gemm}, + {"GemmRoutine", routines_gemm}, {"Invert", routines_trsm}, }; // ================================================================================================= diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index a0063ee2..94392dd0 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -23,7 +23,7 @@ namespace clblast { template <typename T> Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, - {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, + {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" @@ -104,7 +104,9 @@ void Xgemm<T>::DoGemm(const Layout layout, // Selects which version of GEMM to run const auto m_n_k = static_cast<unsigned long long>(m) * static_cast<unsigned long long>(n) * static_cast<unsigned long long>(k); - const auto do_gemm_direct = (m_n_k < static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"])); + const auto database_value = static_cast<unsigned long long>(db_["XGEMM_MIN_INDIRECT_SIZE"]); + const auto min_indirect_size = database_value * database_value * database_value; + const auto do_gemm_direct = (m_n_k < min_indirect_size); if (do_gemm_direct) { // for small sizes (single kernel) GemmDirect(m, n, k, alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 8a015e97..152e7194 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -23,7 +23,7 @@ namespace clblast { template <typename T> XgemmBatched<T>::XgemmBatched(Queue &queue, EventPointer event, const std::string &name): Routine(queue, event, name, - {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"}, + {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","GemmRoutine"}, PrecisionValue<T>(), {}, { #include "../../kernels/level3/level3.opencl" #include "../../kernels/level3/copy_fast.opencl" diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp index 1ccaa0ca..f45e8635 100644 --- a/src/tuning/routines/xgemm.cpp +++ b/src/tuning/routines/xgemm.cpp @@ -42,7 +42,7 @@ void RunGemmRoutine(const size_t value, const Queue& queue, const std::vector<Bu template <typename T> void ForceSelectIndirectFrom(const size_t minimum_size, const Device &device) { - const auto override_status = OverrideParameters(device(), "KernelSelection", PrecisionValue<T>(), + const auto override_status = OverrideParameters(device(), "GemmRoutine", PrecisionValue<T>(), {{"XGEMM_MIN_INDIRECT_SIZE", minimum_size}}); if (override_status != StatusCode::kSuccess) { throw RuntimeError("OverrideParameters failed with status " + ToString(override_status)); @@ -61,7 +61,7 @@ void TuneXgemm(int argc, char* argv[]) { // Values for m, n, and k const auto from = size_t{64}; - const auto to = size_t{1024}; + const auto to = size_t{2048}; const auto step = size_t{64}; // OpenCL initialisation @@ -106,7 +106,10 @@ void TuneXgemm(int argc, char* argv[]) { scores[i] = TuningResult{ "gemm_kernel_selection", static_cast<double>(score) / static_cast<double>(scores.size() - 1) + epsilon, - TuningParameters{TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}} + TuningParameters{ + TuningParameter{"XGEMM_MIN_INDIRECT_SIZE", indirect[i].first}, + TuningParameter{"PRECISION", static_cast<size_t>(precision)} + } }; } @@ -126,11 +129,15 @@ void TuneXgemm(int argc, char* argv[]) { const auto precision_string = std::to_string(static_cast<size_t>(precision)); auto metadata = std::vector<std::pair<std::string,std::string>>{ {"kernel_family", "gemm_routine"}, + {"arg_from", ToString(from)}, + {"arg_to", ToString(to)}, + {"arg_step", ToString(step)}, {"precision", precision_string}, }; PrintTimingsToFileAsJSON("clblast_routine_gemm_" + precision_string + ".json", device, platform, metadata, scores); + printf("[ STATUS ] All done\n"); } // ================================================================================================= diff --git a/src/utilities/timing.hpp b/src/utilities/timing.hpp index 423e6e2b..bfad6147 100644 --- a/src/utilities/timing.hpp +++ b/src/utilities/timing.hpp @@ -73,16 +73,17 @@ void PrintTimingsToFileAsJSON(const std::string &filename, const Device& device, const Platform& platform, const std::vector<std::pair<std::string,std::string>> &metadata, const std::vector<TuningResult>& tuning_results) { + printf("[ STATUS ] Writing results to '%s'\n", filename.c_str()); auto file = fopen(filename.c_str(), "w"); fprintf(file, "{\n"); for (auto &datum: metadata) { fprintf(file, " \"%s\": \"%s\",\n", datum.first.c_str(), datum.second.c_str()); } fprintf(file, " \"platform_version\": \"%s\",\n", platform.Version().c_str()); - fprintf(file, " \"device_name\": \"%s\",\n", GetDeviceName(device).c_str()); - fprintf(file, " \"device_vendor\": \"%s\",\n", platform.Vendor().c_str()); - fprintf(file, " \"device_type\": \"%s\",\n", device.Type().c_str()); - fprintf(file, " \"device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str()); + fprintf(file, " \"clblast_device_name\": \"%s\",\n", GetDeviceName(device).c_str()); + fprintf(file, " \"clblast_device_vendor\": \"%s\",\n", platform.Vendor().c_str()); + fprintf(file, " \"clblast_device_type\": \"%s\",\n", device.Type().c_str()); + fprintf(file, " \"clblast_device_architecture\": \"%s\",\n", GetDeviceArchitecture(device).c_str()); fprintf(file, " \"device_core_clock\": \"%zu\",\n", device.CoreClock()); fprintf(file, " \"device_compute_units\": \"%zu\",\n", device.ComputeUnits()); fprintf(file, " \"results\": [\n"); |