diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-10 14:01:21 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-09-10 14:01:21 +0200 |
commit | 94163970ae2e4fec2e86de9640a546fbeee23324 (patch) | |
tree | fd92fb20b3ddee3b24438068d36902e893219248 /src | |
parent | a2f83507033a20b534099c7b21d4a7466108e949 (diff) | |
parent | e21f32bc9928f87a8d0ff15797e2ed2ab65ded58 (diff) |
Merge branch 'xgemm_tuner_exhaustive' into development
Diffstat (limited to 'src')
-rw-r--r-- | src/database/kernels/copy.hpp | 2 | ||||
-rw-r--r-- | src/database/kernels/xaxpy.hpp | 8 | ||||
-rw-r--r-- | src/database/kernels/xgemm.hpp | 2 | ||||
-rw-r--r-- | src/database/kernels/xgemv.hpp | 2 | ||||
-rw-r--r-- | src/database/kernels/xgemv_fast.hpp | 8 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm.cpp | 85 |
6 files changed, 71 insertions, 36 deletions
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index dc2011fd..a6b7dfe8 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -87,7 +87,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 60fa7555..6e84ca5a 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -79,10 +79,10 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 1070", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, + { "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, @@ -209,7 +209,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, - { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index c960592d..cc81cf6a 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -158,7 +158,7 @@ const Database::DatabaseEntry Database::XgemmDouble = { "Xgemm", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 7e8e64e3..03e84525 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index f5e3e630..c12fcdca 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -60,7 +60,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, } }, { // Intel accelerators @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, } @@ -123,7 +123,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, - { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, { // Intel accelerators @@ -145,7 +145,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index eb7c8a66..7c9ac76a 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -7,7 +7,9 @@ // Author(s): // Cedric Nugteren <www.cedricnugteren.nl> // -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. +// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations: +// - V==1: This tests some limited set of tuning parameters exhaustively. +// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // // ================================================================================================= @@ -21,12 +23,12 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class -template <typename T> +template <typename T, int V> class TuneXgemm { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xgemm"; } + static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; } static std::string KernelName() { return "Xgemm"; } static std::string GetSources() { return @@ -48,7 +50,7 @@ class TuneXgemm { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } - static double DefaultFraction() { return 256.0; } + static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel @@ -60,20 +62,38 @@ class TuneXgemm { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2}); - tuner.AddParameter(id, "VWM", {1, 2, 4}); - tuner.AddParameter(id, "VWN", {1, 2, 4}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + tuner.AddParameter(id, "MWG", {16, 32, 64}); + tuner.AddParameter(id, "NWG", {16, 32, 64}); + tuner.AddParameter(id, "KWG", {32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4}); + tuner.AddParameter(id, "VWN", {1, 2, 4}); + tuner.AddParameter(id, "STRM", {0}); + tuner.AddParameter(id, "STRN", {0}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } // a lot more tuning parameters - has to be sampled randomly, too much to test all + else { + tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "KWG", {16, 32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "STRM", {0, 1}); + tuner.AddParameter(id, "STRN", {0, 1}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } } // Sets the constraints @@ -92,6 +112,14 @@ class TuneXgemm { // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; }; + tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"}); + tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"}); + tuner.AddConstraint(id, IsEqual, {"SA", "SB"}); + } } // Sets the local memory size @@ -145,15 +173,22 @@ class TuneXgemm { using float2 = clblast::float2; using double2 = clblast::double2; -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { +// Function to tune a specific variation V (not within the clblast namespace) +template <int V> +void StartVariation(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2>, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half,V>, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float,V>, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double,V>, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2,V>, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2,V>, double2>(argc, argv); break; } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); return 0; } |