diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-28 17:32:37 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-28 17:32:37 +0200 |
commit | 12b08ae49154379f7471a40809ace6418857b387 (patch) | |
tree | ef958197db0bb8a67c9a5840f828b3f6c72bd8fc /src/tuning | |
parent | 2949e156f5bfdd724987e67477da3e3608e4aaf9 (diff) | |
parent | fa6e5e67f585b77d34c3031c176de9a0f7904aa9 (diff) |
Merge branch 'master' into android_support
Diffstat (limited to 'src/tuning')
-rw-r--r-- | src/tuning/kernels/copy_fast.cpp | 113 | ||||
-rw-r--r-- | src/tuning/kernels/copy_pad.cpp | 113 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_fast.cpp | 113 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_pad.cpp | 111 | ||||
-rw-r--r-- | src/tuning/kernels/xaxpy.cpp | 110 | ||||
-rw-r--r-- | src/tuning/kernels/xdot.cpp | 105 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm.cpp | 202 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm_direct.cpp | 184 | ||||
-rw-r--r-- | src/tuning/kernels/xgemv.cpp | 131 | ||||
-rw-r--r-- | src/tuning/kernels/xger.cpp | 112 | ||||
-rw-r--r-- | src/tuning/tuning.hpp | 147 |
11 files changed, 681 insertions, 760 deletions
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index c9bf478c..068c5f1b 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -25,70 +25,64 @@ template <typename T> class TuneCopy { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "copy"; } - static std::string KernelName() { return "CopyMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_fast.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "copy"; + settings.kernel_name = "CopyMatrixFast"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/copy_fast.opencl" + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"COPY_DIMX", "COPY_DIMY"}}; + settings.div_global = {{"COPY_VW", "COPY_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters 
= { + {"COPY_DIMX", {8, 16, 32}}, + {"COPY_DIMY", {8, 16, 32}}, + {"COPY_WPT", {1, 2, 4, 8}}, + {"COPY_VW", {1, 2, 4, 8}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner 
&tuner, const size_t id) { - tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); - tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1, 1}; } - static std::vector<size_t> LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, @@ -99,17 +93,6 @@ class TuneCopy { tuner.AddArgumentOutput(b_mat); tuner.AddArgumentScalar(GetRealArg(args.alpha)); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/copy_pad.cpp 
b/src/tuning/kernels/copy_pad.cpp index 23f52d75..7102d05d 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -25,70 +25,64 @@ template <typename T> class TunePad { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "pad"; } - static std::string KernelName() { return "CopyPadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_pad.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "pad"; + settings.kernel_name = "CopyPadMatrix"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/copy_pad.opencl" + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"PAD_DIMX", "PAD_DIMY"}}; + settings.div_global = {{"PAD_WPTX", "PAD_WPTY"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"PAD_DIMX", {8, 16, 32}}, + {"PAD_DIMY", {8, 16, 
32}}, + {"PAD_WPTX", {1, 2, 4}}, + {"PAD_WPTY", {1, 2, 4}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PAD_DIMX", {8, 
16, 32}); - tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); - tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1, 1}; } - static std::vector<size_t> LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, @@ -107,17 +101,6 @@ class TunePad { tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 308663d8..56726903 100644 --- 
a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -25,53 +25,60 @@ template <typename T> class TuneTranspose { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "transpose"; } - static std::string KernelName() { return "TransposeMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_fast.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "transpose"; + settings.kernel_name = "TransposeMatrixFast"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/transpose_fast.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments<T> &) { } + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"TRA_DIM", "TRA_DIM"}}; + settings.div_global = {{"TRA_WPT", "TRA_WPT"}}; + + // Sets the tuning parameters and their possible values + 
settings.parameters = { + {"TRA_DIM", {4, 8, 16, 32, 64}}, + {"TRA_WPT", {1, 2, 4, 8, 16}}, + {"TRA_PAD", {0, 1}}, + {"TRA_SHUFFLE", {0, 1}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64}); - 
tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "TRA_PAD", {0, 1}); - tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments<T> &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { @@ -81,19 +88,6 @@ class TuneTranspose { tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1, 1}; } - static std::vector<size_t> LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, @@ -104,17 +98,6 @@ class TuneTranspose { tuner.AddArgumentOutput(b_mat); tuner.AddArgumentScalar(GetRealArg(args.alpha)); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // 
================================================================================================= diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 304702de..dc46e903 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -25,52 +25,59 @@ template <typename T> class TunePadTranspose { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "padtranspose"; } - static std::string KernelName() { return "TransposePadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_pad.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "padtranspose"; + settings.kernel_name = "TransposePadMatrix"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/transpose_pad.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments<T> &) { } + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // 
Transforms the thread configuration based on the parameters + settings.mul_local = {{"PADTRA_TILE", "PADTRA_TILE"}}; + settings.div_global = {{"PADTRA_WPT", "PADTRA_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"PADTRA_TILE", {8, 16, 32, 64}}, + {"PADTRA_WPT", {1, 2, 4, 8, 16}}, + {"PADTRA_PAD", {0, 1}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A 
for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64}); - tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments<T> &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) { @@ -80,19 +87,6 @@ class TunePadTranspose { tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1, 1}; } - static std::vector<size_t> LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, @@ -111,17 +105,6 @@ class TunePadTranspose { tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { 
return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index f8e1d93e..e201949a 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -25,19 +25,54 @@ template <typename T> class TuneXaxpy { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "xaxpy"; } - static std::string KernelName() { return "XaxpyFastest"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/level1.opencl" - #include "../src/kernels/level1/xaxpy.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgN, kArgAlpha}; + settings.default_n = 4096*1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "xaxpy"; + settings.kernel_name = "XaxpyFastest"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level1/level1.opencl" +#include "../src/kernels/level1/xaxpy.opencl" + ; + + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.n; + + // Sets the base thread configuration + settings.global_size = {args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1}; + settings.local_size_ref = {64}; + + // Transforms the thread configuration based on the 
parameters + settings.mul_local = {{"WGS"}}; + settings.div_global = {{"WPT"},{"VW"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS", {64, 128, 256, 512, 1024, 2048}}, + {"WPT", {1, 2, 4, 8}}, + {"VW", {1, 2, 4, 8}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 3 * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &args) { @@ -46,52 +81,10 @@ class TuneXaxpy { } } - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 4096*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;} // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &args) { return args.n; } - static size_t GetSizeY(const Arguments<T> &args) { return args.n; } - static size_t GetSizeA(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - 
static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048}); - tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "VW", {1, 2, 4, 8}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1}; } - static std::vector<size_t> LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"WGS"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"},{"VW"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &x_vec, std::vector<T> &y_vec, @@ -102,17 +95,6 @@ class TuneXaxpy { tuner.AddArgumentInput(x_vec); tuner.AddArgumentOutput(y_vec); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return 3 * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // 
================================================================================================= diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index c3b5361e..fb532680 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -26,66 +26,60 @@ template <typename T, int V> class TuneXdot { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "xdot_"+std::to_string(V); } - static std::string KernelName() { return (V==1) ? "Xdot" : "XdotEpilogue"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/xdot.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgN}; + settings.default_n = 2*1024*1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgN}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); - // Tests for valid arguments - static void TestValidArguments(const Arguments<T> &) { } + // Identification of the kernel + settings.kernel_family = "xdot_"+std::to_string(V); + settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level1/xdot.opencl" + ; + + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.n; + settings.size_temp = args.n; // Worst case + + // Sets the base thread configuration + settings.global_size = (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1}; + settings.global_size_ref = (V==1) ? 
std::vector<size_t>{2*64*64} : std::vector<size_t>{64}; + settings.local_size = {1}; + settings.local_size_ref = {64}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; + settings.mul_global = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); + settings.performance_unit = (V==1) ? "GB/s" : "N/A"; - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 2*1024*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &args) { return args.n; } - static size_t GetSizeY(const Arguments<T> &args) { return args.n; } - static size_t GetSizeA(const Arguments<T> &) { 
return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments<T> &args) { return args.n; } // Worst case - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments<T> &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &) { return (V==1) ? std::vector<size_t>{2*64} : std::vector<size_t>{1}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &) { return (V==1) ? std::vector<size_t>{2*64*64} : std::vector<size_t>{64}; } - static std::vector<size_t> LocalSize() { return {1}; } - static std::vector<size_t> LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return (V==1) ? 
TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivGlobal() { return {}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &x_vec, std::vector<T> &y_vec, @@ -108,17 +102,6 @@ class TuneXdot { tuner.AddArgumentScalar(0); } } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return (V==1) ? "GB/s" : "N/A"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index fa6b3085..6dcdf68b 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -27,88 +27,111 @@ template <typename T, int V> class TuneXgemm { public: - // The representative kernel and the source code - static std::string KernelFamily() { return (V==1) ? 
"xgemm_1" : "xgemm_2"; } - static std::string KernelName() { return "Xgemm"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_part1.opencl" - #include "../src/kernels/level3/xgemm_part2.opencl" - #include "../src/kernels/level3/xgemm_part3.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, + kArgHeuristicSelection, kArgPsoSwarmSize, + kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; + settings.default_m = 1024; + settings.default_n = 1024; + settings.default_k = 1024; + settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly + settings.default_num_runs = 2; + settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch); + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { - return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, - kArgHeuristicSelection, kArgPsoSwarmSize, - kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; - } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = (V==1) ? 
"xgemm_1" : "xgemm_2"; + settings.kernel_name = "Xgemm"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/xgemm_part1.opencl" +#include "../src/kernels/level3/xgemm_part2.opencl" +#include "../src/kernels/level3/xgemm_part3.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments<T> &) { } + // Buffer sizes + settings.size_a = args.m * args.k; + settings.size_b = args.n * args.k; + settings.size_c = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1024; } - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return (V==1) ? 
1.0 : 512.0; } // test all or sample randomly - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } - static double DefaultInfluenceGlobalPSO(){ return 0.1; } - static double DefaultInfluenceLocalPSO(){ return 0.3; } - static double DefaultInfluenceRandomPSO(){ return 0.6; } - static size_t DefaultHeuristic(){ return static_cast<size_t>(cltune::SearchMethod::RandomSearch); } - static double DefaultMaxTempAnn(){ return 1.0;} - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; } - static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; } - static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"MDIMC", "NDIMC"}}; + settings.mul_global = {{"MDIMC", "NDIMC"}}; + settings.div_global = {{"MWG", "NWG"}}; + + // Sets the tuning parameters and their possible values if (V==1) { // limited subset of tuning parameters - but explorable exhaustively - tuner.AddParameter(id, "MWG", {16, 32, 64}); - tuner.AddParameter(id, "NWG", {16, 32, 64}); - tuner.AddParameter(id, "KWG", {32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2}); - tuner.AddParameter(id, "VWM", {1, 2, 4}); - tuner.AddParameter(id, "VWN", {1, 2, 4}); - 
tuner.AddParameter(id, "STRM", {0}); - tuner.AddParameter(id, "STRN", {0}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); - } // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"MWG", {16, 32, 64}}, + {"NWG", {16, 32, 64}}, + {"KWG", {32}}, + {"MDIMC", {8, 16, 32}}, + {"NDIMC", {8, 16, 32}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {1, 2, 4}}, + {"VWN", {1, 2, 4}}, + {"STRM", {0}}, + {"STRN", {0}}, + {"SA", {0, 1}}, + {"SB", {0, 1}}, + }; + } + else { // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"MWG", {16, 32, 64, 128}}, + {"NWG", {16, 32, 64, 128}}, + {"KWG", {16, 32}}, + {"MDIMC", {8, 16, 32}}, + {"NDIMC", {8, 16, 32}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {1, 2, 4, 8}}, + {"VWN", {1, 2, 4, 8}}, + {"STRM", {0, 1}}, + {"STRN", {0, 1}}, + {"SA", {0, 1}}, + {"SB", {0, 1}}, + }; + } + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * args.k; + settings.performance_unit = "GFLOPS"; + + // Returns which search heuristic to use + if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); } else { - //RANDOM_SEARCH & PSO - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2}); - tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); + // Use 
full-search to explore all parameter combinations or another strategy to search only a + // part of the parameter values. The fraction is set as a command-line argument. + if (args.fraction == 1.0 || args.fraction == 0.0) { + settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); + } else { + settings.heuristic = args.heuristic_selection; + } } + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments<T> &) { } + // Sets the constraints static void SetConstraints(cltune::Tuner &tuner, const size_t id) { auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; @@ -144,19 +167,6 @@ class TuneXgemm { "SB", "KWG", "NWG"}); } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1, 1}; } - static std::vector<size_t> LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivGlobal() { return {{"MWG", "NWG"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, @@ -170,27 +180,9 @@ class TuneXgemm { tuner.AddArgumentInput(a_mat); tuner.AddArgumentInput(b_mat); tuner.AddArgumentOutput(c_mat); + tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(0); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return 2 * args.m * args.n * args.k; - } - static std::string PerformanceUnit() { return 
"GFLOPS"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - if (V==1) { return static_cast<size_t>(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - return static_cast<size_t>(cltune::SearchMethod::FullSearch); - } else { - return args.heuristic_selection; - } - } - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 03b40a50..619fb37a 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -27,78 +27,103 @@ template <typename T, int V> class TuneXgemmDirect { public: - // The representative kernel and the source code - static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; } - static std::string KernelName() { return "XgemmDirectTN"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_direct_part1.opencl" - #include "../src/kernels/level3/xgemm_direct_part2.opencl" - #include "../src/kernels/level3/xgemm_direct_part3.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, + kArgHeuristicSelection, kArgPsoSwarmSize, + kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; + settings.default_m = 256; + settings.default_n = 256; + settings.default_k = 256; + settings.default_fraction = (V==1) ? 
1.0 : 32.0; // test all or sample randomly + settings.default_num_runs = 4; + settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch); + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { - return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, - kArgHeuristicSelection, kArgPsoSwarmSize, - kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; - } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; + settings.kernel_name = "XgemmDirectTN"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/xgemm_direct_part1.opencl" +#include "../src/kernels/level3/xgemm_direct_part2.opencl" +#include "../src/kernels/level3/xgemm_direct_part3.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments<T> &) { } + // Buffer sizes + settings.size_a = args.m * args.k; + settings.size_b = args.n * args.k; + settings.size_c = args.m * args.n; - // Sets the default values for the arguments - static size_t DefaultM() { return 256; } - static size_t DefaultN() { return 256; } - static size_t DefaultK() { return 256; } - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return (V==1) ? 
1.0 : 32.0; } // test all or sample randomly - static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } - static double DefaultInfluenceGlobalPSO(){ return 0.1; } - static double DefaultInfluenceLocalPSO(){ return 0.3; } - static double DefaultInfluenceRandomPSO(){ return 0.6; } - static size_t DefaultHeuristic(){ return static_cast<size_t>(cltune::SearchMethod::RandomSearch);} - static double DefaultMaxTempAnn(){ return 1.0;} - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; } - static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; } - static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"MDIMCD", "NDIMCD"}}; + settings.mul_global = {{"MDIMCD", "NDIMCD"}}; + settings.div_global = {{"WGD", "WGD"}}; + + // Sets the tuning parameters and their possible values if (V==1) { // limited subset of tuning parameters - but explorable exhaustively - tuner.AddParameter(id, "WGD", {8, 16, 32}); - tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); - 
tuner.AddParameter(id, "KWID", {2}); - tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); - tuner.AddParameter(id, "PADA", {1}); - tuner.AddParameter(id, "PADB", {1}); - } // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"WGD", {8, 16, 32}}, + {"MDIMCD", {8, 16, 32}}, + {"NDIMCD", {8, 16, 32}}, + {"MDIMAD", {8, 16, 32}}, + {"NDIMBD", {8, 16, 32}}, + {"KWID", {2}}, + {"VWMD", {1, 2, 4, 8}}, + {"VWND", {1, 2, 4, 8}}, + {"PADA", {1}}, + {"PADB", {1}}, + }; + } + else { // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"WGD", {8, 16, 32, 64, 128}}, + {"MDIMCD", {8, 16, 32}}, + {"NDIMCD", {8, 16, 32}}, + {"MDIMAD", {8, 16, 32}}, + {"NDIMBD", {8, 16, 32}}, + {"KWID", {2, 8, 16}}, + {"VWMD", {1, 2, 4, 8}}, + {"VWND", {1, 2, 4, 8}}, + {"PADA", {0, 1}}, + {"PADB", {0, 1}}, + }; + } + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * args.k; + settings.performance_unit = "GFLOPS"; + + // Returns which search heuristic to use + if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); } else { - tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128}); - tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); - tuner.AddParameter(id, "KWID", {2, 8, 16}); - tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); - tuner.AddParameter(id, "PADA", {0, 1}); - tuner.AddParameter(id, "PADB", {0, 1}); + // Use full-search to explore all parameter combinations or another strategy to search only a + // part of the parameter values. The fraction is set as a command-line argument. 
+ if (args.fraction == 1.0 || args.fraction == 0.0) { + settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); + } else { + settings.heuristic = args.heuristic_selection; + } } + + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments<T> &) { } + // Sets the constraints static void SetConstraints(cltune::Tuner &tuner, const size_t id) { auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; @@ -132,19 +157,6 @@ class TuneXgemmDirect { tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"}); } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1, 1}; } - static std::vector<size_t> LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; } - static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &, std::vector<T> &, @@ -168,26 +180,6 @@ class TuneXgemmDirect { tuner.AddArgumentScalar(0); // a_conjugate tuner.AddArgumentScalar(0); // b_conjugate } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return 2 * args.m * args.n * args.k; - } - static std::string PerformanceUnit() { return "GFLOPS"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - if (V==1) { return 
static_cast<size_t>(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - return static_cast<size_t>(cltune::SearchMethod::FullSearch); - } else { - return args.heuristic_selection; - } - } - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 00115b6c..e66b15f1 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -28,63 +28,77 @@ template <typename T, int V> class TuneXgemv { public: - // The representative kernel and the source code - static std::string KernelFamily() { return (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); } - static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/xgemv.opencl" - #include "../src/kernels/level2/xgemv_fast.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha, kArgBeta}; + settings.default_m = 2048; + settings.default_n = 2048; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha, kArgBeta}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); + settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? 
"XgemvFast" : "XgemvFastRot"); + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level2/xgemv.opencl" +#include "../src/kernels/level2/xgemv_fast.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments<T> &) { } + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.m; + settings.size_a = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1}; + settings.local_size_ref = {64}; - // Sets the default values for the arguments - static size_t DefaultM() { return 2048; } - static size_t DefaultN() { return 2048; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &args) { return args.n; } - static size_t GetSizeY(const Arguments<T> &args) { return args.m; } - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const 
Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS"+std::to_string(V)}}; + settings.div_global = (V==1 || V==2) ? + TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} : + TunerSettings::TransformVector{}; + + // Sets the tuning parameters and their possible values if (V==1) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + settings.parameters = { + {"WGS"+std::to_string(V), {32, 64, 128, 256}}, + {"WPT"+std::to_string(V), {1, 2, 4}}, + }; } if (V==2) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); - tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + settings.parameters = { + {"WGS"+std::to_string(V), {16, 32, 64, 128, 256}}, + {"WPT"+std::to_string(V), {1, 2, 4}}, + {"VW"+std::to_string(V), {1, 2, 4, 8}}, + }; } if (V==3) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); - tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + settings.parameters = { + {"WGS"+std::to_string(V), {16, 32, 64, 128}}, + {"WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}}, + {"VW"+std::to_string(V), {1, 2, 4, 8}}, + }; } + + // Describes how to compute the performance metrics + settings.metric_amount = (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments<T> &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &tuner, const size_t id) { if (V==2 || V==3) { @@ -107,22 +121,6 @@ class 
TuneXgemv { } } - // Sets the base thread configuration - static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1}; } - static std::vector<size_t> LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { - if (V==1 || V==2) return {{"WPT"+std::to_string(V)}}; - return {}; - } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &x_vec, std::vector<T> &y_vec, @@ -148,17 +146,6 @@ class TuneXgemv { tuner.AddArgumentScalar(0); // Banded 'kl' tuner.AddArgumentScalar(0); // Banded 'ku' } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index 14a98761..c2eb1d31 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -25,69 +25,64 @@ template <typename T> class TuneXger { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "xger"; } - static std::string KernelName() { return "Xger"; } - static std::string 
GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/level2.opencl" - #include "../src/kernels/level2/xger.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector<std::string> GetOptions() { return {kArgN, kArgM, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "xger"; + settings.kernel_name = "Xger"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level2/level2.opencl" +#include "../src/kernels/level2/xger.opencl" + ; + + // Buffer sizes + settings.size_x = args.m; + settings.size_y = args.n; + settings.size_a = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS1", "WGS2"}}; + settings.div_global = {{"WPT", "WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS1", {4, 8, 16, 32, 64, 128, 256, 512}}, + {"WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}}, + {"WPT", {1, 2, 4}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments<T> &) { } - // Sets the default values for the arguments 
- static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; } // N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast<size_t> (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments<T> &args) { return args.m; } - static size_t GetSizeY(const Arguments<T> &args) { return args.n; } - static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments<T> &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512}); - tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}); - tuner.AddParameter(id, "WPT", {1, 2, 4}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments<T> &) { } - // Sets the base thread configuration - static std::vector<size_t> 
GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; } - static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); } - static std::vector<size_t> LocalSize() { return {1, 1}; } - static std::vector<size_t> LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector<std::vector<std::string>>; - static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args, std::vector<T> &x_vec, std::vector<T> &y_vec, @@ -107,17 +102,6 @@ class TuneXger { tuner.AddArgumentScalar(static_cast<int>(args.m)); // a_ld tuner.AddArgumentScalar(0); // a_is_rowmajor } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments<T> &args) { - return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments<T> &args){ - return static_cast<size_t> (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index 1f9b6f4f..bc9c0e03 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -18,6 +18,7 @@ #include <vector> #include <string> #include <random> +#include <utility> #include <cltune.h> @@ -26,6 +27,73 @@ namespace clblast { // ================================================================================================= +// Structures for the tuners with all the default settings +struct TunerDefaults { + + // The list of arguments relevant for this 
routine + std::vector<std::string> options = {}; + + // Default sizes + size_t default_m = 1; + size_t default_n = 1; + size_t default_k = 1; + + // Other defaults + size_t default_batch_count = 1; + size_t default_num_runs = 10; // run every kernel this many times for averaging + + // Search heuristic defaults + double default_fraction = 1.0; + size_t default_swarm_size_PSO = 8; + double default_influence_global_PSO = 0.1; + double default_influence_local_PSO = 0.3; + double default_influence_random_PSO = 0.6; + size_t default_heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); + double default_max_temp_ann = 1.0; +}; + +// Structures for the tuners with the remaining settings +struct TunerSettings { + + // The representative kernel and the source code + std::string kernel_family; + std::string kernel_name; + std::string sources; + + // Describes how to obtain the sizes of the buffers + size_t size_x = 1; + size_t size_y = 1; + size_t size_a = 1; + size_t size_b = 1; + size_t size_c = 1; + size_t size_temp = 1; + + // Sets the base thread configuration + std::vector<size_t> global_size = {}; + std::vector<size_t> global_size_ref = {}; + std::vector<size_t> local_size = {}; + std::vector<size_t> local_size_ref = {}; + + // Transforms the thread configuration based on the parameters + using TransformVector = std::vector<std::vector<std::string>>; + TransformVector mul_local = {}; + TransformVector div_local = {}; + TransformVector mul_global = {}; + TransformVector div_global = {}; + + // Sets the tuning parameters and their possible values + std::vector<std::pair<std::string, std::vector<size_t>>> parameters; + + // Describes how to compute the performance metrics + size_t metric_amount = 0; + std::string performance_unit = "N/A"; + + // Returns which search heuristic to use + size_t heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); +}; + +// 
================================================================================================= + // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect // the results. Used for all types of kernel families. Note that this is a header-only function so // that it is automatically compiled for the various kernels (given as the 'C' template argument). @@ -34,30 +102,31 @@ void Tuner(int argc, char* argv[]) { constexpr auto kSeed = 42; // fixed seed for reproducibility // Sets the parameters and platform/device for which to tune (command-line options) + const TunerDefaults defaults = C::GetTunerDefaults(); auto command_line_args = RetrieveCommandLineArguments(argc, argv); auto help = std::string{"* Options given/available:\n"}; auto args = Arguments<T>{}; args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); args.device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); args.precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle); - for (auto &o: C::GetOptions()) { - if (o == kArgM) { args.m = GetArgument(command_line_args, help, kArgM, C::DefaultM()); } - if (o == kArgN) { args.n = GetArgument(command_line_args, help, kArgN, C::DefaultN()); } - if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, C::DefaultK()); } + for (auto &o: defaults.options) { + if (o == kArgM) { args.m = GetArgument(command_line_args, help, kArgM, defaults.default_m); } + if (o == kArgN) { args.n = GetArgument(command_line_args, help, kArgN, defaults.default_n); } + if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, defaults.default_k); } if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); } - if (o 
== kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, C::DefaultFraction()); } - if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, C::DefaultBatchCount()); } - if (o == kArgHeuristicSelection) {args.heuristic_selection = GetArgument(command_line_args, help, kArgHeuristicSelection, C::DefaultHeuristic()); } - if (o == kArgPsoSwarmSize) {args.pso_swarm_size = GetArgument(command_line_args, help, kArgPsoSwarmSize , C::DefaultSwarmSizePSO()); } - if (o == kArgPsoInfGlobal) {args.pso_inf_global = GetArgument(command_line_args, help, kArgPsoInfGlobal, C::DefaultInfluenceGlobalPSO()); } - if (o == kArgPsoInfLocal) {args.pso_inf_local = GetArgument(command_line_args, help, kArgPsoInfLocal, C::DefaultInfluenceLocalPSO()); } - if (o == kArgPsoInfRandom) {args.pso_inf_random = GetArgument(command_line_args, help, kArgPsoInfRandom, C::DefaultInfluenceRandomPSO()); } - if (o == kArgAnnMaxTemp) {args.ann_max_temperature = GetArgument(command_line_args, help, kArgAnnMaxTemp, C::DefaultMaxTempAnn());} + if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); } + if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); } + if (o == kArgHeuristicSelection) {args.heuristic_selection = GetArgument(command_line_args, help, kArgHeuristicSelection, defaults.default_heuristic); } + if (o == kArgPsoSwarmSize) {args.pso_swarm_size = GetArgument(command_line_args, help, kArgPsoSwarmSize , defaults.default_swarm_size_PSO); } + if (o == kArgPsoInfGlobal) {args.pso_inf_global = GetArgument(command_line_args, help, kArgPsoInfGlobal, defaults.default_influence_global_PSO); } + if (o == kArgPsoInfLocal) {args.pso_inf_local = GetArgument(command_line_args, help, kArgPsoInfLocal, defaults.default_influence_local_PSO); } + if (o == kArgPsoInfRandom) {args.pso_inf_random = 
GetArgument(command_line_args, help, kArgPsoInfRandom, defaults.default_influence_random_PSO); } + if (o == kArgAnnMaxTemp) {args.ann_max_temperature = GetArgument(command_line_args, help, kArgAnnMaxTemp, defaults.default_max_temp_ann); } } - const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, C::DefaultNumRuns()); - + const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs); fprintf(stdout, "%s\n", help.c_str()); + const TunerSettings settings = C::GetTunerSettings(args); // Tests validity of the given arguments C::TestValidArguments(args); @@ -87,12 +156,12 @@ void Tuner(int argc, char* argv[]) { } // Creates input buffers with random data - auto x_vec = std::vector<T>(C::GetSizeX(args)); - auto y_vec = std::vector<T>(C::GetSizeY(args)); - auto a_mat = std::vector<T>(C::GetSizeA(args)); - auto b_mat = std::vector<T>(C::GetSizeB(args)); - auto c_mat = std::vector<T>(C::GetSizeC(args)); - auto temp = std::vector<T>(C::GetSizeTemp(args)); + auto x_vec = std::vector<T>(settings.size_x); + auto y_vec = std::vector<T>(settings.size_y); + auto a_mat = std::vector<T>(settings.size_a); + auto b_mat = std::vector<T>(settings.size_b); + auto c_mat = std::vector<T>(settings.size_c); + auto temp = std::vector<T>(settings.size_temp); std::mt19937 mt(kSeed); std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); PopulateVector(x_vec, mt, dist); @@ -105,15 +174,13 @@ void Tuner(int argc, char* argv[]) { // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); - // Select the search method based on the cmd_line arguments - // If the tuner does not support the selected choice, Full Search will be returned. - auto method = C::GetHeuristic(args); - + // Select the search method based on the command-line arguments + // If the tuner does not support the selected choice, full search will be returned. 
+ auto method = settings.heuristic; if (method == 1) { tuner.UseRandomSearch(1.0/args.fraction); } else if (method == 2) { tuner.UseAnnealing(1.0/args.fraction, args.ann_max_temperature); } - else if (method == 3) { - tuner.UsePSO(1.0/args.fraction, args.pso_swarm_size, args.pso_inf_global, args.pso_inf_local, args.pso_inf_random); - } + else if (method == 3) { tuner.UsePSO(1.0/args.fraction, args.pso_swarm_size, args.pso_inf_global, + args.pso_inf_local, args.pso_inf_random); } else { tuner.UseFullSearch(); } // Set extra settings for specific defines. This mimics src/routine.cc. @@ -127,12 +194,14 @@ void Tuner(int argc, char* argv[]) { } // Loads the kernel sources and defines the kernel to tune - auto sources = defines + C::GetSources(); - auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); - tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); + auto sources = defines + settings.sources; + auto id = tuner.AddKernelFromString(sources, settings.kernel_name, settings.global_size, settings.local_size); + tuner.SetReferenceFromString(sources, settings.kernel_name, settings.global_size_ref, settings.local_size_ref); // Sets the tunable parameters and their possible values - C::SetParameters(tuner, id); + for (const auto &parameter: settings.parameters) { + tuner.AddParameter(id, parameter.first, parameter.second); + } C::SetConstraints(tuner, id); C::SetLocalMemorySize(tuner, id, args); @@ -141,10 +210,10 @@ void Tuner(int argc, char* argv[]) { tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); // Modifies the thread-sizes (both global and local) based on the parameters - for (auto &parameters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); } - for (auto &parameters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); } - for (auto &parameters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); } - for (auto &parameters: C::DivGlobal()) { 
tuner.DivGlobalSize(id, parameters); } + for (auto &parameters: settings.mul_local) { tuner.MulLocalSize(id, parameters); } + for (auto &parameters: settings.div_local) { tuner.DivLocalSize(id, parameters); } + for (auto &parameters: settings.mul_global) { tuner.MulGlobalSize(id, parameters); } + for (auto &parameters: settings.div_global) { tuner.DivGlobalSize(id, parameters); } // Sets the function's arguments C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); @@ -160,20 +229,20 @@ void Tuner(int argc, char* argv[]) { // Also prints the performance of the best-case in terms of GB/s or GFLOPS if (time_ms != 0.0) { printf("[ -------> ] %.2lf ms", time_ms); - printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); + printf(" or %.1lf %s\n", settings.metric_amount/(time_ms*1.0e6), settings.performance_unit.c_str()); } // Outputs the results as JSON to disk, including some meta-data auto precision_string = std::to_string(static_cast<size_t>(args.precision)); auto metadata = std::vector<std::pair<std::string,std::string>>{ - {"kernel_family", C::KernelFamily()}, + {"kernel_family", settings.kernel_family}, {"precision", precision_string}, {"clblast_device_type", device_type}, {"clblast_device_vendor", device_vendor}, {"clblast_device_architecture", device_architecture}, {"clblast_device_name", device_name} }; - for (auto &o: C::GetOptions()) { + for (auto &o: defaults.options) { if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } @@ -181,7 +250,7 @@ void Tuner(int argc, char* argv[]) { if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); } } - tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); + 
tuner.PrintJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", metadata); } |