From c151ab1325bc796aed386f456258b1b8b05aefa6 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 30 Sep 2017 20:26:26 +0200 Subject: Refactored the tuning architecture: less duplicate now; more defaults --- src/tuning/kernels/copy_fast.cpp | 113 ++++++++----------- src/tuning/kernels/copy_pad.cpp | 113 ++++++++----------- src/tuning/kernels/transpose_fast.cpp | 113 ++++++++----------- src/tuning/kernels/transpose_pad.cpp | 111 ++++++++----------- src/tuning/kernels/xaxpy.cpp | 110 ++++++++----------- src/tuning/kernels/xdot.cpp | 105 ++++++++---------- src/tuning/kernels/xgemm.cpp | 200 ++++++++++++++++------------------ src/tuning/kernels/xgemm_direct.cpp | 184 +++++++++++++++---------------- src/tuning/kernels/xgemv.cpp | 131 ++++++++++------------ src/tuning/kernels/xger.cpp | 112 ++++++++----------- 10 files changed, 571 insertions(+), 721 deletions(-) (limited to 'src/tuning/kernels') diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index c9bf478c..068c5f1b 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -25,70 +25,64 @@ template class TuneCopy { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "copy"; } - static std::string KernelName() { return "CopyMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_fast.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "copy"; + settings.kernel_name = "CopyMatrixFast"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/copy_fast.opencl" + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"COPY_DIMX", "COPY_DIMY"}}; + settings.div_global = {{"COPY_VW", "COPY_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"COPY_DIMX", {8, 16, 32}}, + {"COPY_DIMY", {8, 16, 32}}, + {"COPY_WPT", {1, 2, 4, 8}}, + {"COPY_VW", {1, 2, 4, 8}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "COPY_DIMX", {8, 16, 32}); - tuner.AddParameter(id, "COPY_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "COPY_WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "COPY_VW", {1, 2, 4, 8}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"COPY_DIMX", "COPY_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"COPY_VW", "COPY_WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, @@ -99,17 +93,6 @@ class TuneCopy { tuner.AddArgumentOutput(b_mat); tuner.AddArgumentScalar(GetRealArg(args.alpha)); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 23f52d75..7102d05d 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -25,70 +25,64 @@ template class TunePad { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "pad"; } - static std::string KernelName() { return "CopyPadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/copy_pad.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "pad"; + settings.kernel_name = "CopyPadMatrix"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/copy_pad.opencl" + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"PAD_DIMX", "PAD_DIMY"}}; + settings.div_global = {{"PAD_WPTX", "PAD_WPTY"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"PAD_DIMX", {8, 16, 32}}, + {"PAD_DIMY", {8, 16, 32}}, + {"PAD_WPTX", {1, 2, 4}}, + {"PAD_WPTY", {1, 2, 4}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PAD_DIMX", {8, 16, 32}); - tuner.AddParameter(id, "PAD_DIMY", {8, 16, 32}); - tuner.AddParameter(id, "PAD_WPTX", {1, 2, 4}); - tuner.AddParameter(id, "PAD_WPTY", {1, 2, 4}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PAD_DIMX", "PAD_DIMY"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PAD_WPTX", "PAD_WPTY"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, @@ -107,17 +101,6 @@ class TunePad { tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 308663d8..56726903 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -25,53 +25,60 @@ template class TuneTranspose { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "transpose"; } - static std::string KernelName() { return "TransposeMatrixFast"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_fast.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "transpose"; + settings.kernel_name = "TransposeMatrixFast"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/transpose_fast.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"TRA_DIM", "TRA_DIM"}}; + settings.div_global = {{"TRA_WPT", "TRA_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"TRA_DIM", {4, 8, 16, 32, 64}}, + {"TRA_WPT", {1, 2, 4, 8, 16}}, + {"TRA_PAD", {0, 1}}, + {"TRA_SHUFFLE", {0, 1}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "TRA_DIM", {4, 8, 16, 32, 64}); - tuner.AddParameter(id, "TRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "TRA_PAD", {0, 1}); - tuner.AddParameter(id, "TRA_SHUFFLE", {0, 1}); + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { @@ -81,19 +88,6 @@ class TuneTranspose { tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"TRA_DIM", "TRA_DIM"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"TRA_WPT", "TRA_WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, @@ -104,17 +98,6 @@ class TuneTranspose { tuner.AddArgumentOutput(b_mat); tuner.AddArgumentScalar(GetRealArg(args.alpha)); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 304702de..dc46e903 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -25,52 +25,59 @@ template class TunePadTranspose { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "padtranspose"; } - static std::string KernelName() { return "TransposePadMatrix"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/level3.opencl" - #include "../src/kernels/level3/transpose_pad.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "padtranspose"; + settings.kernel_name = "TransposePadMatrix"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/level3.opencl" +#include "../src/kernels/level3/transpose_pad.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"PADTRA_TILE", "PADTRA_TILE"}}; + settings.div_global = {{"PADTRA_WPT", "PADTRA_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"PADTRA_TILE", {8, 16, 32, 64}}, + {"PADTRA_WPT", {1, 2, 4, 8, 16}}, + {"PADTRA_PAD", {0, 1}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "PADTRA_TILE", {8, 16, 32, 64}); - tuner.AddParameter(id, "PADTRA_WPT", {1, 2, 4, 8, 16}); - tuner.AddParameter(id, "PADTRA_PAD", {0, 1}); + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { @@ -80,19 +87,6 @@ class TunePadTranspose { tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"PADTRA_TILE", "PADTRA_TILE"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"PADTRA_WPT", "PADTRA_WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, @@ -111,17 +105,6 @@ class TunePadTranspose { tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index f8e1d93e..e201949a 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -25,19 +25,54 @@ template class TuneXaxpy { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "xaxpy"; } - static std::string KernelName() { return "XaxpyFastest"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/level1.opencl" - #include "../src/kernels/level1/xaxpy.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgN, kArgAlpha}; + settings.default_n = 4096*1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "xaxpy"; + settings.kernel_name = "XaxpyFastest"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level1/level1.opencl" +#include "../src/kernels/level1/xaxpy.opencl" + ; + + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.n; + + // Sets the base thread configuration + settings.global_size = {args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1}; + settings.local_size_ref = {64}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS"}}; + settings.div_global = {{"WPT"},{"VW"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS", {64, 128, 256, 512, 1024, 2048}}, + {"WPT", {1, 2, 4, 8}}, + {"VW", {1, 2, 4, 8}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 3 * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments &args) { @@ -46,52 +81,10 @@ class TuneXaxpy { } } - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 4096*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;} // N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 2048}); - tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); - tuner.AddParameter(id, "VW", {1, 2, 4, 8}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"},{"VW"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &x_vec, std::vector &y_vec, @@ -102,17 +95,6 @@ class TuneXaxpy { tuner.AddArgumentInput(x_vec); tuner.AddArgumentOutput(y_vec); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 3 * args.n * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index c3b5361e..fb532680 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -26,66 +26,60 @@ template class TuneXdot { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "xdot_"+std::to_string(V); } - static std::string KernelName() { return (V==1) ? "Xdot" : "XdotEpilogue"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level1/xdot.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgN}; + settings.default_n = 2*1024*1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } + // Identification of the kernel + settings.kernel_family = "xdot_"+std::to_string(V); + settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level1/xdot.opencl" + ; + + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.n; + settings.size_temp = args.n; // Worst case + + // Sets the base thread configuration + settings.global_size = (V==1) ? std::vector{2*64} : std::vector{1}; + settings.global_size_ref = (V==1) ? std::vector{2*64*64} : std::vector{64}; + settings.local_size = {1}; + settings.local_size_ref = {64}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; + settings.mul_global = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); + settings.performance_unit = (V==1) ? "GB/s" : "N/A"; - // Sets the default values for the arguments - static size_t DefaultM() { return 1; } // N/A for this kernel - static size_t DefaultN() { return 2*1024*1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &args) { return args.n; } // Worst case - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}); + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &) { return (V==1) ? std::vector{2*64} : std::vector{1}; } - static std::vector GlobalSizeRef(const Arguments &) { return (V==1) ? std::vector{2*64*64} : std::vector{64}; } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; } - static TransformVector DivGlobal() { return {}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &x_vec, std::vector &y_vec, @@ -108,17 +102,6 @@ class TuneXdot { tuner.AddArgumentScalar(0); } } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return (V==1) ? "GB/s" : "N/A"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index fa6b3085..7d0f3ed4 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -27,88 +27,111 @@ template class TuneXgemm { public: - // The representative kernel and the source code - static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; } - static std::string KernelName() { return "Xgemm"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_part1.opencl" - #include "../src/kernels/level3/xgemm_part2.opencl" - #include "../src/kernels/level3/xgemm_part3.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, + kArgHeuristicSelection, kArgPsoSwarmSize, + kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; + settings.default_m = 1024; + settings.default_n = 1024; + settings.default_k = 1024; + settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly + settings.default_num_runs = 2; + settings.default_heuristic = static_cast(cltune::SearchMethod::RandomSearch); + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, - kArgHeuristicSelection, kArgPsoSwarmSize, - kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; - } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2"; + settings.kernel_name = "Xgemm"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/xgemm_part1.opencl" +#include "../src/kernels/level3/xgemm_part2.opencl" +#include "../src/kernels/level3/xgemm_part3.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } + // Buffer sizes + settings.size_a = args.m * args.k; + settings.size_b = args.n * args.k; + settings.size_c = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1024; } - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly - static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } - static double DefaultInfluenceGlobalPSO(){ return 0.1; } - static double DefaultInfluenceLocalPSO(){ return 0.3; } - static double DefaultInfluenceRandomPSO(){ return 0.6; } - static size_t DefaultHeuristic(){ return static_cast(cltune::SearchMethod::RandomSearch); } - static double DefaultMaxTempAnn(){ return 1.0;} - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } - static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } - static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"MDIMC", "NDIMC"}}; + settings.mul_global = {{"MDIMC", "NDIMC"}}; + settings.div_global = {{"MWG", "NWG"}}; + + // Sets the tuning parameters and their possible values if (V==1) { // limited subset of tuning parameters - but explorable exhaustively - tuner.AddParameter(id, "MWG", {16, 32, 64}); - tuner.AddParameter(id, "NWG", {16, 32, 64}); - tuner.AddParameter(id, "KWG", {32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2}); - tuner.AddParameter(id, "VWM", {1, 2, 4}); - tuner.AddParameter(id, "VWN", {1, 2, 4}); - tuner.AddParameter(id, "STRM", {0}); - tuner.AddParameter(id, "STRN", {0}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); - } // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"MWG", {16, 32, 64}}, + {"NWG", {16, 32, 64}}, + {"KWG", {32}}, + {"MDIMC", {8, 16, 32}}, + {"NDIMC", {8, 16, 32}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {1, 2, 4}}, + {"VWN", {1, 2, 4}}, + {"STRM", {0}}, + {"STRN", {0}}, + {"SA", {0, 1}}, + {"SB", {0, 1}}, + }; + } + else { // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"MWG", {16, 32, 64, 128}}, + {"NWG", {16, 32, 64, 128}}, + {"KWG", {16, 32}}, + {"MDIMC", {8, 16, 32}}, + {"NDIMC", {8, 16, 32}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {1, 2, 4, 8}}, + {"VWN", {1, 2, 4, 8}}, + {"STRM", {0, 1}}, + {"STRN", {0, 1}}, + {"SA", {0, 1}}, + {"SB", {0, 1}}, + }; + } + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * args.k; + settings.performance_unit = "GFLOPS"; + + // Returns which search heuristic to use + if (V==1) { settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); } else { - //RANDOM_SEARCH & PSO - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2}); - tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); + // Use full-search to explore all parameter combinations or another strategy to search only a + // part of the parameter values. The fraction is set as a command-line argument. + if (args.fraction == 1.0 || args.fraction == 0.0) { + settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); + } else { + settings.heuristic = args.heuristic_selection; + } } + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + // Sets the constraints static void SetConstraints(cltune::Tuner &tuner, const size_t id) { auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; @@ -144,19 +167,6 @@ class TuneXgemm { "SB", "KWG", "NWG"}); } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"MDIMC", "NDIMC"}}; } - static TransformVector DivGlobal() { return {{"MWG", "NWG"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, @@ -171,26 +181,6 @@ class TuneXgemm { tuner.AddArgumentInput(b_mat); tuner.AddArgumentOutput(c_mat); } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * args.k; - } - static std::string PerformanceUnit() { return "GFLOPS"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - if (V==1) { return static_cast(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - return static_cast(cltune::SearchMethod::FullSearch); - } else { - return args.heuristic_selection; - } - } - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 03b40a50..619fb37a 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -27,78 +27,103 @@ template class TuneXgemmDirect { public: - // The representative kernel and the source code - static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; } - static std::string KernelName() { return "XgemmDirectTN"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level3/xgemm_direct_part1.opencl" - #include "../src/kernels/level3/xgemm_direct_part2.opencl" - #include "../src/kernels/level3/xgemm_direct_part3.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, + kArgHeuristicSelection, kArgPsoSwarmSize, + kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; + settings.default_m = 256; + settings.default_n = 256; + settings.default_k = 256; + settings.default_fraction = (V==1) ? 1.0 : 32.0; // test all or sample randomly + settings.default_num_runs = 4; + settings.default_heuristic = static_cast(cltune::SearchMethod::RandomSearch); + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { - return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, - kArgHeuristicSelection, kArgPsoSwarmSize, - kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; - } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; + settings.kernel_name = "XgemmDirectTN"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level3/xgemm_direct_part1.opencl" +#include "../src/kernels/level3/xgemm_direct_part2.opencl" +#include "../src/kernels/level3/xgemm_direct_part3.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } + // Buffer sizes + settings.size_a = args.m * args.k; + settings.size_b = args.n * args.k; + settings.size_c = args.m * args.n; - // Sets the default values for the arguments - static size_t DefaultM() { return 256; } - static size_t DefaultN() { return 256; } - static size_t DefaultK() { return 256; } - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return (V==1) ? 1.0 : 32.0; } // test all or sample randomly - static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } - static double DefaultInfluenceGlobalPSO(){ return 0.1; } - static double DefaultInfluenceLocalPSO(){ return 0.3; } - static double DefaultInfluenceRandomPSO(){ return 0.6; } - static size_t DefaultHeuristic(){ return static_cast(cltune::SearchMethod::RandomSearch);} - static double DefaultMaxTempAnn(){ return 1.0;} - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeY(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeA(const Arguments &args) { return args.m * args.k; } - static size_t GetSizeB(const Arguments &args) { return args.n * args.k; } - static size_t GetSizeC(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"MDIMCD", "NDIMCD"}}; + settings.mul_global = {{"MDIMCD", "NDIMCD"}}; + settings.div_global = {{"WGD", "WGD"}}; + + // Sets the tuning parameters and their possible values if (V==1) { // limited subset of tuning parameters - but explorable exhaustively - tuner.AddParameter(id, "WGD", {8, 16, 32}); - tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); - tuner.AddParameter(id, "KWID", {2}); - tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); - tuner.AddParameter(id, "PADA", {1}); - tuner.AddParameter(id, "PADB", {1}); - } // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"WGD", {8, 16, 32}}, + {"MDIMCD", {8, 16, 32}}, + {"NDIMCD", {8, 16, 32}}, + {"MDIMAD", {8, 16, 32}}, + {"NDIMBD", {8, 16, 32}}, + {"KWID", {2}}, + {"VWMD", {1, 2, 4, 8}}, + {"VWND", {1, 2, 4, 8}}, + {"PADA", {1}}, + {"PADB", {1}}, + }; + } + else { // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"WGD", {8, 16, 32, 64, 128}}, + {"MDIMCD", {8, 16, 32}}, + {"NDIMCD", {8, 16, 32}}, + {"MDIMAD", {8, 16, 32}}, + {"NDIMBD", {8, 16, 32}}, + {"KWID", {2, 8, 16}}, + {"VWMD", {1, 2, 4, 8}}, + {"VWND", {1, 2, 4, 8}}, + {"PADA", {0, 1}}, + {"PADB", {0, 1}}, + }; + } + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * args.k; + settings.performance_unit = "GFLOPS"; + + // Returns which search heuristic to use + if (V==1) { settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); } else { - tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128}); - tuner.AddParameter(id, "MDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMCD", {8, 16, 32}); - tuner.AddParameter(id, "MDIMAD", {8, 16, 32}); - tuner.AddParameter(id, "NDIMBD", {8, 16, 32}); - tuner.AddParameter(id, "KWID", {2, 8, 16}); - tuner.AddParameter(id, "VWMD", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWND", {1, 2, 4, 8}); - tuner.AddParameter(id, "PADA", {0, 1}); - tuner.AddParameter(id, "PADB", {0, 1}); + // Use full-search to explore all parameter combinations or another strategy to search only a + // part of the parameter values. The fraction is set as a command-line argument. + if (args.fraction == 1.0 || args.fraction == 0.0) { + settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); + } else { + settings.heuristic = args.heuristic_selection; + } } + + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + // Sets the constraints static void SetConstraints(cltune::Tuner &tuner, const size_t id) { auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; @@ -132,19 +157,6 @@ class TuneXgemmDirect { tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"}); } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; } - static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &, std::vector &, @@ -168,26 +180,6 @@ class TuneXgemmDirect { tuner.AddArgumentScalar(0); // a_conjugate tuner.AddArgumentScalar(0); // b_conjugate } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return 2 * args.m * args.n * args.k; - } - static std::string PerformanceUnit() { return "GFLOPS"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - if (V==1) { return static_cast(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - return static_cast(cltune::SearchMethod::FullSearch); - } else { - return args.heuristic_selection; - } - } - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 00115b6c..e66b15f1 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -28,63 +28,77 @@ template class TuneXgemv { public: - // The representative kernel and the source code - static std::string KernelFamily() { return (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); } - static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/xgemv.opencl" - #include "../src/kernels/level2/xgemv_fast.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha, kArgBeta}; + settings.default_m = 2048; + settings.default_n = 2048; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgM, kArgN, kArgAlpha, kArgBeta}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); + settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level2/xgemv.opencl" +#include "../src/kernels/level2/xgemv_fast.opencl" + ; - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.m; + settings.size_a = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1}; + settings.local_size_ref = {64}; - // Sets the default values for the arguments - static size_t DefaultM() { return 2048; } - static size_t DefaultN() { return 2048; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; }// N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.n; } - static size_t GetSizeY(const Arguments &args) { return args.m; } - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS"+std::to_string(V)}}; + settings.div_global = (V==1 || V==2) ? + TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} : + TunerSettings::TransformVector{}; + + // Sets the tuning parameters and their possible values if (V==1) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + settings.parameters = { + {"WGS"+std::to_string(V), {32, 64, 128, 256}}, + {"WPT"+std::to_string(V), {1, 2, 4}}, + }; } if (V==2) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); - tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + settings.parameters = { + {"WGS"+std::to_string(V), {16, 32, 64, 128, 256}}, + {"WPT"+std::to_string(V), {1, 2, 4}}, + {"VW"+std::to_string(V), {1, 2, 4, 8}}, + }; } if (V==3) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); - tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); + settings.parameters = { + {"WGS"+std::to_string(V), {16, 32, 64, 128}}, + {"WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}}, + {"VW"+std::to_string(V), {1, 2, 4, 8}}, + }; } + + // Describes how to compute the performance metrics + settings.metric_amount = (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; } + // Tests for valid arguments + static void TestValidArguments(const Arguments &) { } + // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &tuner, const size_t id) { if (V==2 || V==3) { @@ -107,22 +121,6 @@ class TuneXgemv { } } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1}; } - static std::vector LocalSizeRef() { return {64}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { - if (V==1 || V==2) return {{"WPT"+std::to_string(V)}}; - return {}; - } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &x_vec, std::vector &y_vec, @@ -148,17 +146,6 @@ class TuneXgemv { tuner.AddArgumentScalar(0); // Banded 'kl' tuner.AddArgumentScalar(0); // Banded 'ku' } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index 14a98761..c2eb1d31 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -25,69 +25,64 @@ template class TuneXger { public: - // The representative kernel and the source code - static std::string KernelFamily() { return "xger"; } - static std::string KernelName() { return "Xger"; } - static std::string GetSources() { - return - #include "../src/kernels/common.opencl" - #include "../src/kernels/level2/level2.opencl" - #include "../src/kernels/level2/xger.opencl" - ; + // Settings for this kernel (default command-line arguments) + static TunerDefaults GetTunerDefaults() { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; } - // The list of arguments relevant for this routine - static std::vector GetOptions() { return {kArgN, kArgM, kArgAlpha}; } + // Settings for this kernel (general) + static TunerSettings GetTunerSettings(const Arguments &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "xger"; + settings.kernel_name = "Xger"; + settings.sources = +#include "../src/kernels/common.opencl" +#include "../src/kernels/level2/level2.opencl" +#include "../src/kernels/level2/xger.opencl" + ; + + // Buffer sizes + settings.size_x = args.m; + settings.size_y = args.n; + settings.size_a = args.m * args.n; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS1", "WGS2"}}; + settings.div_global = {{"WPT", "WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS1", {4, 8, 16, 32, 64, 128, 256, 512}}, + {"WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}}, + {"WPT", {1, 2, 4}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; + } // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - // Sets the default values for the arguments - static size_t DefaultM() { return 1024; } - static size_t DefaultN() { return 1024; } - static size_t DefaultK() { return 1; } // N/A for this kernel - static size_t DefaultBatchCount() { return 1; } // N/A for this kernel - static double DefaultFraction() { return 1.0; } // N/A for this kernel - static size_t DefaultNumRuns() { return 10; } // run every kernel this many times for averaging - static size_t DefaultSwarmSizePSO() { return 8; } // N/A for this kernel - static double DefaultInfluenceGlobalPSO(){ return 0.1; }// N/A for this kernel - static double DefaultInfluenceLocalPSO(){ return 0.3; } // N/A for this kernel - static double DefaultInfluenceRandomPSO(){ return 0.6; }// N/A for this kernel - static size_t DefaultHeuristic(){ return static_cast (cltune::SearchMethod::FullSearch);} - static double DefaultMaxTempAnn(){ return 1.0;}// N/A for this kernel - - // Describes how to obtain the sizes of the buffers - static size_t GetSizeX(const Arguments &args) { return args.m; } - static size_t GetSizeY(const Arguments &args) { return args.n; } - static size_t GetSizeA(const Arguments &args) { return args.m * args.n; } - static size_t GetSizeB(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeC(const Arguments &) { return 1; } // N/A for this kernel - static size_t GetSizeTemp(const Arguments &) { return 1; } // N/A for this kernel - - // Sets the tuning parameters and their possible values - static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS1", {4, 8, 16, 32, 64, 128, 256, 512}); - tuner.AddParameter(id, "WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}); - tuner.AddParameter(id, "WPT", {1, 2, 4}); - } - // Sets the constraints and local memory size static void SetConstraints(cltune::Tuner &, const size_t) { } static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } - // Sets the base thread configuration - static std::vector GlobalSize(const Arguments &args) { return {args.m, args.n}; } - static std::vector GlobalSizeRef(const Arguments &args) { return GlobalSize(args); } - static std::vector LocalSize() { return {1, 1}; } - static std::vector LocalSizeRef() { return {8, 8}; } - - // Transforms the thread configuration based on the parameters - using TransformVector = std::vector>; - static TransformVector MulLocal() { return {{"WGS1", "WGS2"}}; } - static TransformVector DivLocal() { return {}; } - static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT", "WPT"}}; } - // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, std::vector &x_vec, std::vector &y_vec, @@ -107,17 +102,6 @@ class TuneXger { tuner.AddArgumentScalar(static_cast(args.m)); // a_ld tuner.AddArgumentScalar(0); // a_is_rowmajor } - - // Describes how to compute the performance metrics - static size_t GetMetric(const Arguments &args) { - return (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); - } - static std::string PerformanceUnit() { return "GB/s"; } - - // Returns which Heuristic to run - static size_t GetHeuristic(const Arguments &args){ - return static_cast (cltune::SearchMethod::FullSearch); - } }; // ================================================================================================= -- cgit v1.2.3