From 249bdaa8e9a111573f5c3a821230bba6437817c7 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 18 Dec 2017 21:34:07 +0100 Subject: Reformatted tuning code to make compilation faster --- src/tuning/kernels/copy_fast.cpp | 140 ++++++++-------- src/tuning/kernels/copy_pad.cpp | 156 +++++++++--------- src/tuning/kernels/transpose_fast.cpp | 140 ++++++++-------- src/tuning/kernels/transpose_pad.cpp | 154 +++++++++--------- src/tuning/kernels/xaxpy.cpp | 142 ++++++++--------- src/tuning/kernels/xdot.cpp | 142 ++++++++--------- src/tuning/kernels/xgemm.cpp | 270 ++++++++++++++++--------------- src/tuning/kernels/xgemm_direct.cpp | 266 +++++++++++++++---------------- src/tuning/kernels/xgemv.cpp | 216 +++++++++++++------------ src/tuning/kernels/xger.cpp | 158 +++++++++--------- src/tuning/tuning.cpp | 288 +++++++++++++++++++++++++++++++++ src/tuning/tuning.hpp | 290 ++-------------------------------- 12 files changed, 1177 insertions(+), 1185 deletions(-) diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 462107d3..d046c9e9 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -20,78 +20,74 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneCopy { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgM, kArgN, kArgAlpha}; - settings.default_m = 1024; - settings.default_n = 1024; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; +} - // Settings for this kernel (general) - static 
TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = "copy"; - settings.kernel_name = "CopyMatrixFast"; - settings.sources = + // Identification of the kernel + settings.kernel_family = "copy"; + settings.kernel_name = "CopyMatrixFast"; + settings.sources = #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_fast.opencl" - ; - - // Buffer sizes - settings.size_a = args.m * args.n; - settings.size_b = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {2, 3}; - settings.outputs = {3}; - - // Sets the base thread configuration - settings.global_size = {args.m, args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1, 1}; - settings.local_size_ref = {8, 8}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"COPY_DIMX", "COPY_DIMY"}}; - settings.div_global = {{"COPY_VW", "COPY_WPT"}}; - - // Sets the tuning parameters and their possible values - settings.parameters = { - {"COPY_DIMX", {8, 16, 32}}, - {"COPY_DIMY", {8, 16, 32}}, - {"COPY_WPT", {1, 2, 4, 8}}, - {"COPY_VW", {1, 2, 4, 8}}, - }; - - // Describes how to compute the performance metrics - settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); - settings.performance_unit = "GB/s"; - - return settings; - } + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + 
settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"COPY_DIMX", "COPY_DIMY"}}; + settings.div_global = {{"COPY_VW", "COPY_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"COPY_DIMX", {8, 16, 32}}, + {"COPY_DIMY", {8, 16, 32}}, + {"COPY_WPT", {1, 2, 4, 8}}, + {"COPY_VW", {1, 2, 4, 8}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; +} - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, buffers[2]()); // 2 == A matrix - kernel.SetArgument(2, buffers[3]()); // 3 == B matrix - kernel.SetArgument(3, GetRealArg(args.alpha)); - } -}; +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int) { return {}; } + +// Sets the kernel's arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, buffers[2]()); // 2 == A matrix + kernel.SetArgument(2, buffers[3]()); // 3 == B matrix + kernel.SetArgument(3, GetRealArg(args.alpha)); +} // ================================================================================================= } // namespace clblast @@ -105,11 +101,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, 
half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } return 0; } diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 24557517..1b483e86 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -20,86 +20,82 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TunePad { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - 
settings.options = {kArgM, kArgN, kArgAlpha}; - settings.default_m = 1024; - settings.default_n = 1024; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = "pad"; - settings.kernel_name = "CopyPadMatrix"; - settings.sources = + // Identification of the kernel + settings.kernel_family = "pad"; + settings.kernel_name = "CopyPadMatrix"; + settings.sources = #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_pad.opencl" - ; - - // Buffer sizes - settings.size_a = args.m * args.n; - settings.size_b = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {2, 3}; - settings.outputs = {3}; - - // Sets the base thread configuration - settings.global_size = {args.m, args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1, 1}; - settings.local_size_ref = {8, 8}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"PAD_DIMX", "PAD_DIMY"}}; - settings.div_global = {{"PAD_WPTX", "PAD_WPTY"}}; - - // Sets the tuning parameters and their possible values - settings.parameters = { - {"PAD_DIMX", {8, 16, 32}}, - {"PAD_DIMY", {8, 16, 32}}, - {"PAD_WPTX", {1, 2, 4}}, - {"PAD_WPTY", {1, 2, 4}}, - }; - - // Describes how to compute the performance metrics - settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); - settings.performance_unit 
= "GB/s"; - - return settings; - } + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"PAD_DIMX", "PAD_DIMY"}}; + settings.div_global = {{"PAD_WPTX", "PAD_WPTY"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"PAD_DIMX", {8, 16, 32}}, + {"PAD_DIMY", {8, 16, 32}}, + {"PAD_WPTX", {1, 2, 4}}, + {"PAD_WPTY", {1, 2, 4}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; +} - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, static_cast(args.n)); - kernel.SetArgument(2, static_cast(args.m)); - kernel.SetArgument(3, 0); - kernel.SetArgument(4, buffers[2]()); // 2 == A matrix - kernel.SetArgument(5, static_cast(args.m)); - kernel.SetArgument(6, static_cast(args.n)); - kernel.SetArgument(7, static_cast(args.m)); - kernel.SetArgument(8, 0); - kernel.SetArgument(9, buffers[3]()); // 3 == B matrix - kernel.SetArgument(10, GetRealArg(args.alpha)); - kernel.SetArgument(11, 0); - } -}; +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int) { return {}; } + +// Sets the kernel's 
arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.m)); + kernel.SetArgument(3, 0); + kernel.SetArgument(4, buffers[2]()); // 2 == A matrix + kernel.SetArgument(5, static_cast(args.m)); + kernel.SetArgument(6, static_cast(args.n)); + kernel.SetArgument(7, static_cast(args.m)); + kernel.SetArgument(8, 0); + kernel.SetArgument(9, buffers[3]()); // 3 == B matrix + kernel.SetArgument(10, GetRealArg(args.alpha)); + kernel.SetArgument(11, 0); +} // ================================================================================================= } // namespace clblast @@ -113,11 +109,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, 
clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } return 0; } diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 1e0d3c7b..5b701a5b 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -20,78 +20,74 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneTranspose { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgM, kArgN, kArgAlpha}; - settings.default_m = 1024; - settings.default_n = 1024; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = "transpose"; - settings.kernel_name = "TransposeMatrixFast"; - settings.sources = + // Identification of the kernel + settings.kernel_family = "transpose"; + 
settings.kernel_name = "TransposeMatrixFast"; + settings.sources = #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_fast.opencl" - ; - - // Buffer sizes - settings.size_a = args.m * args.n; - settings.size_b = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {2, 3}; - settings.outputs = {3}; - - // Sets the base thread configuration - settings.global_size = {args.m, args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1, 1}; - settings.local_size_ref = {8, 8}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"TRA_DIM", "TRA_DIM"}}; - settings.div_global = {{"TRA_WPT", "TRA_WPT"}}; - - // Sets the tuning parameters and their possible values - settings.parameters = { - {"TRA_DIM", {4, 8, 16, 32, 64}}, - {"TRA_WPT", {1, 2, 4, 8, 16}}, - {"TRA_PAD", {0, 1}}, - {"TRA_SHUFFLE", {0, 1}}, - }; - - // Describes how to compute the performance metrics - settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); - settings.performance_unit = "GB/s"; - - return settings; - } + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"TRA_DIM", "TRA_DIM"}}; + settings.div_global = {{"TRA_WPT", "TRA_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"TRA_DIM", {4, 8, 16, 32, 64}}, + {"TRA_WPT", {1, 2, 4, 8, 16}}, + {"TRA_PAD", {0, 1}}, + {"TRA_SHUFFLE", {0, 1}}, + }; + + // Describes how to compute the 
performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; +} - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, buffers[2]()); // 2 == A matrix - kernel.SetArgument(2, buffers[3]()); // 3 == B matrix - kernel.SetArgument(3, GetRealArg(args.alpha)); - } -}; +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int) { return {}; } + +// Sets the kernel's arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, buffers[2]()); // 2 == A matrix + kernel.SetArgument(2, buffers[3]()); // 3 == B matrix + kernel.SetArgument(3, GetRealArg(args.alpha)); +} // ================================================================================================= } // namespace clblast @@ -105,11 +101,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, 
clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } return 0; } diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 087f8e67..ed24fb04 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -20,85 +20,81 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TunePadTranspose { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgM, kArgN, kArgAlpha}; - settings.default_m = 1024; - settings.default_n = 1024; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; +} - // Settings for this kernel (general) 
- static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = "padtranspose"; - settings.kernel_name = "TransposePadMatrix"; - settings.sources = + // Identification of the kernel + settings.kernel_family = "padtranspose"; + settings.kernel_name = "TransposePadMatrix"; + settings.sources = #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_pad.opencl" - ; - - // Buffer sizes - settings.size_a = args.m * args.n; - settings.size_b = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {2, 3}; - settings.outputs = {3}; - - // Sets the base thread configuration - settings.global_size = {args.m, args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1, 1}; - settings.local_size_ref = {8, 8}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"PADTRA_TILE", "PADTRA_TILE"}}; - settings.div_global = {{"PADTRA_WPT", "PADTRA_WPT"}}; - - // Sets the tuning parameters and their possible values - settings.parameters = { - {"PADTRA_TILE", {8, 16, 32, 64}}, - {"PADTRA_WPT", {1, 2, 4, 8, 16}}, - {"PADTRA_PAD", {0, 1}}, - }; - - // Describes how to compute the performance metrics - settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); - settings.performance_unit = "GB/s"; - - return settings; - } + ; + + // Buffer sizes + settings.size_a = args.m * args.n; + settings.size_b = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size 
= {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"PADTRA_TILE", "PADTRA_TILE"}}; + settings.div_global = {{"PADTRA_WPT", "PADTRA_WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"PADTRA_TILE", {8, 16, 32, 64}}, + {"PADTRA_WPT", {1, 2, 4, 8, 16}}, + {"PADTRA_PAD", {0, 1}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; +} - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, static_cast(args.n)); - kernel.SetArgument(2, static_cast(args.m)); - kernel.SetArgument(3, 0); - kernel.SetArgument(4, buffers[2]()); // 2 == A matrix - kernel.SetArgument(5, static_cast(args.n)); - kernel.SetArgument(6, static_cast(args.m)); - kernel.SetArgument(7, static_cast(args.n)); - kernel.SetArgument(8, 0); - kernel.SetArgument(9, buffers[3]()); // 3 == B matrix - kernel.SetArgument(10, GetRealArg(args.alpha)); - kernel.SetArgument(11, 0); - } -}; +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int) { return {}; } + +// Sets the kernel's arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.m)); + kernel.SetArgument(3, 0); + kernel.SetArgument(4, buffers[2]()); // 2 == A matrix + kernel.SetArgument(5, static_cast(args.n)); + kernel.SetArgument(6, 
static_cast(args.m)); + kernel.SetArgument(7, static_cast(args.n)); + kernel.SetArgument(8, 0); + kernel.SetArgument(9, buffers[3]()); // 3 == B matrix + kernel.SetArgument(10, GetRealArg(args.alpha)); + kernel.SetArgument(11, 0); +} // ================================================================================================= } // namespace clblast @@ -112,11 +108,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, 
clblast::SetArguments); break; } return 0; } diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index d843ea78..dd44018c 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -20,80 +20,76 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneXaxpy { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgN, kArgAlpha}; - settings.default_n = 4096*1024; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgN, kArgAlpha}; + settings.default_n = 4096*1024; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = "xaxpy"; - settings.kernel_name = "XaxpyFastest"; - settings.sources = + // Identification of the kernel + settings.kernel_family = "xaxpy"; + settings.kernel_name = "XaxpyFastest"; + settings.sources = #include "../src/kernels/level1/level1.opencl" #include "../src/kernels/level1/xaxpy.opencl" - ; - - // Buffer sizes - settings.size_x = args.n; - settings.size_y = args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {0, 1}; - settings.outputs = {1}; - - // Sets the base thread configuration - settings.global_size = {args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1}; - settings.local_size_ref = {64}; - - // Transforms 
the thread configuration based on the parameters - settings.mul_local = {{"WGS"}}; - settings.div_global = {{"WPT"},{"VW"}}; - - // Sets the tuning parameters and their possible values - settings.parameters = { - {"WGS", {64, 128, 256, 512, 1024, 2048}}, - {"WPT", {1, 2, 4, 8}}, - {"VW", {1, 2, 4, 8}}, - }; - - // Describes how to compute the performance metrics - settings.metric_amount = 3 * args.n * GetBytes(args.precision); - settings.performance_unit = "GB/s"; - - return settings; - } + ; + + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1}; + settings.outputs = {1}; + + // Sets the base thread configuration + settings.global_size = {args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1}; + settings.local_size_ref = {64}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS"}}; + settings.div_global = {{"WPT"},{"VW"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS", {64, 128, 256, 512, 1024, 2048}}, + {"WPT", {1, 2, 4, 8}}, + {"VW", {1, 2, 4, 8}}, + }; + + // Describes how to compute the performance metrics + settings.metric_amount = 3 * args.n * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; +} - // Tests for valid arguments - static void TestValidArguments(const Arguments &args) { - if (!IsMultiple(args.n, 64)) { - throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW"); - } - } - static std::vector SetConstraints() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.n)); - kernel.SetArgument(1, GetRealArg(args.alpha)); - kernel.SetArgument(2, buffers[0]()); // 0 == X vector - kernel.SetArgument(3, buffers[1]()); // 1 == Y 
vector +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &args) { + if (!IsMultiple(args.n, 64)) { + throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW"); } -}; +} +std::vector SetConstraints(const int) { return {}; } + +// Sets the kernel's arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.n)); + kernel.SetArgument(1, GetRealArg(args.alpha)); + kernel.SetArgument(2, buffers[0]()); // 0 == X vector + kernel.SetArgument(3, buffers[1]()); // 1 == Y vector +} // ================================================================================================= } // namespace clblast @@ -107,11 +103,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, 
clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } return 0; } diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index 12350657..635d012a 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -21,86 +21,82 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneXdot { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgN}; - settings.default_n = 2*1024*1024; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgN}; + settings.default_n = 2*1024*1024; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int V, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = "xdot_"+std::to_string(V); - settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue"; - settings.sources = + // Identification of the kernel + settings.kernel_family = "xdot_"+std::to_string(V); + settings.kernel_name = (V==1) ? 
"Xdot" : "XdotEpilogue"; + settings.sources = #include "../src/kernels/level1/xdot.opencl" - ; + ; - // Buffer sizes - settings.size_x = args.n; - settings.size_y = args.n; - settings.size_temp = args.n; // Worst case + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.n; + settings.size_temp = args.n; // Worst case - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {0, 1, 5}; - settings.outputs = {}; // no output checking + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 5}; + settings.outputs = {}; // no output checking - // Sets the base thread configuration - settings.global_size = (V==1) ? std::vector{2*64} : std::vector{1}; - settings.global_size_ref = (V==1) ? std::vector{2*64*64} : std::vector{64}; - settings.local_size = {1}; - settings.local_size_ref = {64}; + // Sets the base thread configuration + settings.global_size = (V==1) ? std::vector{2*64} : std::vector{1}; + settings.global_size_ref = (V==1) ? std::vector{2*64*64} : std::vector{64}; + settings.local_size = {1}; + settings.local_size_ref = {64}; - // Transforms the thread configuration based on the parameters - settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; - settings.mul_global = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; + // Transforms the thread configuration based on the parameters + settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; + settings.mul_global = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; - // Sets the tuning parameters and their possible values - settings.parameters = { - {"WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}}, - }; + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS"+std::to_string(V), {32, 64, 128, 256, 512, 1024}}, + }; - // Describes how to compute the performance metrics - settings.metric_amount = (V==1) ? 
(2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); - settings.performance_unit = (V==1) ? "GB/s" : "N/A"; + // Describes how to compute the performance metrics + settings.metric_amount = (V==1) ? (2*args.n + 1) * GetBytes(args.precision) : 1 * GetBytes(args.precision); + settings.performance_unit = (V==1) ? "GB/s" : "N/A"; - return settings; - } + return settings; +} - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - if (V == 1) { - kernel.SetArgument(0, static_cast(args.n)); - kernel.SetArgument(1, buffers[0]()); // 0 == X vector - kernel.SetArgument(2, 0); - kernel.SetArgument(3, 1); - kernel.SetArgument(4, buffers[1]()); // 1 == Y vector - kernel.SetArgument(5, 0); - kernel.SetArgument(6, 1); - kernel.SetArgument(7, buffers[5]()); // 5 == temp; no output checking - size varies - kernel.SetArgument(8, static_cast(false)); - } - else { - kernel.SetArgument(0, buffers[5]()); // 5 == temp - kernel.SetArgument(1, buffers[0]()); // 0 == X vector; no output checking - size varies - kernel.SetArgument(2, 0); - } +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int) { return {}; } + +// Sets the kernel's arguments +template +void SetArguments(const int V, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + if (V == 1) { + kernel.SetArgument(0, static_cast(args.n)); + kernel.SetArgument(1, buffers[0]()); // 0 == X vector + kernel.SetArgument(2, 0); + kernel.SetArgument(3, 1); + kernel.SetArgument(4, buffers[1]()); // 1 == Y vector + kernel.SetArgument(5, 0); + kernel.SetArgument(6, 1); + kernel.SetArgument(7, buffers[5]()); // 5 == temp; no output checking - size varies + kernel.SetArgument(8, static_cast(false)); + } + else { + 
kernel.SetArgument(0, buffers[5]()); // 5 == temp + kernel.SetArgument(1, buffers[0]()); // 0 == X vector; no output checking - size varies + kernel.SetArgument(2, 0); } -}; +} // ================================================================================================= } // namespace clblast @@ -115,11 +111,11 @@ template void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } } diff --git 
a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index d38ce077..b25ba302 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -22,148 +22,144 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneXgemm { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, - kArgHeuristicSelection, kArgPsoSwarmSize, - kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; - settings.default_m = 1024; - settings.default_n = 1024; - settings.default_k = 1024; - settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly - settings.default_num_runs = 2; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int V) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, + kArgHeuristicSelection, kArgPsoSwarmSize, + kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; + settings.default_m = 1024; + settings.default_n = 1024; + settings.default_k = 1024; + settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly + settings.default_num_runs = 2; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int V, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = (V==1) ? 
"xgemm_1" : "xgemm_2"; - settings.kernel_name = "Xgemm"; - settings.sources = + // Identification of the kernel + settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2"; + settings.kernel_name = "Xgemm"; + settings.sources = #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" #include "../src/kernels/level3/xgemm_part3.opencl" #include "../src/kernels/level3/xgemm_part4.opencl" - ; - - // Buffer sizes - settings.size_a = args.m * args.k; - settings.size_b = args.n * args.k; - settings.size_c = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {2, 3, 4}; - settings.outputs = {4}; - - // Sets the base thread configuration - settings.global_size = {args.m, args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1, 1}; - settings.local_size_ref = {8, 8}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"MDIMC", "NDIMC"}}; - settings.mul_global = {{"MDIMC", "NDIMC"}}; - settings.div_global = {{"MWG", "NWG"}}; - - // Sets the tuning parameters and their possible values - if (V==1) { // limited subset of tuning parameters - but explorable exhaustively - settings.parameters = { - {"MWG", {16, 32, 64}}, - {"NWG", {16, 32, 64}}, - {"KWG", {32}}, - {"MDIMC", {8, 16, 32}}, - {"NDIMC", {8, 16, 32}}, - {"MDIMA", {8, 16, 32}}, - {"NDIMB", {8, 16, 32}}, - {"KWI", {2}}, - {"VWM", {1, 2, 4}}, - {"VWN", {1, 2, 4}}, - {"STRM", {0}}, - {"STRN", {0}}, - {"SA", {0, 1}}, - {"SB", {0, 1}}, - }; - } - else { // a lot more tuning parameters - has to be sampled randomly, too much to test all - settings.parameters = { - {"MWG", {16, 32, 64, 128}}, - {"NWG", {16, 32, 64, 128}}, - {"KWG", {16, 32}}, - {"MDIMC", {8, 16, 32}}, - {"NDIMC", {8, 16, 32}}, - {"MDIMA", {8, 16, 32}}, - {"NDIMB", {8, 16, 32}}, - {"KWI", {2}}, - {"VWM", {1, 2, 4, 8}}, - {"VWN", {1, 2, 4, 8}}, - {"STRM", {0, 1}}, - {"STRN", {0, 1}}, - {"SA", {0, 1}}, 
- {"SB", {0, 1}}, - }; - } - - // Describes how to compute the performance metrics - settings.metric_amount = 2 * args.m * args.n * args.k; - settings.performance_unit = "GFLOPS"; - - return settings; + ; + + // Buffer sizes + settings.size_a = args.m * args.k; + settings.size_b = args.n * args.k; + settings.size_c = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3, 4}; + settings.outputs = {4}; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"MDIMC", "NDIMC"}}; + settings.mul_global = {{"MDIMC", "NDIMC"}}; + settings.div_global = {{"MWG", "NWG"}}; + + // Sets the tuning parameters and their possible values + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + settings.parameters = { + {"MWG", {16, 32, 64}}, + {"NWG", {16, 32, 64}}, + {"KWG", {32}}, + {"MDIMC", {8, 16, 32}}, + {"NDIMC", {8, 16, 32}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {1, 2, 4}}, + {"VWN", {1, 2, 4}}, + {"STRM", {0}}, + {"STRN", {0}}, + {"SA", {0, 1}}, + {"SB", {0, 1}}, + }; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { - auto constraints = std::vector(); - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; - auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; - // Requirement for unrolling the KWG loop - constraints.push_back({MultipleOfX, {"KWG", "KWI"}}); - // Required for integer MWI and NWI - constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}}); - 
constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}}); - // Required for integer MWIA and NWIB - constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}}); - constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}}); - // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) - constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}}); - constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}}); - - // Extra constraints for variation 1 to limit the set of options significantly - if (V==1) { - auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; - constraints.push_back({IsEqual, {"MDIMC", "MDIMA"}}); - constraints.push_back({IsEqual, {"NDIMC", "NDIMB"}}); - constraints.push_back({IsEqual, {"SA", "SB"}}); - } - return constraints; + else { // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"MWG", {16, 32, 64, 128}}, + {"NWG", {16, 32, 64, 128}}, + {"KWG", {16, 32}}, + {"MDIMC", {8, 16, 32}}, + {"NDIMC", {8, 16, 32}}, + {"MDIMA", {8, 16, 32}}, + {"NDIMB", {8, 16, 32}}, + {"KWI", {2}}, + {"VWM", {1, 2, 4, 8}}, + {"VWN", {1, 2, 4, 8}}, + {"STRM", {0, 1}}, + {"STRN", {0, 1}}, + {"SA", {0, 1}}, + {"SB", {0, 1}}, + }; } - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, static_cast(args.n)); - kernel.SetArgument(2, static_cast(args.k)); - kernel.SetArgument(3, GetRealArg(args.alpha)); - kernel.SetArgument(4, GetRealArg(args.beta)); - kernel.SetArgument(5, buffers[2]()); // 2 == A matrix - kernel.SetArgument(6, buffers[3]()); // 3 == B matrix - kernel.SetArgument(7, buffers[4]()); // 4 == C matrix - kernel.SetArgument(8, 0); - kernel.SetArgument(9, 0); + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * args.k; + 
settings.performance_unit = "GFLOPS"; + + return settings; +} + +// Tests for valid arguments +template +void TestValidArguments(const int V, const Arguments &) { } +std::vector SetConstraints(const int V) { + auto constraints = std::vector(); + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; + // Requirement for unrolling the KWG loop + constraints.push_back({MultipleOfX, {"KWG", "KWI"}}); + // Required for integer MWI and NWI + constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}}); + constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}}); + // Required for integer MWIA and NWIB + constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}}); + constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}}); + // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
+ constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}}); + constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; + constraints.push_back({IsEqual, {"MDIMC", "MDIMA"}}); + constraints.push_back({IsEqual, {"NDIMC", "NDIMB"}}); + constraints.push_back({IsEqual, {"SA", "SB"}}); } -}; + return constraints; +} + +// Sets the kernel's arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.k)); + kernel.SetArgument(3, GetRealArg(args.alpha)); + kernel.SetArgument(4, GetRealArg(args.beta)); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, buffers[3]()); // 3 == B matrix + kernel.SetArgument(7, buffers[4]()); // 4 == C matrix + kernel.SetArgument(8, 0); + kernel.SetArgument(9, 0); +} // ================================================================================================= } // namespace clblast @@ -178,11 +174,11 @@ template void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, 
clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } } diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 60a983b4..0bd2e94d 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -22,145 +22,141 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneXgemmDirect { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, - kArgHeuristicSelection, kArgPsoSwarmSize, - kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; - settings.default_m = 256; - settings.default_n = 256; - settings.default_k = 256; - settings.default_fraction = (V==1) ? 
1.0 : 64.0; // test all or sample randomly - settings.default_num_runs = 4; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int V) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction, + kArgHeuristicSelection, kArgPsoSwarmSize, + kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom}; + settings.default_m = 256; + settings.default_n = 256; + settings.default_k = 256; + settings.default_fraction = (V==1) ? 1.0 : 64.0; // test all or sample randomly + settings.default_num_runs = 4; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int V, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; - settings.kernel_name = "XgemmDirectTN"; - settings.sources = + // Identification of the kernel + settings.kernel_family = (V==1) ? 
"xgemm_direct_1" : "xgemm_direct_2"; + settings.kernel_name = "XgemmDirectTN"; + settings.sources = #include "../src/kernels/level3/xgemm_direct_part1.opencl" #include "../src/kernels/level3/xgemm_direct_part2.opencl" #include "../src/kernels/level3/xgemm_direct_part3.opencl" - ; - - // Buffer sizes - settings.size_a = args.m * args.k; - settings.size_b = args.n * args.k; - settings.size_c = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {2, 3, 4}; - settings.outputs = {4}; - - // Sets the base thread configuration - settings.global_size = {args.m, args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1, 1}; - settings.local_size_ref = {8, 8}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"MDIMCD", "NDIMCD"}}; - settings.mul_global = {{"MDIMCD", "NDIMCD"}}; - settings.div_global = {{"WGD", "WGD"}}; - - // Sets the tuning parameters and their possible values - if (V==1) { // limited subset of tuning parameters - but explorable exhaustively - settings.parameters = { - {"WGD", {8, 16, 32}}, - {"MDIMCD", {8, 16, 32}}, - {"NDIMCD", {8, 16, 32}}, - {"MDIMAD", {8, 16, 32}}, - {"NDIMBD", {8, 16, 32}}, - {"KWID", {2}}, - {"VWMD", {1, 2, 4, 8}}, - {"VWND", {1, 2, 4, 8}}, - {"PADA", {1}}, - {"PADB", {1}}, - }; - } - else { // a lot more tuning parameters - has to be sampled randomly, too much to test all - settings.parameters = { - {"WGD", {8, 16, 32, 64}}, - {"MDIMCD", {8, 16, 32}}, - {"NDIMCD", {8, 16, 32}}, - {"MDIMAD", {8, 16, 32}}, - {"NDIMBD", {8, 16, 32}}, - {"KWID", {2, 8, 16}}, - {"VWMD", {1, 2, 4, 8}}, - {"VWND", {1, 2, 4, 8}}, - {"PADA", {0, 1}}, - {"PADB", {0, 1}}, - }; - } - - // Describes how to compute the performance metrics - settings.metric_amount = 2 * args.m * args.n * args.k; - settings.performance_unit = "GFLOPS"; - - return settings; + ; + + // Buffer sizes + settings.size_a = args.m * args.k; + settings.size_b = args.n * 
args.k; + settings.size_c = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3, 4}; + settings.outputs = {4}; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"MDIMCD", "NDIMCD"}}; + settings.mul_global = {{"MDIMCD", "NDIMCD"}}; + settings.div_global = {{"WGD", "WGD"}}; + + // Sets the tuning parameters and their possible values + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + settings.parameters = { + {"WGD", {8, 16, 32}}, + {"MDIMCD", {8, 16, 32}}, + {"NDIMCD", {8, 16, 32}}, + {"MDIMAD", {8, 16, 32}}, + {"NDIMBD", {8, 16, 32}}, + {"KWID", {2}}, + {"VWMD", {1, 2, 4, 8}}, + {"VWND", {1, 2, 4, 8}}, + {"PADA", {1}}, + {"PADB", {1}}, + }; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { - auto constraints = std::vector(); - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; - auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; - // Requirement for unrolling the WGD loop - constraints.push_back({MultipleOfX, {"WGD", "KWID"}}); - // Required for integer MWID and NWID - constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}}); - constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}}); - // Required for integer MWIAD and NWIBD - constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}}); - constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}}); - // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) 
- constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}}); - constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}}); - - // Extra constraints for variation 1 to limit the set of options significantly - if (V==1) { - auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; - constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}}); - constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}}); - } - return constraints; + else { // a lot more tuning parameters - has to be sampled randomly, too much to test all + settings.parameters = { + {"WGD", {8, 16, 32, 64}}, + {"MDIMCD", {8, 16, 32}}, + {"NDIMCD", {8, 16, 32}}, + {"MDIMAD", {8, 16, 32}}, + {"NDIMBD", {8, 16, 32}}, + {"KWID", {2, 8, 16}}, + {"VWMD", {1, 2, 4, 8}}, + {"VWND", {1, 2, 4, 8}}, + {"PADA", {0, 1}}, + {"PADB", {0, 1}}, + }; } - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, static_cast(args.n)); - kernel.SetArgument(2, static_cast(args.k)); - kernel.SetArgument(3, GetRealArg(args.alpha)); - kernel.SetArgument(4, GetRealArg(args.beta)); - kernel.SetArgument(5, buffers[2]()); // 2 == A matrix - kernel.SetArgument(6, 0); // a_offset - kernel.SetArgument(7, static_cast(args.k)); // a_ld - kernel.SetArgument(8, buffers[3]()); // 3 == B matrix - kernel.SetArgument(9, 0); // b_offset - kernel.SetArgument(10, static_cast(args.n)); // b_ld - kernel.SetArgument(11, buffers[4]()); // 4 == C matrix - kernel.SetArgument(12, 0); // c_offset - kernel.SetArgument(13, static_cast(args.n)); // c_ld - kernel.SetArgument(14, 1); // c_do_transpose - kernel.SetArgument(15, 0); // a_conjugate - kernel.SetArgument(16, 0); // b_conjugate + // Describes how to compute the performance metrics + settings.metric_amount = 2 * args.m * args.n * args.k; + settings.performance_unit = "GFLOPS"; + + return settings; +} + +// Tests for valid 
arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int V) { + auto constraints = std::vector(); + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; + auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; + // Requirement for unrolling the WGD loop + constraints.push_back({MultipleOfX, {"WGD", "KWID"}}); + // Required for integer MWID and NWID + constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}}); + constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}}); + // Required for integer MWIAD and NWIBD + constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}}); + constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}}); + // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) + constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}}); + constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; + constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}}); + constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}}); } -}; + return constraints; +} + +// Sets the kernel's arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.k)); + kernel.SetArgument(3, GetRealArg(args.alpha)); + kernel.SetArgument(4, GetRealArg(args.beta)); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, 0); // a_offset + kernel.SetArgument(7, static_cast(args.k)); // a_ld + kernel.SetArgument(8, 
buffers[3]()); // 3 == B matrix + kernel.SetArgument(9, 0); // b_offset + kernel.SetArgument(10, static_cast(args.n)); // b_ld + kernel.SetArgument(11, buffers[4]()); // 4 == C matrix + kernel.SetArgument(12, 0); // c_offset + kernel.SetArgument(13, static_cast(args.n)); // c_ld + kernel.SetArgument(14, 1); // c_do_transpose + kernel.SetArgument(15, 0); // a_conjugate + kernel.SetArgument(16, 0); // b_conjugate +} // ================================================================================================= } // namespace clblast @@ -175,11 +171,11 @@ template void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case 
clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } } diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 3eadd32b..c2ee1df3 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -23,119 +23,115 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneXgemv { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgM, kArgN, kArgAlpha, kArgBeta}; - settings.default_m = 2048; - settings.default_n = 2048; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha, kArgBeta}; + settings.default_m = 2048; + settings.default_n = 2048; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int V, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); - settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); - settings.sources = + // Identification of the kernel + settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); + settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? 
"XgemvFast" : "XgemvFastRot"); + settings.sources = #include "../src/kernels/level2/xgemv.opencl" #include "../src/kernels/level2/xgemv_fast.opencl" - ; - - // Buffer sizes - settings.size_x = args.n; - settings.size_y = args.m; - settings.size_a = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {0, 1, 2}; - settings.outputs = {1}; - - // Sets the base thread configuration - settings.global_size = {args.m}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1}; - settings.local_size_ref = {64}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"WGS"+std::to_string(V)}}; - settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{}; - - // Sets the tuning parameters and their possible values - if (V==1) { - settings.parameters = { - {"WGS"+std::to_string(V), {32, 64, 128, 256}}, - {"WPT"+std::to_string(V), {1, 2, 4}}, - }; - } - if (V==2) { - settings.parameters = { - {"WGS"+std::to_string(V), {16, 32, 64, 128, 256}}, - {"WPT"+std::to_string(V), {1, 2, 4}}, - {"VW"+std::to_string(V), {1, 2, 4, 8}}, - }; - } - if (V==3) { - settings.parameters = { - {"WGS"+std::to_string(V), {16, 32, 64, 128}}, - {"WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}}, - {"VW"+std::to_string(V), {1, 2, 4, 8}}, - }; - } - - // Describes how to compute the performance metrics - settings.metric_amount = (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); - settings.performance_unit = "GB/s"; - - return settings; + ; + + // Buffer sizes + settings.size_x = args.n; + settings.size_y = args.m; + settings.size_a = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 2}; + settings.outputs = {1}; + + // Sets the base thread configuration + settings.global_size = {args.m}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1}; + 
settings.local_size_ref = {64}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS"+std::to_string(V)}}; + settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{}; + + // Sets the tuning parameters and their possible values + if (V==1) { + settings.parameters = { + {"WGS"+std::to_string(V), {32, 64, 128, 256}}, + {"WPT"+std::to_string(V), {1, 2, 4}}, + }; } - - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { - auto constraints = std::vector(); - if (V==2 || V==3) { - auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}}); - } - if (V==3) { - auto LargerOrEqual = [] (std::vector v) { return v[0] >= v[1]; }; - constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}}); - } - return constraints; + if (V==2) { + settings.parameters = { + {"WGS"+std::to_string(V), {16, 32, 64, 128, 256}}, + {"WPT"+std::to_string(V), {1, 2, 4}}, + {"VW"+std::to_string(V), {1, 2, 4, 8}}, + }; } + if (V==3) { + settings.parameters = { + {"WGS"+std::to_string(V), {16, 32, 64, 128}}, + {"WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}}, + {"VW"+std::to_string(V), {1, 2, 4, 8}}, + }; + } + + // Describes how to compute the performance metrics + settings.metric_amount = (args.m*args.n + 2*args.m + args.n) * GetBytes(args.precision); + settings.performance_unit = "GB/s"; - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - auto a_rotated = (V==3) ? 
1 : 0; - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, static_cast(args.n)); - kernel.SetArgument(2, GetRealArg(args.alpha)); - kernel.SetArgument(3, GetRealArg(args.beta)); - kernel.SetArgument(4, a_rotated); - kernel.SetArgument(5, buffers[2]()); // 2 == A matrix - kernel.SetArgument(6, 0); - kernel.SetArgument(7, static_cast(args.m)); - kernel.SetArgument(8, buffers[0]()); // 0 == X vector - kernel.SetArgument(9, 0); - kernel.SetArgument(10, 1); - kernel.SetArgument(11, buffers[1]()); // 1 == Y vector - kernel.SetArgument(12, 0); - kernel.SetArgument(13, 1); - kernel.SetArgument(14, 0); // Conjugate transpose - kernel.SetArgument(15, 0); // Additional parameter - kernel.SetArgument(16, 0); // Banded 'kl' - kernel.SetArgument(17, 0); // Banded 'ku' + return settings; +} + +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int V) { + auto constraints = std::vector(); + if (V==2 || V==3) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}}); + } + if (V==3) { + auto LargerOrEqual = [] (std::vector v) { return v[0] >= v[1]; }; + constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}}); } -}; + return constraints; +} + +// Sets the kernel's arguments +template +void SetArguments(const int V, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + auto a_rotated = (V==3) ? 
1 : 0; + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, GetRealArg(args.alpha)); + kernel.SetArgument(3, GetRealArg(args.beta)); + kernel.SetArgument(4, a_rotated); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, 0); + kernel.SetArgument(7, static_cast(args.m)); + kernel.SetArgument(8, buffers[0]()); // 0 == X vector + kernel.SetArgument(9, 0); + kernel.SetArgument(10, 1); + kernel.SetArgument(11, buffers[1]()); // 1 == Y vector + kernel.SetArgument(12, 0); + kernel.SetArgument(13, 1); + kernel.SetArgument(14, 0); // Conjugate transpose + kernel.SetArgument(15, 0); // Additional parameter + kernel.SetArgument(16, 0); // Banded 'kl' + kernel.SetArgument(17, 0); // Banded 'ku' +} // ================================================================================================= } // namespace clblast @@ -150,11 +146,11 @@ template void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, 
argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, V, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } } diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index 745e553f..a88fb5d6 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -20,87 +20,83 @@ namespace clblast { // ================================================================================================= -// See comment at top of file for a description of the class -template -class TuneXger { - public: - - // Settings for this kernel (default command-line arguments) - static TunerDefaults GetTunerDefaults() { - auto settings = TunerDefaults(); - settings.options = {kArgM, kArgN, kArgAlpha}; - settings.default_m = 1024; - settings.default_n = 1024; - return settings; - } +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgM, kArgN, kArgAlpha}; + settings.default_m = 1024; + settings.default_n = 1024; + return settings; +} - // Settings for this kernel (general) - static TunerSettings GetTunerSettings(const Arguments &args) { - auto settings = TunerSettings(); +// Settings for this kernel (general) +template +TunerSettings GetTunerSettings(const int, const Arguments &args) { + auto settings = TunerSettings(); - // Identification of the kernel - settings.kernel_family = "xger"; - settings.kernel_name = "Xger"; - settings.sources = + // Identification of the kernel + 
settings.kernel_family = "xger"; + settings.kernel_name = "Xger"; + settings.sources = #include "../src/kernels/level2/level2.opencl" #include "../src/kernels/level2/xger.opencl" - ; - - // Buffer sizes - settings.size_x = args.m; - settings.size_y = args.n; - settings.size_a = args.m * args.n; - - // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) - settings.inputs = {0, 1, 2}; - settings.outputs = {2}; - - // Sets the base thread configuration - settings.global_size = {args.m, args.n}; - settings.global_size_ref = settings.global_size; - settings.local_size = {1, 1}; - settings.local_size_ref = {8, 8}; - - // Transforms the thread configuration based on the parameters - settings.mul_local = {{"WGS1", "WGS2"}}; - settings.div_global = {{"WPT", "WPT"}}; - - // Sets the tuning parameters and their possible values - settings.parameters = { - {"WGS1", {4, 8, 16, 32, 64, 128, 256, 512}}, - {"WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}}, - {"WPT", {1, 2, 4}}, - }; - - // Describes how to compute the performance metrics - settings.metric_amount = (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); - settings.performance_unit = "GB/s"; - - return settings; - } + ; + + // Buffer sizes + settings.size_x = args.m; + settings.size_y = args.n; + settings.size_a = args.m * args.n; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 2}; + settings.outputs = {2}; + + // Sets the base thread configuration + settings.global_size = {args.m, args.n}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {8, 8}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"WGS1", "WGS2"}}; + settings.div_global = {{"WPT", "WPT"}}; + + // Sets the tuning parameters and their possible values + settings.parameters = { + {"WGS1", {4, 8, 16, 32, 64, 128, 256, 512}}, + {"WGS2", {1, 2, 4, 8, 16, 32, 64, 128, 256}}, + {"WPT", {1, 2, 4}}, + }; + + // 
Describes how to compute the performance metrics + settings.metric_amount = (2*args.m*args.n + args.m + args.n) * GetBytes(args.precision); + settings.performance_unit = "GB/s"; + + return settings; +} - // Tests for valid arguments - static void TestValidArguments(const Arguments &) { } - static std::vector SetConstraints() { return {}; } - - // Sets the kernel's arguments - static void SetArguments(Kernel &kernel, const Arguments &args, - std::vector>& buffers) { - kernel.SetArgument(0, static_cast(args.m)); - kernel.SetArgument(1, static_cast(args.n)); - kernel.SetArgument(2, GetRealArg(args.alpha)); - kernel.SetArgument(3, buffers[0]()); // 0 == X vector - kernel.SetArgument(4, 0); // x_offset - kernel.SetArgument(5, 1); // x_increment - kernel.SetArgument(6, buffers[1]()); // 1 == Y vector - kernel.SetArgument(7, 0); // y_offset - kernel.SetArgument(8, 1); // y_increment - kernel.SetArgument(9, buffers[2]()); // 2 == A matrix - kernel.SetArgument(10, 0); // a_offset - kernel.SetArgument(11, static_cast(args.m)); // a_ld - kernel.SetArgument(12, 0); // a_is_rowmajor - } -}; +// Tests for valid arguments +template +void TestValidArguments(const int, const Arguments &) { } +std::vector SetConstraints(const int) { return {}; } + +// Sets the kernel's arguments +template +void SetArguments(const int, Kernel &kernel, const Arguments &args, std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, GetRealArg(args.alpha)); + kernel.SetArgument(3, buffers[0]()); // 0 == X vector + kernel.SetArgument(4, 0); // x_offset + kernel.SetArgument(5, 1); // x_increment + kernel.SetArgument(6, buffers[1]()); // 1 == Y vector + kernel.SetArgument(7, 0); // y_offset + kernel.SetArgument(8, 1); // y_increment + kernel.SetArgument(9, buffers[2]()); // 2 == A matrix + kernel.SetArgument(10, 0); // a_offset + kernel.SetArgument(11, static_cast(args.m)); // a_ld + kernel.SetArgument(12, 0); // 
a_is_rowmajor +} // ================================================================================================= } // namespace clblast @@ -114,11 +110,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexSingle: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; + case clblast::Precision::kComplexDouble: clblast::Tuner(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings, clblast::TestValidArguments, clblast::SetConstraints, clblast::SetArguments); break; } return 0; } diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp index 935ab257..c8532b36 100644 --- a/src/tuning/tuning.cpp +++ b/src/tuning/tuning.cpp @@ -85,5 +85,293 @@ void 
print_separator(const size_t parameters_size) { printf("-x----------------x--------------x--------x-------------------x\n"); } +// ================================================================================================= + +template +void Tuner(int argc, char* argv[], const int V, + GetTunerDefaultsFunc GetTunerDefaults, + GetTunerSettingsFunc GetTunerSettings, + TestValidArgumentsFunc TestValidArguments, + SetConstraintsFunc SetConstraints, + SetArgumentsFunc SetArguments) { + constexpr auto kSeed = 42; // fixed seed for reproducibility + + // Sets the parameters and platform/device for which to tune (command-line options) + const TunerDefaults defaults = GetTunerDefaults(V); + auto command_line_args = RetrieveCommandLineArguments(argc, argv); + auto help = std::string{"* Options given/available:\n"}; + auto args = Arguments{}; + args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); + args.device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); + args.precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle); + for (auto &o: defaults.options) { + if (o == kArgM) { args.m = GetArgument(command_line_args, help, kArgM, defaults.default_m); } + if (o == kArgN) { args.n = GetArgument(command_line_args, help, kArgN, defaults.default_n); } + if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, defaults.default_k); } + if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar()); } + if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar()); } + if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); } + } + args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); + args.num_runs = 
GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs); + const auto max_l2_norm = GetArgument(command_line_args, help, kArgMaxL2Norm, 1.0e-4); + printf("%s\n", help.c_str()); + const TunerSettings settings = GetTunerSettings(V, args); + + // Tests validity of the given arguments + TestValidArguments(V, args); + + // Initializes OpenCL + const auto platform = Platform(args.platform_id); + const auto device = Device(platform, args.device_id); + const auto context = Context(device); + + // Tests for validity of the precision and retrieves properties + if (!PrecisionSupported(device)) { + printf("* Unsupported precision, skipping this tuning run\n\n"); + return; + } + const auto device_type = GetDeviceType(device); + const auto device_vendor = GetDeviceVendor(device); + const auto device_architecture = GetDeviceArchitecture(device); + const auto device_name = GetDeviceName(device); + + // Creates input buffers with random data + const auto buffer_sizes = std::vector{ + settings.size_x, settings.size_y, + settings.size_a, settings.size_b, settings.size_c, + settings.size_temp + }; + std::mt19937 mt(kSeed); + std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); + auto source_buffers = std::vector>(); + auto reference_buffers = std::vector>(); + auto result_buffers = std::vector>(); + auto device_buffers = std::vector>(); + for (const auto size : buffer_sizes) { + auto host_buffer = std::vector(size); + PopulateVector(host_buffer, mt, dist); + source_buffers.push_back(host_buffer); + reference_buffers.push_back(std::vector(size)); + result_buffers.push_back(std::vector(size)); + device_buffers.push_back(Buffer(context, size)); + } + + // Sets the tunable parameters and their possible values + auto configurations = SetConfigurations(settings.parameters, SetConstraints(V)); + printf("* Found %s%zu configuration(s)%s\n", + kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); + + // Select the search method (full 
search or a random fraction) + if (args.fraction != 0.0 && args.fraction != 1.0) { + const auto new_size = static_cast(configurations.size() / args.fraction); + auto rng = std::default_random_engine{}; + std::shuffle(std::begin(configurations), std::end(configurations), rng); + configurations.resize(new_size); + printf("* Exploring a random subset of %s%zu configuration(s)%s\n", + kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); + } + + // Prints information about the parameters + printf("* Parameters explored: "); + for (const auto& parameter : settings.parameters) { printf("%s ", parameter.first.c_str()); } + printf("\n"); + + // Prints the header of the table + printf("\n"); + printf("| ID | total |"); + for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } + printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); + print_separator(settings.parameters.size()); + + // First runs a reference example to compare against + try { + auto queue = Queue(context, device); + printf("| ref | - |"); + for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } + printf(" - |"); + + + // Sets the input + for (const auto id : settings.inputs) { + device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); + } + + // Compiles the kernel + auto compiler_options = std::vector(); + const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, + device, context, compiler_options, 0); + auto kernel = Kernel(program, settings.kernel_name); + SetArguments(V, kernel, args, device_buffers); + printf(" %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str()); + + // Runs the kernel + const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, + settings.global_size_ref, settings.local_size_ref); + printf(" - |"); + if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); } + + // Saves the result + for (const 
auto id : settings.outputs) { + device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]); + } + printf(" %sreference OK%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); + } + catch (...) { + const auto status_code = DispatchExceptionCatchAll(true); + printf("* Exception caught with status %d while running the reference, aborting\n", + static_cast(status_code)); + return; + } + print_separator(settings.parameters.size()); + + // Starts the tuning process + auto results = std::vector(); + for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) { + try { + auto queue = Queue(context, device); + + auto configuration = configurations[config_id]; + printf("| %4zu | %5zu |", config_id + 1, configurations.size()); + for (const auto& parameter : settings.parameters) { + printf("%5zu", configuration.at(parameter.first)); + } + printf(" |"); + + // Sets the input + for (const auto id : settings.inputs) { + device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); + } + + // Sets the thread configuration + const auto global = SetThreadConfiguration(configuration, settings.global_size, + settings.mul_global, settings.div_global); + const auto local = SetThreadConfiguration(configuration, settings.local_size, + settings.mul_local, settings.div_local); + + // Sets the parameters for this configuration + auto kernel_source = std::string{""}; + for (const auto ¶meter : configuration) { + kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n"; + } + kernel_source += settings.sources; + + // Compiles the kernel + const auto start_time = std::chrono::steady_clock::now(); + auto compiler_options = std::vector(); + const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name, + device, context, compiler_options, 0, true); + auto kernel = Kernel(program, settings.kernel_name); + const auto elapsed_time = std::chrono::steady_clock::now() - start_time; + const auto timing = 
std::chrono::duration(elapsed_time).count(); + printf(" %sOK%s %5.0lf ms |", kPrintSuccess.c_str(), kPrintEnd.c_str(), timing); + + // Runs the kernel + SetArguments(V, kernel, args, device_buffers); + const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local); + + // Kernel run was not successful + if (time_ms == -1.0) { + printf(" - |"); + printf(" %sinvalid config.%s |", kPrintError.c_str(), kPrintEnd.c_str()); + printf(" <-- skipping\n"); + continue; + } + + // Compares the results + auto l2_error = 0.0; + for (const auto id : settings.outputs) { + device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]); + for (auto index = size_t{0}; index(buffer_sizes[id]); + if (std::isnan(l2_error) || l2_error > max_l2_norm) { + printf(" - |"); + printf(" %sL2 error %8.2e%s |", kPrintError.c_str(), l2_error, kPrintEnd.c_str()); + throw std::runtime_error("L2 error too large"); + } + } + + // All was OK + configuration["PRECISION"] = static_cast(args.precision); + results.push_back(TuningResult{settings.kernel_name, time_ms, configuration}); + printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6)); + printf(" %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); + } + catch (CLCudaAPIBuildError) { + const auto status_code = DispatchExceptionCatchAll(true); + printf(" %scompilation error: %5d%s |", + kPrintError.c_str(), static_cast(status_code), kPrintEnd.c_str()); + printf(" - | - | <-- skipping\n"); + } + catch (...) 
{ + const auto status_code = DispatchExceptionCatchAll(true); + if (status_code != StatusCode::kUnknownError) { + printf(" %serror code %d%s |", + kPrintError.c_str(), static_cast(status_code), kPrintEnd.c_str()); + } + printf(" <-- skipping\n"); + } + } + + // Completed the tuning process + print_separator(settings.parameters.size()); + printf("\n"); + if (results.size() == 0) { return; } + + // Computes the best results + auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; + const auto best_configuration = std::min_element(results.begin(), results.end(), comparison); + const auto best_time_ms = best_configuration->score; + if (best_time_ms == 0.0) { return; } + + // Also prints the performance of the best-case in terms of GB/s or GFLOPS + printf("\n"); + printf("* Found best result %.2lf ms", best_time_ms); + printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6), + settings.performance_unit.c_str()); + printf("* Best parameters: "); + auto best_string = std::string{""}; + auto i = size_t{0}; + for (const auto config : best_configuration->config) { + best_string += "" + config.first + "=" + ToString(config.second); + if (i < best_configuration->config.size() - 1) { best_string += " "; } + ++i; + } + printf("%s\n\n", best_string.c_str()); + + // Outputs the results as JSON to disk, including some meta-data + auto precision_string = std::to_string(static_cast(args.precision)); + auto metadata = std::vector>{ + {"kernel_family", settings.kernel_family}, + {"precision", precision_string}, + {"best_kernel", best_configuration->name}, + {"best_time", ToString(best_configuration->score)}, + {"best_parameters", best_string} + }; + for (auto &o: defaults.options) { + if (o == kArgM) { metadata.push_back({"arg_m", ToString(args.m)}); } + if (o == kArgN) { metadata.push_back({"arg_n", ToString(args.n)}); } + if (o == kArgK) { metadata.push_back({"arg_k", ToString(args.k)}); } + if (o == kArgAlpha) { 
metadata.push_back({"arg_alpha", ToString(args.alpha)}); } + if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } + if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); } + } + PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", + device, platform, metadata, results); + + printf("* Completed tuning process\n"); + printf("\n"); +} + +// Compiles the above function +template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, SetArgumentsFunc SetArguments); +template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, SetArgumentsFunc SetArguments); +template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, SetArgumentsFunc SetArguments); +template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, SetArgumentsFunc SetArguments); +template void Tuner(int argc, char* argv[], const int V, GetTunerDefaultsFunc GetTunerDefaults, GetTunerSettingsFunc GetTunerSettings, TestValidArgumentsFunc TestValidArguments, SetConstraintsFunc SetConstraints, SetArgumentsFunc SetArguments); + // ================================================================================================= } // namespace clblast diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp index ac6968dc..22210c7d 100644 --- a/src/tuning/tuning.hpp +++ b/src/tuning/tuning.hpp @@ -22,6 +22,7 @@ 
#include #include #include +#include #include "utilities/utilities.hpp" #include "utilities/compile.hpp" @@ -116,282 +117,25 @@ void print_separator(const size_t parameters_size); // ================================================================================================= +using GetTunerDefaultsFunc = std::function; +template +using GetTunerSettingsFunc = std::function &args)>; +template +using TestValidArgumentsFunc = std::function &args)>; +using SetConstraintsFunc = std::function(const int V)>; +template +using SetArgumentsFunc = std::function &args, std::vector>& buffers)>; + // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect // the results. Used for all types of kernel families. Note that this is a header-only function so // that it is automatically compiled for the various kernels (given as the 'C' template argument). -template -void Tuner(int argc, char* argv[]) { - constexpr auto kSeed = 42; // fixed seed for reproducibility - - // Sets the parameters and platform/device for which to tune (command-line options) - const TunerDefaults defaults = C::GetTunerDefaults(); - auto command_line_args = RetrieveCommandLineArguments(argc, argv); - auto help = std::string{"* Options given/available:\n"}; - auto args = Arguments{}; - args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); - args.device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); - args.precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle); - for (auto &o: defaults.options) { - if (o == kArgM) { args.m = GetArgument(command_line_args, help, kArgM, defaults.default_m); } - if (o == kArgN) { args.n = GetArgument(command_line_args, help, kArgN, defaults.default_n); } - if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, defaults.default_k); } - if (o == 
kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar()); } - if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar()); } - if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, defaults.default_batch_count); } - } - args.fraction = GetArgument(command_line_args, help, kArgFraction, defaults.default_fraction); - args.num_runs = GetArgument(command_line_args, help, kArgNumRuns, defaults.default_num_runs); - const auto max_l2_norm = GetArgument(command_line_args, help, kArgMaxL2Norm, 1.0e-4); - printf("%s\n", help.c_str()); - const TunerSettings settings = C::GetTunerSettings(args); - - // Tests validity of the given arguments - C::TestValidArguments(args); - - // Initializes OpenCL - const auto platform = Platform(args.platform_id); - const auto device = Device(platform, args.device_id); - const auto context = Context(device); - - // Tests for validity of the precision and retrieves properties - if (!PrecisionSupported(device)) { - printf("* Unsupported precision, skipping this tuning run\n\n"); - return; - } - const auto device_type = GetDeviceType(device); - const auto device_vendor = GetDeviceVendor(device); - const auto device_architecture = GetDeviceArchitecture(device); - const auto device_name = GetDeviceName(device); - - // Creates input buffers with random data - const auto buffer_sizes = std::vector{ - settings.size_x, settings.size_y, - settings.size_a, settings.size_b, settings.size_c, - settings.size_temp - }; - std::mt19937 mt(kSeed); - std::uniform_real_distribution dist(kTestDataLowerLimit, kTestDataUpperLimit); - auto source_buffers = std::vector>(); - auto reference_buffers = std::vector>(); - auto result_buffers = std::vector>(); - auto device_buffers = std::vector>(); - for (const auto size : buffer_sizes) { - auto host_buffer = std::vector(size); - PopulateVector(host_buffer, mt, dist); - source_buffers.push_back(host_buffer); - 
reference_buffers.push_back(std::vector(size)); - result_buffers.push_back(std::vector(size)); - device_buffers.push_back(Buffer(context, size)); - } - - // Sets the tunable parameters and their possible values - auto configurations = SetConfigurations(settings.parameters, C::SetConstraints()); - printf("* Found %s%zu configuration(s)%s\n", - kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); - - // Select the search method (full search or a random fraction) - if (args.fraction != 0.0 && args.fraction != 1.0) { - const auto new_size = static_cast(configurations.size() / args.fraction); - auto rng = std::default_random_engine{}; - std::shuffle(std::begin(configurations), std::end(configurations), rng); - configurations.resize(new_size); - printf("* Exploring a random subset of %s%zu configuration(s)%s\n", - kPrintMessage.c_str(), configurations.size(), kPrintEnd.c_str()); - } - - // Prints information about the parameters - printf("* Parameters explored: "); - for (const auto& parameter : settings.parameters) { printf("%s ", parameter.first.c_str()); } - printf("\n"); - - // Prints the header of the table - printf("\n"); - printf("| ID | total |"); - for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } - printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str()); - print_separator(settings.parameters.size()); - - // First runs a reference example to compare against - try { - auto queue = Queue(context, device); - printf("| ref | - |"); - for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); } - printf(" - |"); - - - // Sets the input - for (const auto id : settings.inputs) { - device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); - } - - // Compiles the kernel - auto compiler_options = std::vector(); - const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name, - device, context, compiler_options, 0); - auto kernel 
= Kernel(program, settings.kernel_name); - C::SetArguments(kernel, args, device_buffers); - printf(" %sOK%s |", kPrintSuccess.c_str(), kPrintEnd.c_str()); - - // Runs the kernel - const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, - settings.global_size_ref, settings.local_size_ref); - printf(" - |"); - if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); } - - // Saves the result - for (const auto id : settings.outputs) { - device_buffers[id].Read(queue, buffer_sizes[id], reference_buffers[id]); - } - printf(" %sreference OK%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); - } - catch (...) { - const auto status_code = DispatchExceptionCatchAll(true); - printf("* Exception caught with status %d while running the reference, aborting\n", - static_cast(status_code)); - return; - } - print_separator(settings.parameters.size()); - - // Starts the tuning process - auto results = std::vector(); - for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) { - try { - auto queue = Queue(context, device); - - auto configuration = configurations[config_id]; - printf("| %4zu | %5zu |", config_id + 1, configurations.size()); - for (const auto& parameter : settings.parameters) { - printf("%5zu", configuration.at(parameter.first)); - } - printf(" |"); - - // Sets the input - for (const auto id : settings.inputs) { - device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]); - } - - // Sets the thread configuration - const auto global = SetThreadConfiguration(configuration, settings.global_size, - settings.mul_global, settings.div_global); - const auto local = SetThreadConfiguration(configuration, settings.local_size, - settings.mul_local, settings.div_local); - - // Sets the parameters for this configuration - auto kernel_source = std::string{""}; - for (const auto &parameter : configuration) { - kernel_source += "#define " + parameter.first + " " + ToString(parameter.second) + "\n"; - } -
kernel_source += settings.sources; - - // Compiles the kernel - const auto start_time = std::chrono::steady_clock::now(); - auto compiler_options = std::vector(); - const auto program = CompileFromSource(kernel_source, args.precision, settings.kernel_name, - device, context, compiler_options, 0, true); - auto kernel = Kernel(program, settings.kernel_name); - const auto elapsed_time = std::chrono::steady_clock::now() - start_time; - const auto timing = std::chrono::duration(elapsed_time).count(); - printf(" %sOK%s %5.0lf ms |", kPrintSuccess.c_str(), kPrintEnd.c_str(), timing); - - // Runs the kernel - C::SetArguments(kernel, args, device_buffers); - const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device, global, local); - - // Kernel run was not successful - if (time_ms == -1.0) { - printf(" - |"); - printf(" %sinvalid config.%s |", kPrintError.c_str(), kPrintEnd.c_str()); - printf(" <-- skipping\n"); - continue; - } - - // Compares the results - auto l2_error = 0.0; - for (const auto id : settings.outputs) { - device_buffers[id].Read(queue, buffer_sizes[id], result_buffers[id]); - for (auto index = size_t{0}; index(buffer_sizes[id]); - if (std::isnan(l2_error) || l2_error > max_l2_norm) { - printf(" - |"); - printf(" %sL2 error %8.2e%s |", kPrintError.c_str(), l2_error, kPrintEnd.c_str()); - throw std::runtime_error("L2 error too large"); - } - } - - // All was OK - configuration["PRECISION"] = static_cast(args.precision); - results.push_back(TuningResult{settings.kernel_name, time_ms, configuration}); - printf(" %6.1lf |", settings.metric_amount / (time_ms * 1.0e6)); - printf(" %sresults match%s |\n", kPrintSuccess.c_str(), kPrintEnd.c_str()); - } - catch (CLCudaAPIBuildError) { - const auto status_code = DispatchExceptionCatchAll(true); - printf(" %scompilation error: %5d%s |", - kPrintError.c_str(), static_cast(status_code), kPrintEnd.c_str()); - printf(" - | - | <-- skipping\n"); - } - catch (...) 
{ - const auto status_code = DispatchExceptionCatchAll(true); - if (status_code != StatusCode::kUnknownError) { - printf(" %serror code %d%s |", - kPrintError.c_str(), static_cast(status_code), kPrintEnd.c_str()); - } - printf(" <-- skipping\n"); - } - } - - // Completed the tuning process - print_separator(settings.parameters.size()); - printf("\n"); - if (results.size() == 0) { return; } - - // Computes the best results - auto comparison = [](const TuningResult& lhs, const TuningResult& rhs) { return lhs.score < rhs.score; }; - const auto best_configuration = std::min_element(results.begin(), results.end(), comparison); - const auto best_time_ms = best_configuration->score; - if (best_time_ms == 0.0) { return; } - - // Also prints the performance of the best-case in terms of GB/s or GFLOPS - printf("\n"); - printf("* Found best result %.2lf ms", best_time_ms); - printf(": %.1lf %s\n", settings.metric_amount / (best_time_ms * 1.0e6), - settings.performance_unit.c_str()); - printf("* Best parameters: "); - auto best_string = std::string{""}; - auto i = size_t{0}; - for (const auto config : best_configuration->config) { - best_string += "" + config.first + "=" + ToString(config.second); - if (i < best_configuration->config.size() - 1) { best_string += " "; } - ++i; - } - printf("%s\n\n", best_string.c_str()); - - // Outputs the results as JSON to disk, including some meta-data - auto precision_string = std::to_string(static_cast(args.precision)); - auto metadata = std::vector>{ - {"kernel_family", settings.kernel_family}, - {"precision", precision_string}, - {"best_kernel", best_configuration->name}, - {"best_time", ToString(best_configuration->score)}, - {"best_parameters", best_string} - }; - for (auto &o: defaults.options) { - if (o == kArgM) { metadata.push_back({"arg_m", ToString(args.m)}); } - if (o == kArgN) { metadata.push_back({"arg_n", ToString(args.n)}); } - if (o == kArgK) { metadata.push_back({"arg_k", ToString(args.k)}); } - if (o == kArgAlpha) { 
metadata.push_back({"arg_alpha", ToString(args.alpha)}); } - if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } - if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); } - } - PrintTimingsToFileAsJSON("clblast_" + settings.kernel_family + "_" + precision_string + ".json", - device, platform, metadata, results); - - printf("* Completed tuning process\n"); - printf("\n"); -} +template +void Tuner(int argc, char* argv[], const int V, + GetTunerDefaultsFunc GetTunerDefaults, + GetTunerSettingsFunc GetTunerSettings, + TestValidArgumentsFunc TestValidArguments, + SetConstraintsFunc SetConstraints, + SetArgumentsFunc SetArguments); // ================================================================================================= } // namespace clblast -- cgit v1.2.3