From 7a54494577ccee401b63cfa82688661fc66f59a4 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 19 Nov 2017 12:58:41 +0100 Subject: Modified the kernel tuners to use the newly integrated auto-tuner --- src/tuning/kernels/copy_fast.cpp | 26 +++++----- src/tuning/kernels/copy_pad.cpp | 42 ++++++++------- src/tuning/kernels/transpose_fast.cpp | 31 +++++------ src/tuning/kernels/transpose_pad.cpp | 47 ++++++++--------- src/tuning/kernels/xaxpy.cpp | 26 +++++----- src/tuning/kernels/xdot.cpp | 46 ++++++++--------- src/tuning/kernels/xgemm.cpp | 81 +++++++++++------------------ src/tuning/kernels/xgemm_direct.cpp | 96 ++++++++++++++--------------------- src/tuning/kernels/xgemv.cpp | 73 +++++++++++--------------- src/tuning/kernels/xger.cpp | 44 ++++++++-------- 10 files changed, 218 insertions(+), 294 deletions(-) (limited to 'src/tuning/kernels') diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 068c5f1b..462107d3 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the copy OpenCL kernels. +// This file uses the auto-tuner to tune the copy OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TuneCopy { settings.kernel_family = "copy"; settings.kernel_name = "CopyMatrixFast"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_fast.opencl" ; @@ -51,6 +50,10 @@ class TuneCopy { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,20 +81,15 @@ class TuneCopy { // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + static std::vector SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, buffers[2]()); // 2 == A matrix + kernel.SetArgument(2, buffers[3]()); // 3 == B matrix + kernel.SetArgument(3, GetRealArg(args.alpha)); } }; diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 7102d05d..24557517 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the pad OpenCL kernels. +// This file uses the auto-tuner to tune the pad OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TunePad { settings.kernel_family = "pad"; settings.kernel_name = "CopyPadMatrix"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/copy_pad.opencl" ; @@ -51,6 +50,10 @@ class TunePad { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,28 +81,23 @@ class TunePad { // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + static std::vector SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(0); + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.m)); + kernel.SetArgument(3, 0); + kernel.SetArgument(4, buffers[2]()); // 2 == A matrix + kernel.SetArgument(5, static_cast(args.m)); + kernel.SetArgument(6, static_cast(args.n)); + kernel.SetArgument(7, static_cast(args.m)); + kernel.SetArgument(8, 0); + kernel.SetArgument(9, buffers[3]()); // 3 == B matrix + kernel.SetArgument(10, GetRealArg(args.alpha)); + kernel.SetArgument(11, 0); } }; diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 56726903..1e0d3c7b 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the transpose OpenCL kernels. +// This file uses the auto-tuner to tune the transpose OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TuneTranspose { settings.kernel_family = "transpose"; settings.kernel_name = "TransposeMatrixFast"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_fast.opencl" ; @@ -51,6 +50,10 @@ class TuneTranspose { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,25 +81,15 @@ class TuneTranspose { // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"TRA_DIM", "TRA_WPT", "TRA_PAD"}); - } + static std::vector SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, buffers[2]()); // 2 == A matrix + kernel.SetArgument(2, buffers[3]()); // 3 == B matrix + kernel.SetArgument(3, GetRealArg(args.alpha)); } }; diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index dc46e903..087f8e67 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the padtranspose OpenCL kernels. +// This file uses the auto-tuner to tune the pad-transpose OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TunePadTranspose { settings.kernel_family = "padtranspose"; settings.kernel_name = "TransposePadMatrix"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/level3.opencl" #include "../src/kernels/level3/transpose_pad.opencl" ; @@ -51,6 +50,10 @@ class TunePadTranspose { settings.size_a = args.m * args.n; settings.size_b = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -77,33 +80,23 @@ class TunePadTranspose { // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*v[1]*(v[0]*v[1]+v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"}); - } + static std::vector SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(0); - tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(0); + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.m)); + kernel.SetArgument(3, 0); + kernel.SetArgument(4, buffers[2]()); // 2 == A matrix + kernel.SetArgument(5, static_cast(args.n)); + kernel.SetArgument(6, static_cast(args.m)); + kernel.SetArgument(7, static_cast(args.n)); + kernel.SetArgument(8, 0); + kernel.SetArgument(9, buffers[3]()); // 3 == B matrix + kernel.SetArgument(10, GetRealArg(args.alpha)); + kernel.SetArgument(11, 0); } }; diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index e201949a..d843ea78 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the xaxpy OpenCL kernels. +// This file uses the auto-tuner to tune the xaxpy OpenCL kernels. // // ================================================================================================= @@ -41,7 +41,6 @@ class TuneXaxpy { settings.kernel_family = "xaxpy"; settings.kernel_name = "XaxpyFastest"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level1/level1.opencl" #include "../src/kernels/level1/xaxpy.opencl" ; @@ -50,6 +49,10 @@ class TuneXaxpy { settings.size_x = args.n; settings.size_y = args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1}; + settings.outputs = {1}; + // Sets the base thread configuration settings.global_size = {args.n}; settings.global_size_ref = settings.global_size; @@ -80,20 +83,15 @@ class TuneXaxpy { throw std::runtime_error("'XaxpyFastest' requires 'n' to be a multiple of WGS*WPT*VW"); } } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + static std::vector SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &, std::vector &, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentOutput(y_vec); + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.n)); + kernel.SetArgument(1, GetRealArg(args.alpha)); + kernel.SetArgument(2, buffers[0]()); // 0 == X vector + kernel.SetArgument(3, buffers[1]()); // 1 == Y vector } }; diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index fb532680..12350657 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the xdot OpenCL kernels. Note that the results are +// This file uses the auto-tuner to tune the xdot OpenCL kernels. Note that the results are // not verified, since the result is not final and depends on the WGS2 parameter. // // ================================================================================================= @@ -42,7 +42,6 @@ class TuneXdot { settings.kernel_family = "xdot_"+std::to_string(V); settings.kernel_name = (V==1) ? "Xdot" : "XdotEpilogue"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level1/xdot.opencl" ; @@ -51,6 +50,10 @@ class TuneXdot { settings.size_y = args.n; settings.size_temp = args.n; // Worst case + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 5}; + settings.outputs = {}; // no output checking + // Sets the base thread configuration settings.global_size = (V==1) ? std::vector{2*64} : std::vector{1}; settings.global_size_ref = (V==1) ? std::vector{2*64*64} : std::vector{64}; @@ -58,8 +61,8 @@ class TuneXdot { settings.local_size_ref = {64}; // Transforms the thread configuration based on the parameters - settings.mul_local = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; - settings.mul_global = (V==1) ? TunerSettings::TransformVector{{"WGS1"}} : TunerSettings::TransformVector{{"WGS2"}}; + settings.mul_local = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; + settings.mul_global = (V==1) ? TransformVector{{"WGS1"}} : TransformVector{{"WGS2"}}; // Sets the tuning parameters and their possible values settings.parameters = { @@ -75,31 +78,26 @@ class TuneXdot { // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + static std::vector SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &, std::vector &, std::vector &, - std::vector &temp) { + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { if (V == 1) { - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentInput(temp); // No output checking for the result - size varies - tuner.AddArgumentScalar(static_cast(false)); + kernel.SetArgument(0, static_cast(args.n)); + kernel.SetArgument(1, buffers[0]()); // 0 == X vector + kernel.SetArgument(2, 0); + kernel.SetArgument(3, 1); + kernel.SetArgument(4, buffers[1]()); // 1 == Y vector + kernel.SetArgument(5, 0); + kernel.SetArgument(6, 1); + kernel.SetArgument(7, buffers[5]()); // 5 == temp; no output checking - size varies + kernel.SetArgument(8, static_cast(false)); } else { - tuner.AddArgumentInput(temp); - tuner.AddArgumentInput(x_vec); // No output checking for the result - store somewhere - tuner.AddArgumentScalar(0); + kernel.SetArgument(0, buffers[5]()); // 5 == temp + kernel.SetArgument(1, buffers[0]()); // 0 == X vector; no output checking - size varies + kernel.SetArgument(2, 0); } } }; diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 6dcdf68b..16e32988 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations: +// This file uses the auto-tuner to tune the xgemm OpenCL kernels. There are two variations: // - V==1: This tests some limited set of tuning parameters exhaustively. // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // @@ -38,7 +38,6 @@ class TuneXgemm { settings.default_k = 1024; settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly settings.default_num_runs = 2; - settings.default_heuristic = static_cast(cltune::SearchMethod::RandomSearch); return settings; } @@ -50,7 +49,6 @@ class TuneXgemm { settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2"; settings.kernel_name = "Xgemm"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/xgemm_part1.opencl" #include "../src/kernels/level3/xgemm_part2.opencl" #include "../src/kernels/level3/xgemm_part3.opencl" @@ -61,6 +59,10 @@ class TuneXgemm { settings.size_b = args.n * args.k; settings.size_c = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3, 4}; + settings.outputs = {4}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -114,74 +116,51 @@ class TuneXgemm { settings.metric_amount = 2 * args.m * args.n * args.k; settings.performance_unit = "GFLOPS"; - // Returns which search heuristic to use - if (V==1) { settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); - } else { - settings.heuristic = args.heuristic_selection; - } - } - return settings; } // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + static std::vector SetConstraints() { + auto constraints = std::vector(); auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Requirement for unrolling the KWG loop - tuner.AddConstraint(id, MultipleOfX, {"KWG", "KWI"}); + constraints.push_back({MultipleOfX, {"KWG", "KWI"}}); // Required for integer MWI and NWI - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}); + constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMC", "VWM"}}); + constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMC", "VWN"}}); // Required for integer MWIA and NWIB - tuner.AddConstraint(id, MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}); + constraints.push_back({MultipleOfXMulY, {"MWG", "MDIMA", "VWM"}}); + constraints.push_back({MultipleOfXMulY, {"NWG", "NDIMB", "VWN"}}); // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}}); + constraints.push_back({MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}}); // Extra constraints for variation 1 to limit the set of options significantly if (V==1) { auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; - tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"}); - tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"}); - tuner.AddConstraint(id, IsEqual, {"SA", "SB"}); + constraints.push_back({IsEqual, {"MDIMC", "MDIMA"}}); + constraints.push_back({IsEqual, {"NDIMC", "NDIMB"}}); + constraints.push_back({IsEqual, {"SA", "SB"}}); } - } - - // Sets the local memory size - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", - "SB", "KWG", "NWG"}); + return constraints; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(GetRealArg(args.beta)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentInput(b_mat); - tuner.AddArgumentOutput(c_mat); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(0); + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.k)); + kernel.SetArgument(3, GetRealArg(args.alpha)); + kernel.SetArgument(4, GetRealArg(args.beta)); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, buffers[3]()); // 3 == B matrix + kernel.SetArgument(7, buffers[4]()); // 4 == C matrix + kernel.SetArgument(8, 0); + kernel.SetArgument(9, 0); } }; diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 619fb37a..60a983b4 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations: +// This file uses the auto-tuner to tune the direct xgemm kernels. There are two variations: // - V==1: This tests some limited set of tuning parameters exhaustively. // - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // @@ -36,9 +36,8 @@ class TuneXgemmDirect { settings.default_m = 256; settings.default_n = 256; settings.default_k = 256; - settings.default_fraction = (V==1) ? 1.0 : 32.0; // test all or sample randomly + settings.default_fraction = (V==1) ? 1.0 : 64.0; // test all or sample randomly settings.default_num_runs = 4; - settings.default_heuristic = static_cast(cltune::SearchMethod::RandomSearch); return settings; } @@ -50,7 +49,6 @@ class TuneXgemmDirect { settings.kernel_family = (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; settings.kernel_name = "XgemmDirectTN"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level3/xgemm_direct_part1.opencl" #include "../src/kernels/level3/xgemm_direct_part2.opencl" #include "../src/kernels/level3/xgemm_direct_part3.opencl" @@ -61,6 +59,10 @@ class TuneXgemmDirect { settings.size_b = args.n * args.k; settings.size_c = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3, 4}; + settings.outputs = {4}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -89,7 +91,7 @@ class TuneXgemmDirect { } else { // a lot more tuning parameters - has to be sampled randomly, too much to test all settings.parameters = { - {"WGD", {8, 16, 32, 64, 128}}, + {"WGD", {8, 16, 32, 64}}, {"MDIMCD", {8, 16, 32}}, {"NDIMCD", {8, 16, 32}}, {"MDIMAD", {8, 16, 32}}, @@ -106,79 +108,57 @@ class TuneXgemmDirect { settings.metric_amount = 2 * args.m * args.n * args.k; settings.performance_unit = "GFLOPS"; - // Returns which search heuristic to use - if (V==1) { settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); } - else { - // Use full-search to explore all parameter combinations or another strategy to search only a - // part of the parameter values. The fraction is set as a command-line argument. - if (args.fraction == 1.0 || args.fraction == 0.0) { - settings.heuristic = static_cast(cltune::SearchMethod::FullSearch); - } else { - settings.heuristic = args.heuristic_selection; - } - } - return settings; } // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + static std::vector SetConstraints() { + auto constraints = std::vector(); auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; auto MultipleOfXMulY = [] (std::vector v) { return IsMultiple(v[0], v[1]*v[2]); }; auto MultipleOfXMulYDivZ = [] (std::vector v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); }; // Requirement for unrolling the WGD loop - tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"}); + constraints.push_back({MultipleOfX, {"WGD", "KWID"}}); // Required for integer MWID and NWID - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}); + constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"}}); + constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"}}); // Required for integer MWIAD and NWIBD - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}); - tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}); + constraints.push_back({MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"}}); + constraints.push_back({MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"}}); // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...) - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}); - tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}); + constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"}}); + constraints.push_back({MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"}}); // Extra constraints for variation 1 to limit the set of options significantly if (V==1) { auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; - tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"}); - tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"}); + constraints.push_back({IsEqual, {"MDIMCD", "MDIMAD"}}); + constraints.push_back({IsEqual, {"NDIMCD", "NDIMBD"}}); } - } - - // Sets the local memory size - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { - return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision)); - }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"}); + return constraints; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &, std::vector &, - std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(GetRealArg(args.beta)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(0); // a_offset - tuner.AddArgumentScalar(static_cast(args.k)); // a_ld - tuner.AddArgumentInput(b_mat); - tuner.AddArgumentScalar(0); // b_offset - tuner.AddArgumentScalar(static_cast(args.n)); // b_ld - tuner.AddArgumentOutput(c_mat); - tuner.AddArgumentScalar(0); // c_offset - tuner.AddArgumentScalar(static_cast(args.n)); // c_ld - tuner.AddArgumentScalar(1); // c_do_transpose - tuner.AddArgumentScalar(0); // a_conjugate - tuner.AddArgumentScalar(0); // b_conjugate + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, static_cast(args.k)); + kernel.SetArgument(3, GetRealArg(args.alpha)); + kernel.SetArgument(4, GetRealArg(args.beta)); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, 0); // a_offset + kernel.SetArgument(7, static_cast(args.k)); // a_ld + kernel.SetArgument(8, buffers[3]()); // 3 == B matrix + kernel.SetArgument(9, 0); // b_offset + kernel.SetArgument(10, static_cast(args.n)); // b_ld + kernel.SetArgument(11, buffers[4]()); // 4 == C matrix + kernel.SetArgument(12, 0); // c_offset + kernel.SetArgument(13, static_cast(args.n)); // c_ld + kernel.SetArgument(14, 1); // c_do_transpose + kernel.SetArgument(15, 0); // a_conjugate + kernel.SetArgument(16, 0); // b_conjugate } }; diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index e66b15f1..3eadd32b 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned: +// This file uses the auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned: // 1: The full version of the kernel // 2: The fast version for non-transposed matrices // 3: The fast version for transposed matrices @@ -45,7 +45,6 @@ class TuneXgemv { settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level2/xgemv.opencl" #include "../src/kernels/level2/xgemv_fast.opencl" ; @@ -55,6 +54,10 @@ class TuneXgemv { settings.size_y = args.m; settings.size_a = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 2}; + settings.outputs = {1}; + // Sets the base thread configuration settings.global_size = {args.m}; settings.global_size_ref = settings.global_size; @@ -63,9 +66,7 @@ class TuneXgemv { // Transforms the thread configuration based on the parameters settings.mul_local = {{"WGS"+std::to_string(V)}}; - settings.div_global = (V==1 || V==2) ? - TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} : - TunerSettings::TransformVector{}; + settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{}; // Sets the tuning parameters and their possible values if (V==1) { @@ -98,53 +99,41 @@ class TuneXgemv { // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &tuner, const size_t id) { + static std::vector SetConstraints() { + auto constraints = std::vector(); if (V==2 || V==3) { auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; - tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); + constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}}); } if (V==3) { auto LargerOrEqual = [] (std::vector v) { return v[0] >= v[1]; }; - tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); - } - } - static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - if (V==1 || V==2) { - auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); - } - else { - auto LocalMemorySize = [args] (std::vector v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}}); } + return constraints; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &a_mat, std::vector &, std::vector &, - std::vector &) { + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { auto a_rotated = (V==3) ? 1 : 0; - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentScalar(GetRealArg(args.beta)); - tuner.AddArgumentScalar(static_cast(a_rotated)); - tuner.AddArgumentInput(a_mat); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentOutput(y_vec); - tuner.AddArgumentScalar(0); - tuner.AddArgumentScalar(1); - tuner.AddArgumentScalar(0); // Conjugate transpose - tuner.AddArgumentScalar(0); // Additional parameter - tuner.AddArgumentScalar(0); // Banded 'kl' - tuner.AddArgumentScalar(0); // Banded 'ku' + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, GetRealArg(args.alpha)); + kernel.SetArgument(3, GetRealArg(args.beta)); + kernel.SetArgument(4, a_rotated); + kernel.SetArgument(5, buffers[2]()); // 2 == A matrix + kernel.SetArgument(6, 0); + kernel.SetArgument(7, static_cast(args.m)); + kernel.SetArgument(8, buffers[0]()); // 0 == X vector + kernel.SetArgument(9, 0); + kernel.SetArgument(10, 1); + kernel.SetArgument(11, buffers[1]()); // 1 == Y vector + kernel.SetArgument(12, 0); + kernel.SetArgument(13, 1); + kernel.SetArgument(14, 0); // Conjugate transpose + kernel.SetArgument(15, 0); // Additional parameter + kernel.SetArgument(16, 0); // Banded 'kl' + kernel.SetArgument(17, 0); // Banded 'ku' } }; diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index c2eb1d31..745e553f 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the xger OpenCL kernels. +// This file uses the auto-tuner to tune the xger OpenCL kernels. // // ================================================================================================= @@ -42,7 +42,6 @@ class TuneXger { settings.kernel_family = "xger"; settings.kernel_name = "Xger"; settings.sources = -#include "../src/kernels/common.opencl" #include "../src/kernels/level2/level2.opencl" #include "../src/kernels/level2/xger.opencl" ; @@ -52,6 +51,10 @@ class TuneXger { settings.size_y = args.n; settings.size_a = args.m * args.n; + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {0, 1, 2}; + settings.outputs = {2}; + // Sets the base thread configuration settings.global_size = {args.m, args.n}; settings.global_size_ref = settings.global_size; @@ -78,29 +81,24 @@ class TuneXger { // Tests for valid arguments static void TestValidArguments(const Arguments &) { } - - // Sets the constraints and local memory size - static void SetConstraints(cltune::Tuner &, const size_t) { } - static void SetLocalMemorySize(cltune::Tuner &, const size_t, const Arguments &) { } + static std::vector SetConstraints() { return {}; } // Sets the kernel's arguments - static void SetArguments(cltune::Tuner &tuner, const Arguments &args, - std::vector &x_vec, std::vector &y_vec, - std::vector &a_mat, std::vector &, std::vector &, - std::vector &) { - tuner.AddArgumentScalar(static_cast(args.m)); - tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentScalar(GetRealArg(args.alpha)); - tuner.AddArgumentInput(x_vec); - tuner.AddArgumentScalar(0); // x_offset - tuner.AddArgumentScalar(1); // x_increment - tuner.AddArgumentInput(y_vec); - tuner.AddArgumentScalar(0); // y_offset - tuner.AddArgumentScalar(1); // y_increment - tuner.AddArgumentOutput(a_mat); - tuner.AddArgumentScalar(0); // a_offset - tuner.AddArgumentScalar(static_cast(args.m)); // a_ld - tuner.AddArgumentScalar(0); // a_is_rowmajor + static void SetArguments(Kernel &kernel, const Arguments &args, + std::vector>& buffers) { + kernel.SetArgument(0, static_cast(args.m)); + kernel.SetArgument(1, static_cast(args.n)); + kernel.SetArgument(2, GetRealArg(args.alpha)); + kernel.SetArgument(3, buffers[0]()); // 0 == X vector + kernel.SetArgument(4, 0); // x_offset + kernel.SetArgument(5, 1); // x_increment + kernel.SetArgument(6, buffers[1]()); // 1 == Y vector + kernel.SetArgument(7, 0); // y_offset + kernel.SetArgument(8, 1); // y_increment + kernel.SetArgument(9, buffers[2]()); // 2 == A matrix + kernel.SetArgument(10, 0); // a_offset + kernel.SetArgument(11, static_cast(args.m)); // a_ld + kernel.SetArgument(12, 0); // a_is_rowmajor } }; -- cgit v1.2.3