summary | refs | log | tree | commit | diff
path: root/src/tuning/kernels/xgemv.cpp
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-19 12:58:41 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-11-19 12:58:41 +0100
commit: 7a54494577ccee401b63cfa82688661fc66f59a4 (patch)
tree: c97a969961a80dad72797fdd2ce619a2b40d34bc /src/tuning/kernels/xgemv.cpp
parent: 8a5a5e031e3552ef36d7b3a16ecf5cef6cdb4614 (diff)
Modified the kernel tuners to use the newly integrated auto-tuner
Diffstat (limited to 'src/tuning/kernels/xgemv.cpp')
-rw-r--r-- src/tuning/kernels/xgemv.cpp 73
1 files changed, 31 insertions, 42 deletions
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index e66b15f1..3eadd32b 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -7,7 +7,7 @@
// Author(s):
// Cedric Nugteren <www.cedricnugteren.nl>
//
-// This file uses the CLTune auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
+// This file uses the auto-tuner to tune the xgemv OpenCL kernels. Three variants are tuned:
// 1: The full version of the kernel
// 2: The fast version for non-transposed matrices
// 3: The fast version for transposed matrices
@@ -45,7 +45,6 @@ class TuneXgemv {
settings.kernel_family = (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot");
settings.kernel_name = (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot");
settings.sources =
-#include "../src/kernels/common.opencl"
#include "../src/kernels/level2/xgemv.opencl"
#include "../src/kernels/level2/xgemv_fast.opencl"
;
@@ -55,6 +54,10 @@ class TuneXgemv {
settings.size_y = args.m;
settings.size_a = args.m * args.n;
+ // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+ settings.inputs = {0, 1, 2};
+ settings.outputs = {1};
+
// Sets the base thread configuration
settings.global_size = {args.m};
settings.global_size_ref = settings.global_size;
@@ -63,9 +66,7 @@ class TuneXgemv {
// Transforms the thread configuration based on the parameters
settings.mul_local = {{"WGS"+std::to_string(V)}};
- settings.div_global = (V==1 || V==2) ?
- TunerSettings::TransformVector{{"WPT"+std::to_string(V)}} :
- TunerSettings::TransformVector{};
+ settings.div_global = (V==1 || V==2) ? TransformVector{{"WPT"+std::to_string(V)}} : TransformVector{};
// Sets the tuning parameters and their possible values
if (V==1) {
@@ -98,53 +99,41 @@ class TuneXgemv {
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) { }
-
- // Sets the constraints and local memory size
- static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+ static std::vector<Constraint> SetConstraints() {
+ auto constraints = std::vector<Constraint>();
if (V==2 || V==3) {
auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
- tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
+ constraints.push_back({MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}});
}
if (V==3) {
auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; };
- tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
- }
- }
- static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
- if (V==1 || V==2) {
- auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
- tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
- }
- else {
- auto LocalMemorySize = [args] (std::vector<size_t> v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); };
- tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
+ constraints.push_back({LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}});
}
+ return constraints;
}
// Sets the kernel's arguments
- static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
- std::vector<T> &x_vec, std::vector<T> &y_vec,
- std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
- std::vector<T> &) {
+ static void SetArguments(Kernel &kernel, const Arguments<T> &args,
+ std::vector<Buffer<T>>& buffers) {
auto a_rotated = (V==3) ? 1 : 0;
- tuner.AddArgumentScalar(static_cast<int>(args.m));
- tuner.AddArgumentScalar(static_cast<int>(args.n));
- tuner.AddArgumentScalar(GetRealArg(args.alpha));
- tuner.AddArgumentScalar(GetRealArg(args.beta));
- tuner.AddArgumentScalar(static_cast<int>(a_rotated));
- tuner.AddArgumentInput(a_mat);
- tuner.AddArgumentScalar(0);
- tuner.AddArgumentScalar(static_cast<int>(args.m));
- tuner.AddArgumentInput(x_vec);
- tuner.AddArgumentScalar(0);
- tuner.AddArgumentScalar(1);
- tuner.AddArgumentOutput(y_vec);
- tuner.AddArgumentScalar(0);
- tuner.AddArgumentScalar(1);
- tuner.AddArgumentScalar(0); // Conjugate transpose
- tuner.AddArgumentScalar(0); // Additional parameter
- tuner.AddArgumentScalar(0); // Banded 'kl'
- tuner.AddArgumentScalar(0); // Banded 'ku'
+ kernel.SetArgument(0, static_cast<int>(args.m));
+ kernel.SetArgument(1, static_cast<int>(args.n));
+ kernel.SetArgument(2, GetRealArg(args.alpha));
+ kernel.SetArgument(3, GetRealArg(args.beta));
+ kernel.SetArgument(4, a_rotated);
+ kernel.SetArgument(5, buffers[2]()); // 2 == A matrix
+ kernel.SetArgument(6, 0);
+ kernel.SetArgument(7, static_cast<int>(args.m));
+ kernel.SetArgument(8, buffers[0]()); // 0 == X vector
+ kernel.SetArgument(9, 0);
+ kernel.SetArgument(10, 1);
+ kernel.SetArgument(11, buffers[1]()); // 1 == Y vector
+ kernel.SetArgument(12, 0);
+ kernel.SetArgument(13, 1);
+ kernel.SetArgument(14, 0); // Conjugate transpose
+ kernel.SetArgument(15, 0); // Additional parameter
+ kernel.SetArgument(16, 0); // Banded 'kl'
+ kernel.SetArgument(17, 0); // Banded 'ku'
}
};