summary | refs | log | tree | commit | diff
path: root/src/tuning
diff options
context:
space:
mode:
author CNugteren <web@cedricnugteren.nl> 2015-06-13 20:46:01 +0200
committer CNugteren <web@cedricnugteren.nl> 2015-06-13 20:46:01 +0200
commit 4b3e3dcfe0a2bf97b2703b1f1fd1488c99244ff4 (patch)
tree 4d74c65281d96a4eac06db975f99ac95c6133cdf /src/tuning
parent 584f80c6663a167e117db38259c94d3f1df45156 (diff)
Added a fast GEMV kernel with vector loads, no tail, and fewer if-statements
Diffstat (limited to 'src/tuning')
-rw-r--r-- src/tuning/tuning.cc | 1
-rw-r--r-- src/tuning/xgemv.cc | 14
2 files changed, 10 insertions, 5 deletions
diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc
index 94333089..d617af88 100644
--- a/src/tuning/tuning.cc
+++ b/src/tuning/tuning.cc
@@ -87,6 +87,7 @@ void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) {
args.n = GetArgument(argc, argv, help, kArgN, size_t{1024});
args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>());
+ args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kColMajor);
fprintf(stdout, "%s\n", help.c_str());
// Creates input buffers with random data
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
index 6037a5a0..e2d54729 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/xgemv.cc
@@ -33,29 +33,33 @@ void XgemvTune(const Arguments<T> &args,
std::string kernel_source =
#include "../src/kernels/xgemv.opencl"
auto sources = common_source + kernel_source;
- auto id = tuner.AddKernelFromString(sources, "Xgemv", {args.m}, {1});
+ auto id = tuner.AddKernelFromString(sources, "XgemvFast", {args.m}, {1});
tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
// Sets the tunable parameters and their possible values
tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048});
- tuner.AddParameter(id, "WPT", {1, 2, 4});
- tuner.AddParameter(id, "VW", {1});
+ tuner.AddParameter(id, "WPT", {1, 2, 4, 8});
+ tuner.AddParameter(id, "VW", {1, 2, 4, 8});
// Tests for a specific precision
tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+ // Sets the constraints
+ auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+ tuner.AddConstraint(id, MultipleOfX, {"WGS", "VW"});
+ tuner.AddConstraint(id, MultipleOfX, {"WPT", "VW"});
+
// Modifies the thread-sizes (local) based on the parameters
tuner.MulLocalSize(id, {"WGS"});
tuner.DivGlobalSize(id, {"WPT"});
- tuner.DivGlobalSize(id, {"VW"});
// Sets the function's arguments
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(args.alpha);
tuner.AddArgumentScalar(args.beta);
- tuner.AddArgumentScalar(0);
+ tuner.AddArgumentScalar(static_cast<int>(args.layout));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentScalar(0);
tuner.AddArgumentScalar(static_cast<int>(args.m));