From 4b3e3dcfe0a2bf97b2703b1f1fd1488c99244ff4 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sat, 13 Jun 2015 20:46:01 +0200 Subject: Added a fast GEMV kernel with vector loads, no tail, and fewer if-statements --- src/tuning/tuning.cc | 1 + src/tuning/xgemv.cc | 14 +++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'src/tuning') diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc index 94333089..d617af88 100644 --- a/src/tuning/tuning.cc +++ b/src/tuning/tuning.cc @@ -87,6 +87,7 @@ void TunerAXY(int argc, char* argv[], const Tuner3 &tune_function) { args.n = GetArgument(argc, argv, help, kArgN, size_t{1024}); args.alpha = GetArgument(argc, argv, help, kArgAlpha, GetScalar()); args.beta = GetArgument(argc, argv, help, kArgBeta, GetScalar()); + args.layout = GetArgument(argc, argv, help, kArgLayout, Layout::kColMajor); fprintf(stdout, "%s\n", help.c_str()); // Creates input buffers with random data diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc index 6037a5a0..e2d54729 100644 --- a/src/tuning/xgemv.cc +++ b/src/tuning/xgemv.cc @@ -33,29 +33,33 @@ void XgemvTune(const Arguments &args, std::string kernel_source = #include "../src/kernels/xgemv.opencl" auto sources = common_source + kernel_source; - auto id = tuner.AddKernelFromString(sources, "Xgemv", {args.m}, {1}); + auto id = tuner.AddKernelFromString(sources, "XgemvFast", {args.m}, {1}); tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64}); // Sets the tunable parameters and their possible values tuner.AddParameter(id, "WGS", {64, 128, 256, 512, 1024, 1536, 2048}); - tuner.AddParameter(id, "WPT", {1, 2, 4}); - tuner.AddParameter(id, "VW", {1}); + tuner.AddParameter(id, "WPT", {1, 2, 4, 8}); + tuner.AddParameter(id, "VW", {1, 2, 4, 8}); // Tests for a specific precision tuner.AddParameter(id, "PRECISION", {static_cast(args.precision)}); tuner.AddParameterReference("PRECISION", static_cast(args.precision)); + // Sets the constraints + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; + tuner.AddConstraint(id, MultipleOfX, {"WGS", "VW"}); + tuner.AddConstraint(id, MultipleOfX, {"WPT", "VW"}); + // Modifies the thread-sizes (local) based on the parameters tuner.MulLocalSize(id, {"WGS"}); tuner.DivGlobalSize(id, {"WPT"}); - tuner.DivGlobalSize(id, {"VW"}); // Sets the function's arguments tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(args.alpha); tuner.AddArgumentScalar(args.beta); - tuner.AddArgumentScalar(0); + tuner.AddArgumentScalar(static_cast(args.layout)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); tuner.AddArgumentScalar(static_cast(args.m)); -- cgit v1.2.3