diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2018-03-22 21:01:02 +0100 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2018-03-22 21:01:02 +0100 |
commit | 9fb6550dd02c54fafbb03e20516a394d9cd63f3f (patch) | |
tree | 143bd103186eb0d3fd9672ab8a41ebb076d1107d /src/tuning/kernels | |
parent | 7a2371213bb900c8c726c346f85920199f424d82 (diff) |
Added the OpenCL local memory size constraint to the tuners
Diffstat (limited to 'src/tuning/kernels')
-rw-r--r-- | src/tuning/kernels/copy_fast.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/copy_fast.hpp | 4 | ||||
-rw-r--r-- | src/tuning/kernels/copy_pad.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/copy_pad.hpp | 4 | ||||
-rw-r--r-- | src/tuning/kernels/invert.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/invert.hpp | 9 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_fast.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_fast.hpp | 9 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_pad.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/transpose_pad.hpp | 9 | ||||
-rw-r--r-- | src/tuning/kernels/xaxpy.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/xaxpy.hpp | 4 | ||||
-rw-r--r-- | src/tuning/kernels/xdot.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/xdot.hpp | 8 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm.hpp | 9 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm_direct.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/xgemm_direct.hpp | 9 | ||||
-rw-r--r-- | src/tuning/kernels/xgemv.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/xgemv.hpp | 17 | ||||
-rw-r--r-- | src/tuning/kernels/xger.cpp | 10 | ||||
-rw-r--r-- | src/tuning/kernels/xger.hpp | 4 |
22 files changed, 141 insertions, 55 deletions
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 0314113c..13f7ef3c 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -22,11 +22,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<half>, clblast::CopyTestValidArguments<half>, clblast::CopySetConstraints, clblast::CopySetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<float>, clblast::CopyTestValidArguments<float>, clblast::CopySetConstraints, clblast::CopySetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<double>, clblast::CopyTestValidArguments<double>, clblast::CopySetConstraints, clblast::CopySetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<float2>, clblast::CopyTestValidArguments<float2>, clblast::CopySetConstraints, clblast::CopySetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<double2>, clblast::CopyTestValidArguments<double2>, clblast::CopySetConstraints, clblast::CopySetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<half>, clblast::CopyTestValidArguments<half>, clblast::CopySetConstraints, clblast::CopyComputeLocalMemSize<half>, clblast::CopySetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<float>, clblast::CopyTestValidArguments<float>, clblast::CopySetConstraints, clblast::CopyComputeLocalMemSize<float>, clblast::CopySetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<double>, clblast::CopyTestValidArguments<double>, clblast::CopySetConstraints, clblast::CopyComputeLocalMemSize<double>, clblast::CopySetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<float2>, clblast::CopyTestValidArguments<float2>, clblast::CopySetConstraints, clblast::CopyComputeLocalMemSize<float2>, clblast::CopySetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::CopyGetTunerDefaults, clblast::CopyGetTunerSettings<double2>, clblast::CopyTestValidArguments<double2>, clblast::CopySetConstraints, clblast::CopyComputeLocalMemSize<double2>, clblast::CopySetArguments<double2>); break; } return 0; } diff --git a/src/tuning/kernels/copy_fast.hpp b/src/tuning/kernels/copy_fast.hpp index f9a58bc7..1c4219ae 100644 --- a/src/tuning/kernels/copy_fast.hpp +++ b/src/tuning/kernels/copy_fast.hpp @@ -79,6 +79,10 @@ TunerSettings CopyGetTunerSettings(const int, const Arguments<T> &args) { template <typename T> void CopyTestValidArguments(const int, const Arguments<T> &) { } std::vector<Constraint> CopySetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo CopyComputeLocalMemSize(const int) { + return { [] (std::vector<size_t>) -> size_t { return 0; }, {} }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 909a71c8..ffaed6ed 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -22,11 +22,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<half>, clblast::PadTestValidArguments<half>, clblast::PadSetConstraints, clblast::PadSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<float>, clblast::PadTestValidArguments<float>, clblast::PadSetConstraints, clblast::PadSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<double>, clblast::PadTestValidArguments<double>, clblast::PadSetConstraints, clblast::PadSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<float2>, clblast::PadTestValidArguments<float2>, clblast::PadSetConstraints, clblast::PadSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<double2>, clblast::PadTestValidArguments<double2>, clblast::PadSetConstraints, clblast::PadSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<half>, clblast::PadTestValidArguments<half>, clblast::PadSetConstraints, clblast::PadComputeLocalMemSize<half>, clblast::PadSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<float>, clblast::PadTestValidArguments<float>, clblast::PadSetConstraints, clblast::PadComputeLocalMemSize<float>, clblast::PadSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<double>, clblast::PadTestValidArguments<double>, clblast::PadSetConstraints, clblast::PadComputeLocalMemSize<double>, clblast::PadSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<float2>, clblast::PadTestValidArguments<float2>, clblast::PadSetConstraints, clblast::PadComputeLocalMemSize<float2>, clblast::PadSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::PadGetTunerDefaults, clblast::PadGetTunerSettings<double2>, clblast::PadTestValidArguments<double2>, clblast::PadSetConstraints, clblast::PadComputeLocalMemSize<double2>, clblast::PadSetArguments<double2>); break; } return 0; } diff --git a/src/tuning/kernels/copy_pad.hpp b/src/tuning/kernels/copy_pad.hpp index e612ca9e..ada1cf83 100644 --- a/src/tuning/kernels/copy_pad.hpp +++ b/src/tuning/kernels/copy_pad.hpp @@ -79,6 +79,10 @@ TunerSettings PadGetTunerSettings(const int, const Arguments<T> &args) { template <typename T> void PadTestValidArguments(const int, const Arguments<T> &) { } std::vector<Constraint> PadSetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo PadComputeLocalMemSize(const int) { + return { [] (std::vector<size_t>) -> size_t { return 0; }, {} }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/invert.cpp b/src/tuning/kernels/invert.cpp index 3dfeb508..3795da88 100644 --- a/src/tuning/kernels/invert.cpp +++ b/src/tuning/kernels/invert.cpp @@ -22,11 +22,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<half>, clblast::InvertTestValidArguments<half>, clblast::InvertSetConstraints, clblast::InvertSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<float>, clblast::InvertTestValidArguments<float>, clblast::InvertSetConstraints, clblast::InvertSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<double>, clblast::InvertTestValidArguments<double>, clblast::InvertSetConstraints, clblast::InvertSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<float2>, clblast::InvertTestValidArguments<float2>, clblast::InvertSetConstraints, clblast::InvertSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<double2>, clblast::InvertTestValidArguments<double2>, clblast::InvertSetConstraints, clblast::InvertSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<half>, clblast::InvertTestValidArguments<half>, clblast::InvertSetConstraints, clblast::InvertComputeLocalMemSize<half>, clblast::InvertSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<float>, clblast::InvertTestValidArguments<float>, clblast::InvertSetConstraints, clblast::InvertComputeLocalMemSize<float>, clblast::InvertSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<double>, clblast::InvertTestValidArguments<double>, clblast::InvertSetConstraints, clblast::InvertComputeLocalMemSize<double>, clblast::InvertSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<float2>, clblast::InvertTestValidArguments<float2>, clblast::InvertSetConstraints, clblast::InvertComputeLocalMemSize<float2>, clblast::InvertSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::InvertGetTunerDefaults, clblast::InvertGetTunerSettings<double2>, clblast::InvertTestValidArguments<double2>, clblast::InvertSetConstraints, clblast::InvertComputeLocalMemSize<double2>, clblast::InvertSetArguments<double2>); break; } return 0; } diff --git a/src/tuning/kernels/invert.hpp b/src/tuning/kernels/invert.hpp index 0a0c9ce2..4f74674d 100644 --- a/src/tuning/kernels/invert.hpp +++ b/src/tuning/kernels/invert.hpp @@ -87,6 +87,15 @@ void InvertTestValidArguments(const int, const Arguments<T> &args) { } } std::vector<Constraint> InvertSetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo InvertComputeLocalMemSize(const int) { + return { + [] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * (16 + v[0]) * 16; + }, + {"LOCALPAD"} + }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 6b37a31d..024f7385 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -22,11 +22,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<half>, clblast::TransposeTestValidArguments<half>, clblast::TransposeSetConstraints, clblast::TransposeSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<float>, clblast::TransposeTestValidArguments<float>, clblast::TransposeSetConstraints, clblast::TransposeSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<double>, clblast::TransposeTestValidArguments<double>, clblast::TransposeSetConstraints, clblast::TransposeSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<float2>, clblast::TransposeTestValidArguments<float2>, clblast::TransposeSetConstraints, clblast::TransposeSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<double2>, clblast::TransposeTestValidArguments<double2>, clblast::TransposeSetConstraints, clblast::TransposeSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<half>, clblast::TransposeTestValidArguments<half>, clblast::TransposeSetConstraints, clblast::TransposeComputeLocalMemSize<half>, clblast::TransposeSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<float>, clblast::TransposeTestValidArguments<float>, clblast::TransposeSetConstraints, clblast::TransposeComputeLocalMemSize<float>, clblast::TransposeSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<double>, clblast::TransposeTestValidArguments<double>, clblast::TransposeSetConstraints, clblast::TransposeComputeLocalMemSize<double>, clblast::TransposeSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<float2>, clblast::TransposeTestValidArguments<float2>, clblast::TransposeSetConstraints, clblast::TransposeComputeLocalMemSize<float2>, clblast::TransposeSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::TransposeGetTunerDefaults, clblast::TransposeGetTunerSettings<double2>, clblast::TransposeTestValidArguments<double2>, clblast::TransposeSetConstraints, clblast::TransposeComputeLocalMemSize<double2>, clblast::TransposeSetArguments<double2>); break; } return 0; } diff --git a/src/tuning/kernels/transpose_fast.hpp b/src/tuning/kernels/transpose_fast.hpp index e8917ad2..c6e3f98d 100644 --- a/src/tuning/kernels/transpose_fast.hpp +++ b/src/tuning/kernels/transpose_fast.hpp @@ -79,6 +79,15 @@ TunerSettings TransposeGetTunerSettings(const int, const Arguments<T> &args) { template <typename T> void TransposeTestValidArguments(const int, const Arguments<T> &) { } std::vector<Constraint> TransposeSetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo TransposeComputeLocalMemSize(const int) { + return { + [] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * v[1] * (v[1] * v[0]) * (v[0] + v[2]); + }, + {"TRA_DIM", "TRA_WPT", "TRA_PAD"} + }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index fc7244f6..ffaa252b 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -22,11 +22,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<half>, clblast::PadtransposeTestValidArguments<half>, clblast::PadtransposeSetConstraints, clblast::PadtransposeSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<float>, clblast::PadtransposeTestValidArguments<float>, clblast::PadtransposeSetConstraints, clblast::PadtransposeSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<double>, clblast::PadtransposeTestValidArguments<double>, clblast::PadtransposeSetConstraints, clblast::PadtransposeSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<float2>, clblast::PadtransposeTestValidArguments<float2>, clblast::PadtransposeSetConstraints, clblast::PadtransposeSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<double2>, clblast::PadtransposeTestValidArguments<double2>, clblast::PadtransposeSetConstraints, clblast::PadtransposeSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<half>, clblast::PadtransposeTestValidArguments<half>, clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<half>, clblast::PadtransposeSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<float>, clblast::PadtransposeTestValidArguments<float>, clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<float>, clblast::PadtransposeSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<double>, clblast::PadtransposeTestValidArguments<double>, clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<double>, clblast::PadtransposeSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<float2>, clblast::PadtransposeTestValidArguments<float2>, clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<float2>, clblast::PadtransposeSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::PadtransposeGetTunerDefaults, clblast::PadtransposeGetTunerSettings<double2>, clblast::PadtransposeTestValidArguments<double2>, clblast::PadtransposeSetConstraints, clblast::PadtransposeComputeLocalMemSize<double2>, clblast::PadtransposeSetArguments<double2>); break; } return 0; } diff --git a/src/tuning/kernels/transpose_pad.hpp b/src/tuning/kernels/transpose_pad.hpp index 8d24a0dc..ebc0e4fb 100644 --- a/src/tuning/kernels/transpose_pad.hpp +++ b/src/tuning/kernels/transpose_pad.hpp @@ -78,6 +78,15 @@ TunerSettings PadtransposeGetTunerSettings(const int, const Arguments<T> &args) template <typename T> void PadtransposeTestValidArguments(const int, const Arguments<T> &) { } std::vector<Constraint> PadtransposeSetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo PadtransposeComputeLocalMemSize(const int) { + return { + [] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * (v[1] * v[0]) * (v[1] * v[0] + v[2]); + }, + {"PADTRA_TILE", "PADTRA_WPT", "PADTRA_PAD"} + }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 6a95600d..681876ea 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -22,11 +22,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<half>, clblast::XaxpyTestValidArguments<half>, clblast::XaxpySetConstraints, clblast::XaxpySetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<float>, clblast::XaxpyTestValidArguments<float>, clblast::XaxpySetConstraints, clblast::XaxpySetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<double>, clblast::XaxpyTestValidArguments<double>, clblast::XaxpySetConstraints, clblast::XaxpySetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<float2>, clblast::XaxpyTestValidArguments<float2>, clblast::XaxpySetConstraints, clblast::XaxpySetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<double2>, clblast::XaxpyTestValidArguments<double2>, clblast::XaxpySetConstraints, clblast::XaxpySetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<half>, clblast::XaxpyTestValidArguments<half>, clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<half>, clblast::XaxpySetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<float>, clblast::XaxpyTestValidArguments<float>, clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<float>, clblast::XaxpySetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<double>, clblast::XaxpyTestValidArguments<double>, clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<double>, clblast::XaxpySetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<float2>, clblast::XaxpyTestValidArguments<float2>, clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<float2>, clblast::XaxpySetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::XaxpyGetTunerDefaults, clblast::XaxpyGetTunerSettings<double2>, clblast::XaxpyTestValidArguments<double2>, clblast::XaxpySetConstraints, clblast::XaxpyComputeLocalMemSize<double2>, clblast::XaxpySetArguments<double2>); break; } return 0; } diff --git a/src/tuning/kernels/xaxpy.hpp b/src/tuning/kernels/xaxpy.hpp index 24550ed9..ab2c45f0 100644 --- a/src/tuning/kernels/xaxpy.hpp +++ b/src/tuning/kernels/xaxpy.hpp @@ -81,6 +81,10 @@ void XaxpyTestValidArguments(const int, const Arguments<T> &args) { } } std::vector<Constraint> XaxpySetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo XaxpyComputeLocalMemSize(const int) { + return { [] (std::vector<size_t>) -> size_t { return 0; }, {} }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp index 6d10c4d8..a481f23b 100644 --- a/src/tuning/kernels/xdot.cpp +++ b/src/tuning/kernels/xdot.cpp @@ -24,11 +24,11 @@ template <int V> void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<half>, clblast::XdotTestValidArguments<half>, clblast::XdotSetConstraints, clblast::XdotSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<float>, clblast::XdotTestValidArguments<float>, clblast::XdotSetConstraints, clblast::XdotSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<double>, clblast::XdotTestValidArguments<double>, clblast::XdotSetConstraints, clblast::XdotSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<float2>, clblast::XdotTestValidArguments<float2>, clblast::XdotSetConstraints, clblast::XdotSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<double2>, clblast::XdotTestValidArguments<double2>, clblast::XdotSetConstraints, clblast::XdotSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<half>, clblast::XdotTestValidArguments<half>, clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<half>, clblast::XdotSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<float>, clblast::XdotTestValidArguments<float>, clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<float>, clblast::XdotSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<double>, clblast::XdotTestValidArguments<double>, clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<double>, clblast::XdotSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<float2>, clblast::XdotTestValidArguments<float2>, clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<float2>, clblast::XdotSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XdotGetTunerDefaults, clblast::XdotGetTunerSettings<double2>, clblast::XdotTestValidArguments<double2>, clblast::XdotSetConstraints, clblast::XdotComputeLocalMemSize<double2>, clblast::XdotSetArguments<double2>); break; } } diff --git a/src/tuning/kernels/xdot.hpp b/src/tuning/kernels/xdot.hpp index 15673c79..901d8fd0 100644 --- a/src/tuning/kernels/xdot.hpp +++ b/src/tuning/kernels/xdot.hpp @@ -76,6 +76,14 @@ TunerSettings XdotGetTunerSettings(const int V, const Arguments<T> &args) { template <typename T> void XdotTestValidArguments(const int, const Arguments<T> &) { } std::vector<Constraint> XdotSetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo XdotComputeLocalMemSize(const int V) { + return { + [] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * v[0]; + }, + {"WGS"+std::to_string(V)} + };} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index d365ce6d..85948373 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -23,11 +23,11 @@ template <int V> void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<half>, clblast::XgemmTestValidArguments<half>, clblast::XgemmSetConstraints, clblast::XgemmSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<float>, clblast::XgemmTestValidArguments<float>, clblast::XgemmSetConstraints, clblast::XgemmSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<double>, clblast::XgemmTestValidArguments<double>, clblast::XgemmSetConstraints, clblast::XgemmSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<float2>, clblast::XgemmTestValidArguments<float2>, clblast::XgemmSetConstraints, clblast::XgemmSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<double2>, clblast::XgemmTestValidArguments<double2>, clblast::XgemmSetConstraints, clblast::XgemmSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<half>, clblast::XgemmTestValidArguments<half>, clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<half>, clblast::XgemmSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<float>, clblast::XgemmTestValidArguments<float>, clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<float>, clblast::XgemmSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<double>, clblast::XgemmTestValidArguments<double>, clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<double>, clblast::XgemmSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<float2>, clblast::XgemmTestValidArguments<float2>, clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<float2>, clblast::XgemmSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XgemmGetTunerDefaults, clblast::XgemmGetTunerSettings<double2>, clblast::XgemmTestValidArguments<double2>, clblast::XgemmSetConstraints, clblast::XgemmComputeLocalMemSize<double2>, clblast::XgemmSetArguments<double2>); break; } } diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp index 66e197e1..5f191ba9 100644 --- a/src/tuning/kernels/xgemm.hpp +++ b/src/tuning/kernels/xgemm.hpp @@ -145,6 +145,15 @@ std::vector<Constraint> XgemmSetConstraints(const int V) { } return constraints; } +template <typename T> +LocalMemSizeInfo XgemmComputeLocalMemSize(const int) { + return { + [] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * ((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5])); + }, + {"SA", "KWG", "MWG", "SB", "KWG", "NWG"} + }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp index 7298a6c3..73c2217c 100644 --- a/src/tuning/kernels/xgemm_direct.cpp +++ b/src/tuning/kernels/xgemm_direct.cpp @@ -23,11 +23,11 @@ template <int V> void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<half>, clblast::XgemmDirectTestValidArguments<half>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<float>, clblast::XgemmDirectTestValidArguments<float>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<double>, clblast::XgemmDirectTestValidArguments<double>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<float2>, clblast::XgemmDirectTestValidArguments<float2>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<double2>, clblast::XgemmDirectTestValidArguments<double2>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<half>, clblast::XgemmDirectTestValidArguments<half>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<half>, clblast::XgemmDirectSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<float>, clblast::XgemmDirectTestValidArguments<float>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<float>, clblast::XgemmDirectSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<double>, clblast::XgemmDirectTestValidArguments<double>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<double>, clblast::XgemmDirectSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<float2>, clblast::XgemmDirectTestValidArguments<float2>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<float2>, clblast::XgemmDirectSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XgemmDirectGetTunerDefaults, clblast::XgemmDirectGetTunerSettings<double2>, clblast::XgemmDirectTestValidArguments<double2>, clblast::XgemmDirectSetConstraints, clblast::XgemmDirectComputeLocalMemSize<double2>, clblast::XgemmDirectSetArguments<double2>); break; } } diff --git a/src/tuning/kernels/xgemm_direct.hpp b/src/tuning/kernels/xgemm_direct.hpp index ecb10bc6..baa063c0 100644 --- a/src/tuning/kernels/xgemm_direct.hpp +++ b/src/tuning/kernels/xgemm_direct.hpp @@ -135,6 +135,15 @@ std::vector<Constraint> XgemmDirectSetConstraints(const int V) { } return constraints; } +template <typename T> +LocalMemSizeInfo XgemmDirectComputeLocalMemSize(const int) { + return { + [] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))); + }, + {"WGD", "PADA", "PADB"} + }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 9e45d73f..6505a081 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -23,11 +23,11 @@ template <int V> void StartVariation(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<half>, clblast::XgemvTestValidArguments<half>, clblast::XgemvSetConstraints, clblast::XgemvSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<float>, clblast::XgemvTestValidArguments<float>, clblast::XgemvSetConstraints, clblast::XgemvSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<double>, clblast::XgemvTestValidArguments<double>, clblast::XgemvSetConstraints, clblast::XgemvSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<float2>, clblast::XgemvTestValidArguments<float2>, clblast::XgemvSetConstraints, clblast::XgemvSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<double2>, clblast::XgemvTestValidArguments<double2>, clblast::XgemvSetConstraints, clblast::XgemvSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<half>, clblast::XgemvTestValidArguments<half>, clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<half>, clblast::XgemvSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<float>, clblast::XgemvTestValidArguments<float>, clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<float>, clblast::XgemvSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<double>, clblast::XgemvTestValidArguments<double>, clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<double>, clblast::XgemvSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<float2>, clblast::XgemvTestValidArguments<float2>, clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<float2>, clblast::XgemvSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, V, clblast::XgemvGetTunerDefaults, clblast::XgemvGetTunerSettings<double2>, clblast::XgemvTestValidArguments<double2>, clblast::XgemvSetConstraints, clblast::XgemvComputeLocalMemSize<double2>, clblast::XgemvSetArguments<double2>); break; } } diff --git a/src/tuning/kernels/xgemv.hpp b/src/tuning/kernels/xgemv.hpp index e44efe32..c582816e 100644 --- a/src/tuning/kernels/xgemv.hpp +++ b/src/tuning/kernels/xgemv.hpp @@ -109,6 +109,23 @@ std::vector<Constraint> XgemvSetConstraints(const int V) { } return constraints; } +template <typename T> +LocalMemSizeInfo XgemvComputeLocalMemSize(const int V) { + if (V == 1 || V == 2) { + return { + [V] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * v[0]; + }, + {"WGS" + std::to_string(V)} + }; + } + return { + [V] (std::vector<size_t> v) -> size_t { + return GetBytes(PrecisionValue<T>()) * (v[0] + v[1] * v[2]); + }, + {"WGS3", "WPT3", "WGS3"} + }; +} // Sets the kernel's arguments template <typename T> diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index 6dfc9ffa..e4c9fc03 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -22,11 +22,11 @@ using double2 = clblast::double2; int main(int argc, char *argv[]) { const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); switch(clblast::GetPrecision(command_line_args)) { - case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<half>, clblast::XgerTestValidArguments<half>, clblast::XgerSetConstraints, clblast::XgerSetArguments<half>); break; - case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<float>, clblast::XgerTestValidArguments<float>, clblast::XgerSetConstraints, clblast::XgerSetArguments<float>); break; - case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<double>, clblast::XgerTestValidArguments<double>, clblast::XgerSetConstraints, clblast::XgerSetArguments<double>); break; - case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<float2>, clblast::XgerTestValidArguments<float2>, clblast::XgerSetConstraints, clblast::XgerSetArguments<float2>); break; - case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<double2>, clblast::XgerTestValidArguments<double2>, clblast::XgerSetConstraints, clblast::XgerSetArguments<double2>); break; + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<half>, clblast::XgerTestValidArguments<half>, clblast::XgerSetConstraints, clblast::XgerComputeLocalMemSize<half>, clblast::XgerSetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<float>, clblast::XgerTestValidArguments<float>, clblast::XgerSetConstraints, clblast::XgerComputeLocalMemSize<float>, clblast::XgerSetArguments<float>); break; + case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<double>, clblast::XgerTestValidArguments<double>, clblast::XgerSetConstraints, clblast::XgerComputeLocalMemSize<double>, clblast::XgerSetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<float2>, clblast::XgerTestValidArguments<float2>, clblast::XgerSetConstraints, clblast::XgerComputeLocalMemSize<float2>, clblast::XgerSetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::XgerGetTunerDefaults, clblast::XgerGetTunerSettings<double2>, clblast::XgerTestValidArguments<double2>, clblast::XgerSetConstraints, clblast::XgerComputeLocalMemSize<double2>, clblast::XgerSetArguments<double2>); break; } return 0; } diff --git a/src/tuning/kernels/xger.hpp b/src/tuning/kernels/xger.hpp index afd2f36e..0473572d 100644 --- a/src/tuning/kernels/xger.hpp +++ b/src/tuning/kernels/xger.hpp @@ -79,6 +79,10 @@ TunerSettings XgerGetTunerSettings(const int, const Arguments<T> &args) { template <typename T> void XgerTestValidArguments(const int, const Arguments<T> &) { } std::vector<Constraint> XgerSetConstraints(const int) { return {}; } +template <typename T> +LocalMemSizeInfo XgerComputeLocalMemSize(const int) { + return { [] (std::vector<size_t>) -> size_t { return 0; }, {} }; +} // Sets the kernel's arguments template <typename T> |