From 8b658271dd38f1aaffbe94be8978cf5cea8ec7de Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Wed, 16 Nov 2022 00:20:04 +0100 Subject: Output matching from hera.wasserstein_distance --- src/python/gudhi/bottleneck.cc | 7 +- src/python/gudhi/hera/bottleneck.cc | 9 +- src/python/gudhi/hera/wasserstein.cc | 127 +++++++++++++++++++++++---- src/python/include/pybind11_diagram_utils.h | 25 +++--- src/python/test/test_wasserstein_distance.py | 13 +-- 5 files changed, 142 insertions(+), 39 deletions(-) diff --git a/src/python/gudhi/bottleneck.cc b/src/python/gudhi/bottleneck.cc index 8a3d669a..3a0fe473 100644 --- a/src/python/gudhi/bottleneck.cc +++ b/src/python/gudhi/bottleneck.cc @@ -12,6 +12,9 @@ #include +// Indices are added internally in bottleneck_distance, they are not needed in the input. +static auto make_point(double x, double y, py::ssize_t) { return std::pair(x, y); }; + // For compatibility with older versions, we want to support e=None. // In C++17, the recommended way is std::optional. double bottleneck(Dgm d1, Dgm d2, py::object epsilon) @@ -19,8 +22,8 @@ double bottleneck(Dgm d1, Dgm d2, py::object epsilon) double e = (std::numeric_limits::min)(); if (!epsilon.is_none()) e = epsilon.cast(); // I *think* the call to request() has to be before releasing the GIL. - auto diag1 = numpy_to_range_of_pairs(d1); - auto diag2 = numpy_to_range_of_pairs(d2); + auto diag1 = numpy_to_range_of_pairs(d1, make_point); + auto diag2 = numpy_to_range_of_pairs(d2, make_point); py::gil_scoped_release release; diff --git a/src/python/gudhi/hera/bottleneck.cc b/src/python/gudhi/hera/bottleneck.cc index ec461f7c..6b919b02 100644 --- a/src/python/gudhi/hera/bottleneck.cc +++ b/src/python/gudhi/hera/bottleneck.cc @@ -16,13 +16,16 @@ using py::ssize_t; #endif -#include // Hera +#include + +// Indices are added internally in bottleneck_distance, they are not needed in the input. +static auto make_point(double x, double y, py::ssize_t) { return std::pair(x, y); }; double bottleneck_distance(Dgm d1, Dgm d2, double delta) { // I *think* the call to request() has to be before releasing the GIL. - auto diag1 = numpy_to_range_of_pairs(d1); - auto diag2 = numpy_to_range_of_pairs(d2); + auto diag1 = numpy_to_range_of_pairs(d1, make_point); + auto diag2 = numpy_to_range_of_pairs(d2, make_point); py::gil_scoped_release release; diff --git a/src/python/gudhi/hera/wasserstein.cc b/src/python/gudhi/hera/wasserstein.cc index 3516352e..8c731530 100644 --- a/src/python/gudhi/hera/wasserstein.cc +++ b/src/python/gudhi/hera/wasserstein.cc @@ -16,27 +16,118 @@ using py::ssize_t; #endif -#include // Hera +#include +#include -double wasserstein_distance( +// Unlike bottleneck, for wasserstein, we need to add the index ourselves (if we want the matching) +static auto make_hera_point(double x, double y, py::ssize_t i) { return hera::DiagramPoint(x, y, i); }; + +py::object wasserstein_distance( Dgm d1, Dgm d2, double wasserstein_power, double internal_p, - double delta) + double delta, bool return_matching) { // I *think* the call to request() has to be before releasing the GIL. - auto diag1 = numpy_to_range_of_pairs(d1); - auto diag2 = numpy_to_range_of_pairs(d2); - - py::gil_scoped_release release; - - hera::AuctionParams params; - params.wasserstein_power = wasserstein_power; - // hera encodes infinity as -1... - if(std::isinf(internal_p)) internal_p = hera::get_infinity(); - params.internal_p = internal_p; - params.delta = delta; - // The extra parameters are purposely not exposed for now. - return hera::wasserstein_dist(diag1, diag2, params); + auto diag1 = numpy_to_range_of_pairs(d1, make_hera_point); + auto diag2 = numpy_to_range_of_pairs(d2, make_hera_point); + int n1 = boost::size(diag1); + int n2 = boost::size(diag2); + hera::AuctionResult res; + double dist; + + { // No Python allowed in this section + py::gil_scoped_release release; + + hera::AuctionParams params; + params.wasserstein_power = wasserstein_power; + // hera encodes infinity as -1... + if(std::isinf(internal_p)) internal_p = hera::get_infinity(); + params.internal_p = internal_p; + params.delta = delta; + if(return_matching) { + params.return_matching = true; + params.match_inf_points = true; + } + // The extra parameters are purposely not exposed for now. + res = hera::wasserstein_cost_detailed(diag1, diag2, params); + dist = std::pow(res.cost, 1./params.wasserstein_power); + } + + if(!return_matching) + return py::cast(dist); + + if(dist == std::numeric_limits::infinity()) + return py::make_tuple(dist, py::none()); + + // bug in Hera, matching_a_to_b_ is empty if one diagram is empty or both diagrams contain the same points + if(res.matching_a_to_b_.size() == 0) { + if(n1 == 0) { // diag1 is empty + py::array_t matching({{ n2, 2 }}, nullptr); + auto m = matching.mutable_unchecked(); + for(int j=0; j matching({{ n1, 2 }}, nullptr); + auto m = matching.mutable_unchecked(); + for(int i=0; i matching({{ n1, 2 }}, nullptr); + auto m = matching.mutable_unchecked(); + for(int i=0; i matching({{ n1 + n2, 2 }}, nullptr); + auto m = matching.mutable_unchecked(); + int cur = 0; + for(auto x : res.matching_a_to_b_){ + if(x.first < 0) { + if(x.second < 0) { + } else { + m(cur, 0) = -1; + m(cur, 1) = x.second; + ++cur; + } + } else { + if(x.second < 0) { + m(cur, 0) = x.first; + m(cur, 1) = -1; + ++cur; + } else { + m(cur, 0) = x.first; + m(cur, 1) = x.second; + ++cur; + } + } + } + // n1+n2 was too much, it only happens if everything matches to the diagonal, so we return matching[:cur,:] + py::array_t ret({{ cur, 2 }}, {{ matching.strides()[0], matching.strides()[1] }}, matching.data(), matching); + return py::make_tuple(dist, ret); } PYBIND11_MODULE(wasserstein, m) { @@ -45,6 +136,7 @@ PYBIND11_MODULE(wasserstein, m) { py::arg("order") = 1, py::arg("internal_p") = std::numeric_limits::infinity(), py::arg("delta") = .01, + py::arg("matching") = false, R"pbdoc( Compute the Wasserstein distance between two diagrams. Points at infinity are supported. @@ -55,8 +147,9 @@ PYBIND11_MODULE(wasserstein, m) { order (float): Wasserstein exponent W_q internal_p (float): Internal Minkowski norm L^p in R^2 delta (float): Relative error 1+delta + matching (bool): if ``True``, computes and returns the optimal matching between X and Y, encoded as a (n x 2) np.array [...[i,j]...], meaning the i-th point in X is matched to the j-th point in Y, with the convention that (-1) represents the diagonal. Returns: - float: Approximate Wasserstein distance W_q(X,Y) + float|Tuple[float,numpy.array]: Approximate Wasserstein distance W_q(X,Y), and optionally the corresponding matching )pbdoc"); } diff --git a/src/python/include/pybind11_diagram_utils.h b/src/python/include/pybind11_diagram_utils.h index 2d5194f4..5cb7c48b 100644 --- a/src/python/include/pybind11_diagram_utils.h +++ b/src/python/include/pybind11_diagram_utils.h @@ -17,16 +17,9 @@ namespace py = pybind11; typedef py::array_t Dgm; -// Get m[i,0] and m[i,1] as a pair -static auto pairify(void* p, py::ssize_t h, py::ssize_t w) { - return [=](py::ssize_t i){ - char* birth = (char*)p + i * h; - char* death = birth + w; - return std::make_pair(*(double*)birth, *(double*)death); - }; -} - -inline auto numpy_to_range_of_pairs(py::array_t dgm) { +// build_point(double birth, double death, ssize_t index) -> Point +template +inline auto numpy_to_range_of_pairs(py::array_t dgm, BuildPoint build_point) { py::buffer_info buf = dgm.request(); // shape (n,2) or (0) for empty if((buf.ndim!=2 || buf.shape[1]!=2) && (buf.ndim!=1 || buf.shape[0]!=0)) @@ -34,6 +27,16 @@ inline auto numpy_to_range_of_pairs(py::array_t dgm) { // In the case of shape (0), avoid reading non-existing strides[1] even if we won't use it. py::ssize_t stride1 = buf.ndim == 2 ? buf.strides[1] : 0; auto cnt = boost::counting_range(0, buf.shape[0]); - return boost::adaptors::transform(cnt, pairify(buf.ptr, buf.strides[0], stride1)); + + char* p = static_cast(buf.ptr); + auto h = buf.strides[0]; + auto w = stride1; + // Get m[i,0] and m[i,1] as a pair + auto pairify = [=](py::ssize_t i){ + char* birth = p + i * h; + char* death = birth + w; + return build_point(*(double*)birth, *(double*)death, i); + }; + return boost::adaptors::transform(cnt, pairify); // Be careful that the returned range cannot contain references to dead temporaries. } diff --git a/src/python/test/test_wasserstein_distance.py b/src/python/test/test_wasserstein_distance.py index 3a004d77..1cac3e1a 100755 --- a/src/python/test/test_wasserstein_distance.py +++ b/src/python/test/test_wasserstein_distance.py @@ -140,9 +140,10 @@ def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_mat if test_matching: match = wasserstein_distance(emptydiag, emptydiag, matching=True, internal_p=1., order=2)[1] - assert np.array_equal(match, []) + # Accept [] or np.array of shape (2, 0) + assert len(match) == 0 match = wasserstein_distance(emptydiag, emptydiag, matching=True, internal_p=np.inf, order=2.24)[1] - assert np.array_equal(match, []) + assert len(match) == 0 match = wasserstein_distance(emptydiag, diag2, matching=True, internal_p=np.inf, order=2.)[1] assert np.array_equal(match , [[-1, 0], [-1, 1]]) match = wasserstein_distance(diag2, emptydiag, matching=True, internal_p=np.inf, order=2.24)[1] @@ -171,10 +172,10 @@ def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_mat assert (match is None) cost, match = wasserstein_distance(diag9, diag10, matching=True, internal_p=1., order=1.) assert (cost == 1) - assert (match == [[0, -1],[1, -1],[-1, 0], [-1, 1], [-1, 2]]) # type 4 and 5 are match to the diag anyway. + assert {(i,j) for i,j in match} == {(0, -1),(1, -1),(-1, 0), (-1, 1), (-1, 2)} # type 4 and 5 are match to the diag anyway. cost, match = wasserstein_distance(diag9, emptydiag, matching=True, internal_p=2., order=2.) assert (cost == 0.) - assert (match == [[0, -1], [1, -1]]) + assert np.array_equal(match, [[0, -1], [1, -1]]) def hera_wrap(**extra): @@ -195,6 +196,6 @@ def test_wasserstein_distance_pot(): def test_wasserstein_distance_hera(): - _basic_wasserstein(hera_wrap(delta=1e-12), 1e-12, test_matching=False) - _basic_wasserstein(hera_wrap(delta=.1), .1, test_matching=False) + _basic_wasserstein(hera_wrap(delta=1e-12), 1e-12, test_matching=True) + _basic_wasserstein(hera_wrap(delta=.1), .1, test_matching=True) -- cgit v1.2.3 From 03e20909d4219d177b512b5f798ae5a5552ae17d Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Wed, 16 Nov 2022 15:18:17 +0100 Subject: Make the test more resilient --- src/python/test/test_wasserstein_distance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/test/test_wasserstein_distance.py b/src/python/test/test_wasserstein_distance.py index 1cac3e1a..8700107b 100755 --- a/src/python/test/test_wasserstein_distance.py +++ b/src/python/test/test_wasserstein_distance.py @@ -149,7 +149,7 @@ def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_mat match = wasserstein_distance(diag2, emptydiag, matching=True, internal_p=np.inf, order=2.24)[1] assert np.array_equal(match , [[0, -1], [1, -1]]) match = wasserstein_distance(diag1, diag2, matching=True, internal_p=2., order=2.)[1] - assert np.array_equal(match, [[0, 0], [1, 1], [2, -1]]) + assert {(i,j) for i,j in match} == {(0, 0), (1, 1), (2, -1)} if test_matching and test_infinity: diag7 = np.array([[0, 3], [4, np.inf], [5, np.inf]]) -- cgit v1.2.3 From 50a772551a29fe0178ff05e3390754ac89cf5826 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Wed, 16 Nov 2022 16:32:08 +0100 Subject: Tests more resilient to arbitrary order --- src/python/test/test_wasserstein_distance.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/python/test/test_wasserstein_distance.py b/src/python/test/test_wasserstein_distance.py index 8700107b..72514099 100755 --- a/src/python/test/test_wasserstein_distance.py +++ b/src/python/test/test_wasserstein_distance.py @@ -96,6 +96,13 @@ def test_warn_infty(): assert (m is None) +def _to_set(X): + return { (i, j) for i, j in X } + +def _same_permuted(X, Y): + return _to_set(X) == _to_set(Y) + + def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_matching=True): diag1 = np.array([[2.7, 3.7], [9.6, 14.0], [34.2, 34.974]]) diag2 = np.array([[2.8, 4.45], [9.5, 14.1]]) @@ -145,11 +152,11 @@ def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_mat match = wasserstein_distance(emptydiag, emptydiag, matching=True, internal_p=np.inf, order=2.24)[1] assert len(match) == 0 match = wasserstein_distance(emptydiag, diag2, matching=True, internal_p=np.inf, order=2.)[1] - assert np.array_equal(match , [[-1, 0], [-1, 1]]) + assert _same_permuted(match, [[-1, 0], [-1, 1]]) match = wasserstein_distance(diag2, emptydiag, matching=True, internal_p=np.inf, order=2.24)[1] - assert np.array_equal(match , [[0, -1], [1, -1]]) + assert _same_permuted(match, [[0, -1], [1, -1]]) match = wasserstein_distance(diag1, diag2, matching=True, internal_p=2., order=2.)[1] - assert {(i,j) for i,j in match} == {(0, 0), (1, 1), (2, -1)} + assert _same_permuted(match, [[0, 0], [1, 1], [2, -1]]) if test_matching and test_infinity: diag7 = np.array([[0, 3], [4, np.inf], [5, np.inf]]) @@ -158,7 +165,7 @@ def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_mat diag10 = np.array([[0,1], [-np.inf, -np.inf], [np.inf, np.inf]]) match = wasserstein_distance(diag5, diag6, matching=True, internal_p=2., order=2.)[1] - assert np.array_equal(match, [[0, -1], [-1,0], [-1, 1], [1, 2]]) + assert _same_permuted(match, [[0, -1], [-1,0], [-1, 1], [1, 2]]) match = wasserstein_distance(diag5, diag7, matching=True, internal_p=2., order=2.)[1] assert (match is None) cost, match = wasserstein_distance(diag7, emptydiag, matching=True, internal_p=2., order=2.3) @@ -172,10 +179,10 @@ def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_mat assert (match is None) cost, match = wasserstein_distance(diag9, diag10, matching=True, internal_p=1., order=1.) assert (cost == 1) - assert {(i,j) for i,j in match} == {(0, -1),(1, -1),(-1, 0), (-1, 1), (-1, 2)} # type 4 and 5 are match to the diag anyway. + assert _same_permuted(match, [[0, -1],[1, -1],[-1, 0], [-1, 1], [-1, 2]]) # type 4 and 5 are match to the diag anyway. cost, match = wasserstein_distance(diag9, emptydiag, matching=True, internal_p=2., order=2.) assert (cost == 0.) - assert np.array_equal(match, [[0, -1], [1, -1]]) + assert _same_permuted(match, [[0, -1], [1, -1]]) def hera_wrap(**extra): -- cgit v1.2.3 From 8226f645e01c11ccfc7919327bb281b6f805bfa4 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 10 Dec 2022 20:40:24 +0100 Subject: Use std::optional for possibly None argument. --- src/python/gudhi/bottleneck.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/python/gudhi/bottleneck.cc b/src/python/gudhi/bottleneck.cc index 3a0fe473..1cf3283c 100644 --- a/src/python/gudhi/bottleneck.cc +++ b/src/python/gudhi/bottleneck.cc @@ -9,18 +9,17 @@ */ #include - +#include #include +#include // Indices are added internally in bottleneck_distance, they are not needed in the input. static auto make_point(double x, double y, py::ssize_t) { return std::pair(x, y); }; // For compatibility with older versions, we want to support e=None. -// In C++17, the recommended way is std::optional. -double bottleneck(Dgm d1, Dgm d2, py::object epsilon) +double bottleneck(Dgm d1, Dgm d2, std::optional epsilon) { - double e = (std::numeric_limits::min)(); - if (!epsilon.is_none()) e = epsilon.cast(); + double e = epsilon.value_or((std::numeric_limits::min)()); // I *think* the call to request() has to be before releasing the GIL. auto diag1 = numpy_to_range_of_pairs(d1, make_point); auto diag2 = numpy_to_range_of_pairs(d2, make_point); -- cgit v1.2.3 From 33b63b9d364bdb2d470a3eac878fe48e3a4e7da6 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 10 Dec 2022 21:05:25 +0100 Subject: Clarify comment --- src/python/gudhi/bottleneck.cc | 2 +- src/python/gudhi/hera/bottleneck.cc | 2 +- src/python/gudhi/hera/wasserstein.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/gudhi/bottleneck.cc b/src/python/gudhi/bottleneck.cc index 1cf3283c..040e6d37 100644 --- a/src/python/gudhi/bottleneck.cc +++ b/src/python/gudhi/bottleneck.cc @@ -20,7 +20,7 @@ static auto make_point(double x, double y, py::ssize_t) { return std::pair(x, y) double bottleneck(Dgm d1, Dgm d2, std::optional epsilon) { double e = epsilon.value_or((std::numeric_limits::min)()); - // I *think* the call to request() has to be before releasing the GIL. + // I *think* the call to request() in numpy_to_range_of_pairs has to be before releasing the GIL. auto diag1 = numpy_to_range_of_pairs(d1, make_point); auto diag2 = numpy_to_range_of_pairs(d2, make_point); diff --git a/src/python/gudhi/hera/bottleneck.cc b/src/python/gudhi/hera/bottleneck.cc index 6b919b02..9826252c 100644 --- a/src/python/gudhi/hera/bottleneck.cc +++ b/src/python/gudhi/hera/bottleneck.cc @@ -23,7 +23,7 @@ static auto make_point(double x, double y, py::ssize_t) { return std::pair(x, y) double bottleneck_distance(Dgm d1, Dgm d2, double delta) { - // I *think* the call to request() has to be before releasing the GIL. + // I *think* the call to request() in numpy_to_range_of_pairs has to be before releasing the GIL. auto diag1 = numpy_to_range_of_pairs(d1, make_point); auto diag2 = numpy_to_range_of_pairs(d2, make_point); diff --git a/src/python/gudhi/hera/wasserstein.cc b/src/python/gudhi/hera/wasserstein.cc index 8c731530..77c1413a 100644 --- a/src/python/gudhi/hera/wasserstein.cc +++ b/src/python/gudhi/hera/wasserstein.cc @@ -27,7 +27,7 @@ py::object wasserstein_distance( double wasserstein_power, double internal_p, double delta, bool return_matching) { - // I *think* the call to request() has to be before releasing the GIL. + // I *think* the call to request() in numpy_to_range_of_pairs has to be before releasing the GIL. auto diag1 = numpy_to_range_of_pairs(d1, make_hera_point); auto diag2 = numpy_to_range_of_pairs(d2, make_hera_point); int n1 = boost::size(diag1); -- cgit v1.2.3 From eb6c94ea1125bf216ec5f07b2936dd115e461aa4 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 10 Dec 2022 21:21:57 +0100 Subject: Document None as matching in the infinite distance case --- src/python/gudhi/hera/wasserstein.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/gudhi/hera/wasserstein.cc b/src/python/gudhi/hera/wasserstein.cc index 77c1413a..41e84f7b 100644 --- a/src/python/gudhi/hera/wasserstein.cc +++ b/src/python/gudhi/hera/wasserstein.cc @@ -147,9 +147,9 @@ PYBIND11_MODULE(wasserstein, m) { order (float): Wasserstein exponent W_q internal_p (float): Internal Minkowski norm L^p in R^2 delta (float): Relative error 1+delta - matching (bool): if ``True``, computes and returns the optimal matching between X and Y, encoded as a (n x 2) np.array [...[i,j]...], meaning the i-th point in X is matched to the j-th point in Y, with the convention that (-1) represents the diagonal. + matching (bool): if ``True``, computes and returns the optimal matching between X and Y, encoded as a (n x 2) np.array [...[i,j]...], meaning the i-th point in X is matched to the j-th point in Y, with the convention that (-1) represents the diagonal. If the distance between two diagrams is +inf (which happens if the cardinalities of essential parts differ) and the matching is requested, it will be set to ``None`` (any matching is optimal). Returns: - float|Tuple[float,numpy.array]: Approximate Wasserstein distance W_q(X,Y), and optionally the corresponding matching + float|Tuple[float,numpy.array|None]: Approximate Wasserstein distance W_q(X,Y), and optionally the corresponding matching )pbdoc"); } -- cgit v1.2.3