summaryrefslogtreecommitdiff
path: root/src/python/gudhi
diff options
context:
space:
mode:
authorThéo Lacombe <lacombe1993@gmail.com>2020-06-29 10:24:44 +0200
committerGitHub <noreply@github.com>2020-06-29 10:24:44 +0200
commit0b4de61a18bc30f66a7fb45cc246cff2f55ba1a1 (patch)
tree47527fd4d63632f3c39a6f2660ec141417f093b6 /src/python/gudhi
parent6c65d29acc3b03d21beca653834340787bf0c65e (diff)
parentcec4a5d7df6d5ed43511e94f9db580489979105a (diff)
Merge branch 'master' into fix342
Diffstat (limited to 'src/python/gudhi')
-rw-r--r--src/python/gudhi/alpha_complex.pyx49
-rw-r--r--src/python/gudhi/bottleneck.cc15
-rw-r--r--src/python/gudhi/clustering/__init__.py0
-rw-r--r--src/python/gudhi/clustering/_tomato.cc277
-rw-r--r--src/python/gudhi/clustering/tomato.py321
-rw-r--r--src/python/gudhi/dtm_rips_complex.py51
-rw-r--r--src/python/gudhi/hera/__init__.py7
-rw-r--r--src/python/gudhi/hera/bottleneck.cc54
-rw-r--r--src/python/gudhi/hera/wasserstein.cc (renamed from src/python/gudhi/hera.cc)2
-rw-r--r--src/python/gudhi/representations/kernel_methods.py41
-rw-r--r--src/python/gudhi/representations/metrics.py57
11 files changed, 816 insertions, 58 deletions
diff --git a/src/python/gudhi/alpha_complex.pyx b/src/python/gudhi/alpha_complex.pyx
index d75e374a..a356384d 100644
--- a/src/python/gudhi/alpha_complex.pyx
+++ b/src/python/gudhi/alpha_complex.pyx
@@ -27,11 +27,11 @@ __license__ = "GPL v3"
cdef extern from "Alpha_complex_interface.h" namespace "Gudhi":
cdef cppclass Alpha_complex_interface "Gudhi::alpha_complex::Alpha_complex_interface":
- Alpha_complex_interface(vector[vector[double]] points) nogil except +
+ Alpha_complex_interface(vector[vector[double]] points, bool fast_version) nogil except +
# bool from_file is a workaround for cython to find the correct signature
- Alpha_complex_interface(string off_file, bool from_file) nogil except +
+ Alpha_complex_interface(string off_file, bool fast_version, bool from_file) nogil except +
vector[double] get_point(int vertex) nogil except +
- void create_simplex_tree(Simplex_tree_interface_full_featured* simplex_tree, double max_alpha_square) nogil except +
+ void create_simplex_tree(Simplex_tree_interface_full_featured* simplex_tree, double max_alpha_square, bool exact_version, bool default_filtration_value) nogil except +
# AlphaComplex python interface
cdef class AlphaComplex:
@@ -53,10 +53,11 @@ cdef class AlphaComplex:
"""
- cdef Alpha_complex_interface * thisptr
+ cdef Alpha_complex_interface * this_ptr
+ cdef bool exact
# Fake constructor that does nothing but documenting the constructor
- def __init__(self, points=None, off_file=''):
+ def __init__(self, points=None, off_file='', precision='safe'):
"""AlphaComplex constructor.
:param points: A list of points in d-Dimension.
@@ -66,15 +67,21 @@ cdef class AlphaComplex:
:param off_file: An OFF file style name.
:type off_file: string
+
+ :param precision: Alpha complex precision can be 'fast', 'safe' or 'exact'. Default is 'safe'.
+ :type precision: string
"""
# The real cython constructor
- def __cinit__(self, points = None, off_file = ''):
+ def __cinit__(self, points = None, off_file = '', precision = 'safe'):
+ assert precision in ['fast', 'safe', 'exact'], "Alpha complex precision can only be 'fast', 'safe' or 'exact'"
+ cdef bool fast = precision == 'fast'
+ self.exact = precision == 'exact'
+
cdef vector[vector[double]] pts
if off_file:
if os.path.isfile(off_file):
- self.thisptr = new Alpha_complex_interface(
- off_file.encode('utf-8'), True)
+ self.this_ptr = new Alpha_complex_interface(off_file.encode('utf-8'), fast, True)
else:
print("file " + off_file + " not found.")
else:
@@ -83,17 +90,16 @@ cdef class AlphaComplex:
points=[]
pts = points
with nogil:
- self.thisptr = new Alpha_complex_interface(pts)
-
+ self.this_ptr = new Alpha_complex_interface(pts, fast)
def __dealloc__(self):
- if self.thisptr != NULL:
- del self.thisptr
+ if self.this_ptr != NULL:
+ del self.this_ptr
def __is_defined(self):
"""Returns true if AlphaComplex pointer is not NULL.
"""
- return self.thisptr != NULL
+ return self.this_ptr != NULL
def get_point(self, vertex):
"""This function returns the point corresponding to a given vertex.
@@ -103,21 +109,24 @@ cdef class AlphaComplex:
:rtype: list of float
:returns: the point.
"""
- return self.thisptr.get_point(vertex)
+ return self.this_ptr.get_point(vertex)
- def create_simplex_tree(self, max_alpha_square = float('inf')):
+ def create_simplex_tree(self, max_alpha_square = float('inf'), default_filtration_value = False):
"""
- :param max_alpha_square: The maximum alpha square threshold the
- simplices shall not exceed. Default is set to infinity, and
- there is very little point using anything else since it does
- not save time.
+ :param max_alpha_square: The maximum alpha square threshold the simplices shall not exceed. Default is set to
+ infinity, and there is very little point using anything else since it does not save time.
:type max_alpha_square: float
+ :param default_filtration_value: Set this value to `True` if filtration values are not needed to be computed
+ (will be set to `NaN`). Default value is `False` (which means compute the filtration values).
+ :type default_filtration_value: bool
:returns: A simplex tree created from the Delaunay Triangulation.
:rtype: SimplexTree
"""
stree = SimplexTree()
cdef double mas = max_alpha_square
cdef intptr_t stree_int_ptr=stree.thisptr
+ cdef bool compute_filtration = default_filtration_value == True
with nogil:
- self.thisptr.create_simplex_tree(<Simplex_tree_interface_full_featured*>stree_int_ptr, mas)
+ self.this_ptr.create_simplex_tree(<Simplex_tree_interface_full_featured*>stree_int_ptr,
+ mas, self.exact, compute_filtration)
return stree
diff --git a/src/python/gudhi/bottleneck.cc b/src/python/gudhi/bottleneck.cc
index 732cb9a8..8a3d669a 100644
--- a/src/python/gudhi/bottleneck.cc
+++ b/src/python/gudhi/bottleneck.cc
@@ -12,24 +12,29 @@
#include <pybind11_diagram_utils.h>
-double bottleneck(Dgm d1, Dgm d2, double epsilon)
+// For compatibility with older versions, we want to support e=None.
+// In C++17, the recommended way is std::optional<double>.
+double bottleneck(Dgm d1, Dgm d2, py::object epsilon)
{
+ double e = (std::numeric_limits<double>::min)();
+ if (!epsilon.is_none()) e = epsilon.cast<double>();
// I *think* the call to request() has to be before releasing the GIL.
auto diag1 = numpy_to_range_of_pairs(d1);
auto diag2 = numpy_to_range_of_pairs(d2);
py::gil_scoped_release release;
- return Gudhi::persistence_diagram::bottleneck_distance(diag1, diag2, epsilon);
+ return Gudhi::persistence_diagram::bottleneck_distance(diag1, diag2, e);
}
PYBIND11_MODULE(bottleneck, m) {
m.attr("__license__") = "GPL v3";
m.def("bottleneck_distance", &bottleneck,
py::arg("diagram_1"), py::arg("diagram_2"),
- py::arg("e") = (std::numeric_limits<double>::min)(),
+ py::arg("e") = py::none(),
R"pbdoc(
- This function returns the point corresponding to a given vertex.
+ Compute the Bottleneck distance between two diagrams.
+ Points at infinity and on the diagonal are supported.
:param diagram_1: The first diagram.
:type diagram_1: numpy array of shape (m,2)
@@ -42,7 +47,7 @@ PYBIND11_MODULE(bottleneck, m) {
bits of the mantissa may be wrong). This version of the algorithm takes
advantage of the limited precision of `double` and is usually a lot
faster to compute, whatever the value of `e`.
- Thus, by default, `e` is the smallest positive double.
+ Thus, by default (`e=None`), `e` is the smallest positive double.
:type e: float
:rtype: float
:returns: the bottleneck distance.
diff --git a/src/python/gudhi/clustering/__init__.py b/src/python/gudhi/clustering/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/python/gudhi/clustering/__init__.py
diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
new file mode 100644
index 00000000..a76a2c3a
--- /dev/null
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -0,0 +1,277 @@
+/* This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+ * See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+ * Author(s): Marc Glisse
+ *
+ * Copyright (C) 2020 Inria
+ *
+ * Modification(s):
+ * - YYYY/MM Author: Description of the modification
+ */
+
+#include <boost/container/flat_map.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/property_map/property_map.hpp>
+#include <boost/property_map/transform_value_property_map.hpp>
+#include <boost/property_map/vector_property_map.hpp>
+#include <boost/property_map/function_property_map.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <vector>
+#include <unordered_map>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <iostream>
+
+namespace py = pybind11;
+
+template <class T, class = std::enable_if_t<std::is_integral<T>::value>>
+int getint(int i) {
+ return i;
+}
+// Gcc-8 has a bug that breaks this version, fixed in gcc-9
+// template<class T, class=decltype(std::declval<T>().template cast<int>())>
+// int getint(T i){return i.template cast<int>();}
+template <class T>
+auto getint(T i) -> decltype(i.template cast<int>()) {
+ return i.template cast<int>();
+}
+
+// Raw clusters are clusters obtained through single-linkage, no merging.
+
+typedef int Point_index;
+typedef int Cluster_index;
+struct Merge {
+ Cluster_index first, second;
+ double persist;
+};
+
+template <class Neighbors, class Density, class Order, class ROrder>
+auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
+ ROrder const& rorder) {
+ // point index --> index of raw cluster it belongs to
+ std::vector<Cluster_index> raw_cluster;
+ raw_cluster.reserve(num_points);
+ // cluster index --> index of top point in the cluster
+ Cluster_index n_raw_clusters = 0; // current number of raw clusters seen
+ //
+ std::vector<Merge> merges;
+ struct Data {
+ Cluster_index parent;
+ int rank;
+ Point_index max;
+ }; // information on a cluster
+ std::vector<Data> ds_base;
+ // boost::vector_property_map does resize(size+1) for every new element, don't use it
+ auto ds_data =
+ boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n) -> Data& { return ds_base[n]; });
+ auto ds_parent =
+ boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_data);
+ auto ds_rank = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_data);
+ boost::disjoint_sets<decltype(ds_rank), decltype(ds_parent)> ds(
+ ds_rank, ds_parent); // on the clusters, not directly the points
+ std::vector<std::array<double, 2>> persistence; // diagram (finite points)
+ boost::container::flat_map<Cluster_index, Cluster_index>
+ adj_clusters; // first: the merged cluster, second: the raw cluster
+ // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only
+ // insert in the vector if merged is absent from the set
+
+ for (Point_index i = 0; i < num_points; ++i) {
+ // auto&& ngb = neighbors[order[i]];
+ // TODO: have a specialization where we don't need python types and py::cast
+ // TODO: move py::cast and getint into Neighbors
+ py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work
+ adj_clusters.clear();
+ Point_index j = i; // highest neighbor
+ // for(Point_index k : ngb)
+ for (auto k_py : ngb) {
+ Point_index k = rorder[getint(k_py)];
+ if (k >= i || k < 0) // ???
+ continue;
+ if (k < j) j = k;
+ Cluster_index rk = raw_cluster[k];
+ adj_clusters.emplace(ds.find_set(rk), rk);
+ // does not insert if ck=ds.find_set(rk) already seen
+ // which rk we keep from those with the same ck is arbitrary
+ }
+ assert((Point_index)raw_cluster.size() == i);
+ if (i == j) { // local maximum -> new cluster
+ Cluster_index c = n_raw_clusters++;
+ ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access
+ ds.make_set(c);
+ ds_base[c].max = i; // max
+ raw_cluster.push_back(c);
+ continue;
+ } else { // add i to the cluster of j
+ assert(j < i);
+ raw_cluster.push_back(raw_cluster[j]);
+ // FIXME: we are adding point i to the raw cluster of j, but that one might not be in adj_clusters, so we may
+ // merge clusters A and B through a point of C. It is strange, but I don't know if it can really cause problems.
+ // We could just not set j at all and use arbitrarily the first element of adj_clusters.
+ }
+ // possibly merge clusters
+ // we could sort, in case there are several merges, but that doesn't seem so useful
+ Cluster_index rj = raw_cluster[j];
+ Cluster_index cj = ds.find_set(rj);
+ Cluster_index orig_cj = cj;
+ for (auto ckk : adj_clusters) {
+ Cluster_index rk = ckk.second;
+ Cluster_index ck = ckk.first;
+ if (ck == orig_cj) continue;
+ assert(ck == ds.find_set(rk));
+ Point_index j = ds_base[cj].max;
+ Point_index k = ds_base[ck].max;
+ Point_index young = std::max(j, k);
+ Point_index old = std::min(j, k);
+ auto d_young = density[order[young]];
+ auto d_i = density[order[i]];
+ assert(d_young >= d_i);
+ // Always merge (the non-hierarchical algorithm would only conditionally merge here
+ persistence.push_back({d_young, d_i});
+ assert(ds.find_set(rj) != ds.find_set(rk));
+ ds.link(cj, ck);
+ cj = ds.find_set(cj);
+ ds_base[cj].max = old; // just one parent, no need for find_set
+ // record the raw clusters, we don't know what will have already been merged.
+ merges.push_back({rj, rk, d_young - d_i});
+ }
+ }
+ {
+ boost::counting_iterator<int> b(0), e(ds_base.size());
+ ds.compress_sets(b, e);
+ // Now we stop using find_sets and look at the parent directly
+ // rank is reused to rename clusters contiguously 0, 1, etc
+ }
+ // Maximum for each connected component
+ std::vector<double> max_cc;
+ for (Cluster_index i = 0; i < n_raw_clusters; ++i) {
+ if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]);
+ }
+ assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters);
+
+ // TODO: create a "noise" cluster, merging all those not prominent enough?
+
+ // Replay the merges, in increasing order of prominence, to build the hierarchy
+ std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; });
+ std::vector<std::array<Cluster_index, 2>> children;
+ children.reserve(merges.size());
+ {
+ struct Dat {
+ Cluster_index parent;
+ int rank;
+ Cluster_index name;
+ };
+ std::vector<Dat> ds_bas(2 * n_raw_clusters - 1);
+ Cluster_index i;
+ auto ds_dat =
+ boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+ auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+ auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+ boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
+ for (i = 0; i < n_raw_clusters; ++i) {
+ ds.make_set(i);
+ ds_bas[i].name = i;
+ }
+ for (Merge const& m : merges) {
+ Cluster_index j = ds.find_set(m.first);
+ Cluster_index k = ds.find_set(m.second);
+ assert(j != k);
+ children.push_back({ds_bas[j].name, ds_bas[k].name});
+ ds.make_set(i);
+ ds.link(i, j);
+ ds.link(i, k);
+ ds_bas[ds.find_set(i)].name = i;
+ ++i;
+ }
+ }
+
+ std::vector<Cluster_index> raw_cluster_ordered(num_points);
+ for (int i = 0; i < num_points; ++i) raw_cluster_ordered[i] = raw_cluster[rorder[i]];
+ // return raw_cluster, children, persistence
+ // TODO avoid copies: https://github.com/pybind/pybind11/issues/1042
+ return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()),
+ py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()),
+ py::array(max_cc.size(), max_cc.data()));
+}
+
+auto merge(py::array_t<Cluster_index, py::array::c_style> children, Cluster_index n_leaves, Cluster_index n_final) {
+ if (n_final > n_leaves) {
+ std::cerr << "The number of clusters required " << n_final << " is larger than the number of mini-clusters " << n_leaves << '\n';
+ n_final = n_leaves; // or return something special and let Tomato use leaf_labels_?
+ }
+ py::buffer_info cbuf = children.request();
+ if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0))
+ throw std::runtime_error("internal error: children have to be (n,2) or empty");
+ const int n_merges = cbuf.shape[0];
+ Cluster_index* d = (Cluster_index*)cbuf.ptr;
+ if (n_merges + n_final < n_leaves) {
+ std::cerr << "The number of clusters required " << n_final << " is smaller than the number of connected components " << n_leaves - n_merges << '\n';
+ n_final = n_leaves - n_merges;
+ }
+ struct Dat {
+ Cluster_index parent;
+ int rank;
+ int name;
+ };
+ std::vector<Dat> ds_bas(2 * n_leaves - 1);
+ auto ds_dat = boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+ auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+ auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+ boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
+ Cluster_index i;
+ for (i = 0; i < n_leaves; ++i) {
+ ds.make_set(i);
+ ds_bas[i].name = -1;
+ }
+ for (Cluster_index m = 0; m < n_leaves - n_final; ++m) {
+ Cluster_index j = ds.find_set(d[2 * m]);
+ Cluster_index k = ds.find_set(d[2 * m + 1]);
+ assert(j != k);
+ ds.make_set(i);
+ ds.link(i, j);
+ ds.link(i, k);
+ ++i;
+ }
+ Cluster_index next_cluster_name = 0;
+ std::vector<Cluster_index> ret;
+ ret.reserve(n_leaves);
+ for (Cluster_index j = 0; j < n_leaves; ++j) {
+ Cluster_index k = ds.find_set(j);
+ if (ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
+ ret.push_back(ds_bas[k].name);
+ }
+ return py::array(ret.size(), ret.data());
+}
+
+// TODO: Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
+// py::isinstance<py::array_t<std::int32_t>> (ou py::isinstance<py::array> et tester dtype) et flags&c_style
+// ou overload (en virant forcecast?)
+// aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ?
+auto hierarchy(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
+ // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
+ // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to []
+ // (I _think_ handle is ok and we don't need object here)
+ py::buffer_info wbuf = density.request();
+ if (wbuf.ndim != 1) throw std::runtime_error("density must be 1D");
+ const int n = wbuf.shape[0];
+ double* d = (double*)wbuf.ptr;
+ // Vector { 0, 1, ..., n-1 }
+ std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n));
+ // Permutation of the indices to get points in decreasing order of density
+ std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j) { return d[i] > d[j]; });
+ // Inverse permutation
+ std::vector<Point_index> rorder(n);
+ for (Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
+ // Used as:
+ // order[i] is the index of the point with i-th largest density
+ // rorder[i] is the rank of the i-th point in order of decreasing density
+ // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the
+ // output)
+ return tomato(n, ngb, d, order, rorder);
+}
+
+PYBIND11_MODULE(_tomato, m) {
+ m.doc() = "Internals of tomato clustering";
+ m.def("hierarchy", &hierarchy, "does the clustering");
+ m.def("merge", &merge, "merge clusters");
+}
diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
new file mode 100644
index 00000000..fbba3cc8
--- /dev/null
+++ b/src/python/gudhi/clustering/tomato.py
@@ -0,0 +1,321 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+import numpy
+from ..point_cloud.knn import KNearestNeighbors
+from ..point_cloud.dtm import DTMDensity
+from ._tomato import *
+
+# The fit/predict interface is not so well suited...
+
+
+class Tomato:
+ """
+ This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point.
+ A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural
+ to provide your own.
+
+ :Requires: `SciPy <installation.html#scipy>`_, `Scikit-learn <installation.html#scikit-learn>`_ or others
+ (see :class:`~gudhi.point_cloud.knn.KNearestNeighbors`) in function of the options.
+
+ Attributes
+ ----------
+ n_clusters_: int
+ The number of clusters. Writing to it automatically adjusts `labels_`.
+ merge_threshold_: float
+ minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts `labels_`.
+ n_leaves_: int
+ number of leaves (unstable clusters) in the hierarchical tree
+ leaf_labels_: ndarray of shape (n_samples,)
+ cluster labels for each point, at the very bottom of the hierarchy
+ labels_: ndarray of shape (n_samples,)
+ cluster labels for each point, after merging
+ diagram_: ndarray of shape (`n_leaves_`, 2)
+ persistence diagram (only the finite points)
+ max_weight_per_cc_: ndarray of shape (n_connected_components,)
+ maximum of the density function on each connected component. This corresponds to the abscissa of infinite
+ points in the diagram
+ children_: ndarray of shape (`n_leaves_`-n_connected_components, 2)
+ The children of each non-leaf node. Values less than `n_leaves_` correspond to leaves of the tree.
+ A node i greater than or equal to `n_leaves_` is a non-leaf node and has children children_[i - `n_leaves_`].
+ Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_leaves_` + i
+ weights_: ndarray of shape (n_samples,)
+ weights of the points, as computed by the density estimator or provided by the user
+ params_: dict
+ Parameters like metric, etc
+ """
+
+ def __init__(
+ self,
+ graph_type="knn",
+ density_type="logDTM",
+ n_clusters=None,
+ merge_threshold=None,
+ # eliminate_threshold=None,
+ # eliminate_threshold (float): minimum max weight of a cluster so it doesn't get eliminated
+ **params
+ ):
+ """
+ Args:
+ graph_type (str): 'manual', 'knn' or 'radius'. Default is 'knn'.
+ density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. When you have many points,
+ 'KDE' and 'logKDE' tend to be slower. Default is 'logDTM'.
+ metric (str|Callable): metric used when calculating the distance between instances in a feature array.
+ Defaults to Minkowski of parameter p.
+ kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to
+ sklearn.neighbors.KernelDensity.
+ k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10.
+ k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself).
+ Defaults to k.
+ r (float): size of a neighborhood if graph_type is 'radius'. Also used as default bandwidth in kde_params.
+ eps (float): (1+eps) approximation factor when computing distances (ignored in many cases).
+ n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get
+ the maximal number of clusters.
+ merge_threshold (float): minimum prominence of a cluster so it doesn't get merged.
+ symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric.
+ This can be useful with k-NN for small k. Defaults to false.
+ p (float): norm L^p on input points. Defaults to 2.
+ q (float): order used to compute the distance to measure. Defaults to dim.
+ Beware that when the dimension is large, this can easily cause overflows.
+ dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the
+ dimension, or 2 when the dimension cannot be read from the input (metric is "precomputed").
+ n_jobs (int): Number of jobs to schedule for parallel processing on the CPU.
+ If -1 is given all processors are used. Default: 1.
+ params: extra parameters are passed to :class:`~gudhi.point_cloud.knn.KNearestNeighbors` and
+ :class:`~gudhi.point_cloud.dtm.DTMDensity`.
+ """
+ # Should metric='precomputed' mean input_type='distance_matrix'?
+ # Should we be able to pass metric='minkowski' (what None does currently)?
+ self.graph_type_ = graph_type
+ self.density_type_ = density_type
+ self.params_ = params
+ self.__n_clusters = n_clusters
+ self.__merge_threshold = merge_threshold
+ # self.eliminate_threshold_ = eliminate_threshold
+ if n_clusters and merge_threshold:
+ raise ValueError("Cannot specify both a merge threshold and a number of clusters")
+
+ def fit(self, X, y=None, weights=None):
+ """
+ Args:
+ X ((n,d)-array of float|(n,n)-array of float|Sequence[Iterable[int]]): coordinates of the points,
+ or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors
+ for each point (points are represented by their index, starting from 0) if graph_type is "manual".
+ The number of points is currently limited to about 2 billion.
+ weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point
+ y: Not used, present here for API consistency with scikit-learn by convention.
+ """
+ # TODO: First detect if this is a new call with the same data (only threshold changed?)
+ # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters.
+ if weights is not None:
+ density_type = "manual"
+ else:
+ density_type = self.density_type_
+ if density_type == "manual":
+ raise ValueError("If density_type is 'manual', you must provide weights to fit()")
+
+ if self.graph_type_ == "manual":
+ self.neighbors_ = X
+ # FIXME: uniformize "message 'option'" vs 'message "option"'
+ assert density_type == "manual", 'If graph_type is "manual", density_type must be as well'
+ else:
+ metric = self.params_.get("metric", "minkowski")
+ if metric != "precomputed":
+ self.points_ = X
+
+ # Slight complication to avoid computing knn twice.
+ need_knn = 0
+ need_knn_ngb = False
+ need_knn_dist = False
+ if self.graph_type_ == "knn":
+ k_graph = self.params_.get("k", 10)
+ # If X has fewer than k points...
+ if k_graph > len(X):
+ k_graph = len(X)
+ need_knn = k_graph
+ need_knn_ngb = True
+ if self.density_type_ in ["DTM", "logDTM"]:
+ k = self.params_.get("k", 10)
+ k_DTM = self.params_.get("k_DTM", k)
+ # If X has fewer than k points...
+ if k_DTM > len(X):
+ k_DTM = len(X)
+ need_knn = max(need_knn, k_DTM)
+ need_knn_dist = True
+ # if we ask for more neighbors for the graph than the DTM, getting the distances is a slight waste,
+ # but it looks negligible
+ if need_knn > 0:
+ knn_args = dict(self.params_)
+ knn_args["k"] = need_knn
+ knn = KNearestNeighbors(return_index=need_knn_ngb, return_distance=need_knn_dist, **knn_args).fit_transform(
+ X
+ )
+ if need_knn_ngb:
+ if need_knn_dist:
+ self.neighbors_ = knn[0][:, 0:k_graph]
+ knn_dist = knn[1]
+ else:
+ self.neighbors_ = knn
+ elif need_knn_dist:
+ knn_dist = knn
+ if self.density_type_ in ["DTM", "logDTM"]:
+ dim = self.params_.get("dim")
+ if dim is None:
+ dim = len(X[0]) if metric != "precomputed" else 2
+ q = self.params_.get("q", dim)
+ weights = DTMDensity(k=k_DTM, metric="neighbors", dim=dim, q=q).fit_transform(knn_dist)
+ if self.density_type_ == "logDTM":
+ weights = numpy.log(weights)
+
+ if self.graph_type_ == "radius":
+ if metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]:
+ from scipy.spatial import cKDTree
+
+ tree = cKDTree(X)
+ # TODO: handle "l1" and "l2" aliases?
+ p = self.params_.get("p")
+ if metric == "euclidean":
+ assert p is None or p == 2, "p=" + str(p) + " is not consistent with metric='euclidean'"
+ p = 2
+ elif metric == "manhattan":
+ assert p is None or p == 1, "p=" + str(p) + " is not consistent with metric='manhattan'"
+ p = 1
+ elif metric == "chebyshev":
+ assert p is None or p == numpy.inf, "p=" + str(p) + " is not consistent with metric='chebyshev'"
+ p = numpy.inf
+ elif p is None:
+ p = 2 # the default
+ eps = self.params_.get("eps", 0)
+ self.neighbors_ = tree.query_ball_tree(tree, r=self.params_["r"], p=p, eps=eps)
+
+ # TODO: sklearn's NearestNeighbors.radius_neighbors can handle more metrics efficiently via its BallTree
+ # (don't bother with the _graph variant, it just calls radius_neighbors).
+ elif metric != "precomputed":
+ from sklearn.metrics import pairwise_distances
+
+ X = pairwise_distances(X, metric=metric, n_jobs=self.params_.get("n_jobs"))
+ metric = "precomputed"
+
+ if metric == "precomputed":
+ # TODO: parallelize? May not be worth it.
+ X = numpy.asarray(X)
+ r = self.params_["r"]
+ self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X]
+
+ if self.density_type_ in {"KDE", "logKDE"}:
+ # Slow...
+ assert (
+ self.graph_type_ != "manual" and metric != "precomputed"
+ ), "Scikit-learn's KernelDensity requires point coordinates"
+ kde_params = dict(self.params_.get("kde_params", dict()))
+ kde_params.setdefault("metric", metric)
+ r = self.params_.get("r")
+ if r is not None:
+ kde_params.setdefault("bandwidth", r)
+ # Should we default rtol to eps?
+ from sklearn.neighbors import KernelDensity
+
+ weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_)
+ if self.density_type_ == "KDE":
+ weights = numpy.exp(weights)
+
+ # TODO: do it at the C++ level and/or in parallel if this is too slow?
+ if self.params_.get("symmetrize_graph"):
+ self.neighbors_ = [set(line) for line in self.neighbors_]
+ for i, line in enumerate(self.neighbors_):
+ line.discard(i)
+ for j in line:
+ self.neighbors_[j].add(i)
+
+ self.weights_ = weights
+ # This is where the main computation happens
+ self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = hierarchy(self.neighbors_, weights)
+ self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_)
+ assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_)
+ # TODO: deduplicate this code with the setters below
+ if self.__merge_threshold:
+ assert not self.__n_clusters
+ self.__n_clusters = numpy.count_nonzero(
+ self.diagram_[:, 0] - self.diagram_[:, 1] > self.__merge_threshold
+ ) + len(self.max_weight_per_cc_)
+ if self.__n_clusters:
+ # TODO: set corresponding merge_threshold?
+ renaming = merge(self.children_, self.n_leaves_, self.__n_clusters)
+ self.labels_ = renaming[self.leaf_labels_]
+ # In case the user asked for something impossible.
+ # TODO: check for impossible situations before calling merge.
+ self.__n_clusters = self.labels_.max() + 1
+ else:
+ self.labels_ = self.leaf_labels_
+ self.__n_clusters = self.n_leaves_
+ return self
+
+ def fit_predict(self, X, y=None, weights=None):
+ """
+ Equivalent to fit(), and returns the `labels_`.
+ """
+ return self.fit(X, y, weights).labels_
+
+ # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k?
+ def plot_diagram(self):
+ """
+ """
+ import matplotlib.pyplot as plt
+
+ l = self.max_weight_per_cc_.min()
+ r = self.max_weight_per_cc_.max()
+ if self.diagram_.size > 0:
+ plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro")
+ l = min(l, self.diagram_[:, 1].min())
+ r = max(r, self.diagram_[:, 0].max())
+ if l == r:
+ if l > 0:
+ l, r = 0.9 * l, 1.1 * r
+ elif l < 0:
+ l, r = 1.1 * l, 0.9 * r
+ else:
+ l, r = -1.0, 1.0
+ plt.plot([l, r], [l, r])
+ plt.plot(
+ self.max_weight_per_cc_, numpy.full(self.max_weight_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green"
+ )
+ plt.show()
+
+ # Use set_params instead?
+ @property
+ def n_clusters_(self):
+ return self.__n_clusters
+
+ @n_clusters_.setter
+ def n_clusters_(self, n_clusters):
+ if n_clusters == self.__n_clusters:
+ return
+ self.__n_clusters = n_clusters
+ self.__merge_threshold = None
+ if hasattr(self, "leaf_labels_"):
+ renaming = merge(self.children_, self.n_leaves_, self.__n_clusters)
+ self.labels_ = renaming[self.leaf_labels_]
+ # In case the user asked for something impossible
+ self.__n_clusters = self.labels_.max() + 1
+
+ @property
+ def merge_threshold_(self):
+ return self.__merge_threshold
+
+ @merge_threshold_.setter
+ def merge_threshold_(self, merge_threshold):
+ if merge_threshold == self.__merge_threshold:
+ return
+ if hasattr(self, "leaf_labels_"):
+ self.n_clusters_ = numpy.count_nonzero(self.diagram_[:, 0] - self.diagram_[:, 1] > merge_threshold) + len(
+ self.max_weight_per_cc_
+ )
+ else:
+ self.__n_clusters = None
+ self.__merge_threshold = merge_threshold
diff --git a/src/python/gudhi/dtm_rips_complex.py b/src/python/gudhi/dtm_rips_complex.py
new file mode 100644
index 00000000..63c9b138
--- /dev/null
+++ b/src/python/gudhi/dtm_rips_complex.py
@@ -0,0 +1,51 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Yuichi Ike, Raphaël Tinarrage
+#
+# Copyright (C) 2020 Inria, Copyright (C) 2020 FUjitsu Laboratories Ltd.
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+
+from gudhi.weighted_rips_complex import WeightedRipsComplex
+from gudhi.point_cloud.dtm import DistanceToMeasure
+from scipy.spatial.distance import cdist
+
+class DTMRipsComplex(WeightedRipsComplex):
+ """
+ Class to generate a DTM Rips complex from a distance matrix or a point set,
+ in the way described in :cite:`dtmfiltrations`.
+ Remark that all the filtration values are doubled compared to the definition in the paper
+ for the consistency with RipsComplex.
+ :Requires: `SciPy <installation.html#scipy>`_
+ """
+ def __init__(self,
+ points=None,
+ distance_matrix=None,
+ k=1,
+ q=2,
+ max_filtration=float('inf')):
+ """
+ Args:
+ points (numpy.ndarray): array of points.
+ distance_matrix (numpy.ndarray): full distance matrix.
+ k (int): number of neighbors for the computation of DTM. Defaults to 1, which is equivalent to the usual Rips complex.
+ q (float): order used to compute the distance to measure. Defaults to 2.
+ max_filtration (float): specifies the maximal filtration value to be considered.
+ """
+ if distance_matrix is None:
+ if points is None:
+ # Empty Rips construction
+ points=[]
+ distance_matrix = cdist(points,points)
+ self.distance_matrix = distance_matrix
+
+ # TODO: address the error when k is too large
+ if k <= 1:
+ self.weights = [0] * len(distance_matrix)
+ else:
+ dtm = DistanceToMeasure(k, q=q, metric="precomputed")
+ self.weights = dtm.fit_transform(distance_matrix)
+ self.max_filtration = max_filtration
+
diff --git a/src/python/gudhi/hera/__init__.py b/src/python/gudhi/hera/__init__.py
new file mode 100644
index 00000000..f70b92b9
--- /dev/null
+++ b/src/python/gudhi/hera/__init__.py
@@ -0,0 +1,7 @@
+from .wasserstein import wasserstein_distance
+from .bottleneck import bottleneck_distance
+
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
diff --git a/src/python/gudhi/hera/bottleneck.cc b/src/python/gudhi/hera/bottleneck.cc
new file mode 100644
index 00000000..0cb562ce
--- /dev/null
+++ b/src/python/gudhi/hera/bottleneck.cc
@@ -0,0 +1,54 @@
+/* This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+ * See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+ * Author(s): Marc Glisse
+ *
+ * Copyright (C) 2020 Inria
+ *
+ * Modification(s):
+ * - YYYY/MM Author: Description of the modification
+ */
+
+#include <pybind11_diagram_utils.h>
+
+#ifdef _MSC_VER
+// https://github.com/grey-narn/hera/issues/3
+// ssize_t is a non-standard type (well, posix)
+using py::ssize_t;
+#endif
+
+#include <bottleneck.h> // Hera
+
+double bottleneck_distance(Dgm d1, Dgm d2, double delta)
+{
+ // I *think* the call to request() has to be before releasing the GIL.
+ auto diag1 = numpy_to_range_of_pairs(d1);
+ auto diag2 = numpy_to_range_of_pairs(d2);
+
+ py::gil_scoped_release release;
+
+ if (delta == 0)
+ return hera::bottleneckDistExact(diag1, diag2);
+ else
+ return hera::bottleneckDistApprox(diag1, diag2, delta);
+}
+
+PYBIND11_MODULE(bottleneck, m) {
+ m.def("bottleneck_distance", &bottleneck_distance,
+ py::arg("X"), py::arg("Y"),
+ py::arg("delta") = .01,
+ R"pbdoc(
+ Compute the Bottleneck distance between two diagrams.
+ Points at infinity are supported.
+
+ .. note::
+ Points on the diagonal are not supported and must be filtered out before calling this function.
+
+ Parameters:
+ X (n x 2 numpy array): First diagram
+ Y (n x 2 numpy array): Second diagram
+ delta (float): Relative error 1+delta
+
+ Returns:
+ float: (approximate) bottleneck distance d_B(X,Y)
+ )pbdoc");
+}
diff --git a/src/python/gudhi/hera.cc b/src/python/gudhi/hera/wasserstein.cc
index ea80a9a8..1a21f02f 100644
--- a/src/python/gudhi/hera.cc
+++ b/src/python/gudhi/hera/wasserstein.cc
@@ -33,7 +33,7 @@ double wasserstein_distance(
return hera::wasserstein_dist(diag1, diag2, params);
}
-PYBIND11_MODULE(hera, m) {
+PYBIND11_MODULE(wasserstein, m) {
m.def("wasserstein_distance", &wasserstein_distance,
py::arg("X"), py::arg("Y"),
py::arg("order") = 1,
diff --git a/src/python/gudhi/representations/kernel_methods.py b/src/python/gudhi/representations/kernel_methods.py
index 596f4f07..23fd23c7 100644
--- a/src/python/gudhi/representations/kernel_methods.py
+++ b/src/python/gudhi/representations/kernel_methods.py
@@ -10,7 +10,7 @@
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import pairwise_distances, pairwise_kernels
-from .metrics import SlicedWassersteinDistance, PersistenceFisherDistance, _sklearn_wrapper, pairwise_persistence_diagram_distances, _sliced_wasserstein_distance, _persistence_fisher_distance
+from .metrics import SlicedWassersteinDistance, PersistenceFisherDistance, _sklearn_wrapper, _pairwise, pairwise_persistence_diagram_distances, _sliced_wasserstein_distance, _persistence_fisher_distance
from .preprocessing import Padding
#############################################
@@ -60,7 +60,7 @@ def _persistence_scale_space_kernel(D1, D2, kernel_approx=None, bandwidth=1.):
weight_pss = lambda x: 1 if x[1] >= x[0] else -1
return 0.5 * _persistence_weighted_gaussian_kernel(DD1, DD2, weight=weight_pss, kernel_approx=kernel_approx, bandwidth=bandwidth)
-def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein", **kwargs):
+def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein", n_jobs=None, **kwargs):
"""
This function computes the kernel matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2).
@@ -68,38 +68,41 @@ def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein",
X (list of n numpy arrays of shape (numx2)): first list of persistence diagrams.
Y (list of m numpy arrays of shape (numx2)): second list of persistence diagrams (optional). If None, pairwise kernel values are computed from the first list only.
kernel: kernel to use. It can be either a string ("sliced_wasserstein", "persistence_scale_space", "persistence_weighted_gaussian", "persistence_fisher") or a function taking two numpy arrays of shape (nx2) and (mx2) as inputs. If it is a function, make sure that it is symmetric.
+ n_jobs (int): number of jobs to use for the computation. This uses joblib.Parallel(prefer="threads"), so kernels that do not release the GIL may not scale unless run inside a `joblib.parallel_backend <https://joblib.readthedocs.io/en/latest/parallel.html#joblib.parallel_backend>`_ block.
**kwargs: optional keyword parameters. Any further parameters are passed directly to the kernel function. See the docs of the various kernel classes in this module.
Returns:
numpy array of shape (nxm): kernel matrix.
"""
XX = np.reshape(np.arange(len(X)), [-1,1])
- YY = None if Y is None else np.reshape(np.arange(len(Y)), [-1,1])
+ YY = None if Y is None or Y is X else np.reshape(np.arange(len(Y)), [-1,1])
if kernel == "sliced_wasserstein":
- return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="sliced_wasserstein", num_directions=kwargs["num_directions"]) / kwargs["bandwidth"])
+ return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="sliced_wasserstein", num_directions=kwargs["num_directions"], n_jobs=n_jobs) / kwargs["bandwidth"])
elif kernel == "persistence_fisher":
- return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="persistence_fisher", kernel_approx=kwargs["kernel_approx"], bandwidth=kwargs["bandwidth"]) / kwargs["bandwidth_fisher"])
+ return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="persistence_fisher", kernel_approx=kwargs["kernel_approx"], bandwidth=kwargs["bandwidth"], n_jobs=n_jobs) / kwargs["bandwidth_fisher"])
elif kernel == "persistence_scale_space":
- return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_scale_space_kernel, X, Y, **kwargs))
+ return _pairwise(pairwise_kernels, False, XX, YY, metric=_sklearn_wrapper(_persistence_scale_space_kernel, X, Y, **kwargs), n_jobs=n_jobs)
elif kernel == "persistence_weighted_gaussian":
- return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_weighted_gaussian_kernel, X, Y, **kwargs))
+ return _pairwise(pairwise_kernels, False, XX, YY, metric=_sklearn_wrapper(_persistence_weighted_gaussian_kernel, X, Y, **kwargs), n_jobs=n_jobs)
else:
- return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(metric, **kwargs))
+ return _pairwise(pairwise_kernels, False, XX, YY, metric=_sklearn_wrapper(metric, **kwargs), n_jobs=n_jobs)
class SlicedWassersteinKernel(BaseEstimator, TransformerMixin):
"""
This is a class for computing the sliced Wasserstein kernel matrix from a list of persistence diagrams. The sliced Wasserstein kernel is computed by exponentiating the corresponding sliced Wasserstein distance with a Gaussian kernel. See http://proceedings.mlr.press/v70/carriere17a.html for more details.
"""
- def __init__(self, num_directions=10, bandwidth=1.0):
+ def __init__(self, num_directions=10, bandwidth=1.0, n_jobs=None):
"""
Constructor for the SlicedWassersteinKernel class.
Parameters:
bandwidth (double): bandwidth of the Gaussian kernel applied to the sliced Wasserstein distance (default 1.).
num_directions (int): number of lines evenly sampled from [-pi/2,pi/2] in order to approximate and speed up the kernel computation (default 10).
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_kernels` for details.
"""
self.bandwidth = bandwidth
self.num_directions = num_directions
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -122,7 +125,7 @@ class SlicedWassersteinKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise sliced Wasserstein kernel values.
"""
- return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="sliced_wasserstein", bandwidth=self.bandwidth, num_directions=self.num_directions)
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="sliced_wasserstein", bandwidth=self.bandwidth, num_directions=self.num_directions, n_jobs=self.n_jobs)
def __call__(self, diag1, diag2):
"""
@@ -141,7 +144,7 @@ class PersistenceWeightedGaussianKernel(BaseEstimator, TransformerMixin):
"""
This is a class for computing the persistence weighted Gaussian kernel matrix from a list of persistence diagrams. The persistence weighted Gaussian kernel is computed by convolving the persistence diagram points with weighted Gaussian kernels. See http://proceedings.mlr.press/v48/kusano16.html for more details.
"""
- def __init__(self, bandwidth=1., weight=lambda x: 1, kernel_approx=None):
+ def __init__(self, bandwidth=1., weight=lambda x: 1, kernel_approx=None, n_jobs=None):
"""
Constructor for the PersistenceWeightedGaussianKernel class.
@@ -149,9 +152,11 @@ class PersistenceWeightedGaussianKernel(BaseEstimator, TransformerMixin):
bandwidth (double): bandwidth of the Gaussian kernel with which persistence diagrams will be convolved (default 1.)
weight (function): weight function for the persistence diagram points (default constant function, ie lambda x: 1). This function must be defined on 2D points, ie lists or numpy arrays of the form [p_x,p_y].
kernel_approx (class): kernel approximation class used to speed up computation (default None). Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_kernels` for details.
"""
self.bandwidth, self.weight = bandwidth, weight
self.kernel_approx = kernel_approx
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -174,7 +179,7 @@ class PersistenceWeightedGaussianKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence weighted Gaussian kernel values.
"""
- return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="persistence_weighted_gaussian", bandwidth=self.bandwidth, weight=self.weight, kernel_approx=self.kernel_approx)
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="persistence_weighted_gaussian", bandwidth=self.bandwidth, weight=self.weight, kernel_approx=self.kernel_approx, n_jobs=self.n_jobs)
def __call__(self, diag1, diag2):
"""
@@ -193,15 +198,17 @@ class PersistenceScaleSpaceKernel(BaseEstimator, TransformerMixin):
"""
This is a class for computing the persistence scale space kernel matrix from a list of persistence diagrams. The persistence scale space kernel is computed by adding the symmetric to the diagonal of each point in each persistence diagram, with negative weight, and then convolving the points with a Gaussian kernel. See https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Reininghaus_A_Stable_Multi-Scale_2015_CVPR_paper.pdf for more details.
"""
- def __init__(self, bandwidth=1., kernel_approx=None):
+ def __init__(self, bandwidth=1., kernel_approx=None, n_jobs=None):
"""
Constructor for the PersistenceScaleSpaceKernel class.
Parameters:
bandwidth (double): bandwidth of the Gaussian kernel with which persistence diagrams will be convolved (default 1.)
kernel_approx (class): kernel approximation class used to speed up computation (default None). Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_kernels` for details.
"""
self.bandwidth, self.kernel_approx = bandwidth, kernel_approx
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -224,7 +231,7 @@ class PersistenceScaleSpaceKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence scale space kernel values.
"""
- return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="persistence_scale_space", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx)
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="persistence_scale_space", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx, n_jobs=self.n_jobs)
def __call__(self, diag1, diag2):
"""
@@ -243,7 +250,7 @@ class PersistenceFisherKernel(BaseEstimator, TransformerMixin):
"""
This is a class for computing the persistence Fisher kernel matrix from a list of persistence diagrams. The persistence Fisher kernel is computed by exponentiating the corresponding persistence Fisher distance with a Gaussian kernel. See papers.nips.cc/paper/8205-persistence-fisher-kernel-a-riemannian-manifold-kernel-for-persistence-diagrams for more details.
"""
- def __init__(self, bandwidth_fisher=1., bandwidth=1., kernel_approx=None):
+ def __init__(self, bandwidth_fisher=1., bandwidth=1., kernel_approx=None, n_jobs=None):
"""
Constructor for the PersistenceFisherKernel class.
@@ -251,9 +258,11 @@ class PersistenceFisherKernel(BaseEstimator, TransformerMixin):
bandwidth (double): bandwidth of the Gaussian kernel applied to the persistence Fisher distance (default 1.).
bandwidth_fisher (double): bandwidth of the Gaussian kernel used to turn persistence diagrams into probability distributions by PersistenceFisherDistance class (default 1.).
kernel_approx (class): kernel approximation class used to speed up computation (default None). Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_kernels` for details.
"""
self.bandwidth = bandwidth
self.bandwidth_fisher, self.kernel_approx = bandwidth_fisher, kernel_approx
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -276,7 +285,7 @@ class PersistenceFisherKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence Fisher kernel values.
"""
- return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="persistence_fisher", bandwidth=self.bandwidth, bandwidth_fisher=self.bandwidth_fisher, kernel_approx=self.kernel_approx)
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, kernel="persistence_fisher", bandwidth=self.bandwidth, bandwidth_fisher=self.bandwidth_fisher, kernel_approx=self.kernel_approx, n_jobs=self.n_jobs)
def __call__(self, diag1, diag2):
"""
diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py
index 3e8487c4..142ddef1 100644
--- a/src/python/gudhi/representations/metrics.py
+++ b/src/python/gudhi/representations/metrics.py
@@ -12,6 +12,7 @@ from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import pairwise_distances
from gudhi.hera import wasserstein_distance as hera_wasserstein_distance
from .preprocessing import Padding
+from joblib import Parallel, delayed
#############################################
# Metrics ###################################
@@ -116,6 +117,20 @@ def _persistence_fisher_distance(D1, D2, kernel_approx=None, bandwidth=1.):
vectorj = vectorj/vectorj_sum
return np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
+def _pairwise(fallback, skipdiag, X, Y, metric, n_jobs):
+ if Y is not None:
+ return fallback(X, Y, metric=metric, n_jobs=n_jobs)
+ triu = np.triu_indices(len(X), k=skipdiag)
+ tril = (triu[1], triu[0])
+ par = Parallel(n_jobs=n_jobs, prefer="threads")
+ d = par(delayed(metric)([triu[0][i]], [triu[1][i]]) for i in range(len(triu[0])))
+ m = np.empty((len(X), len(X)))
+ m[triu] = d
+ m[tril] = d
+ if skipdiag:
+ np.fill_diagonal(m, 0)
+ return m
+
def _sklearn_wrapper(metric, X, Y, **kwargs):
"""
This function is a wrapper for any metric between two persistence diagrams that takes two numpy arrays of shapes (nx2) and (mx2) as arguments.
@@ -134,7 +149,7 @@ PAIRWISE_DISTANCE_FUNCTIONS = {
"persistence_fisher": _persistence_fisher_distance,
}
-def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", **kwargs):
+def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", n_jobs=None, **kwargs):
"""
This function computes the distance matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2).
@@ -142,48 +157,51 @@ def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", **kwa
X (list of n numpy arrays of shape (numx2)): first list of persistence diagrams.
Y (list of m numpy arrays of shape (numx2)): second list of persistence diagrams (optional). If None, pairwise distances are computed from the first list only.
metric: distance to use. It can be either a string ("sliced_wasserstein", "wasserstein", "hera_wasserstein" (Wasserstein distance computed with Hera---note that Hera is also used for the default option "wasserstein"), "pot_wasserstein" (Wasserstein distance computed with POT), "bottleneck", "persistence_fisher") or a function taking two numpy arrays of shape (nx2) and (mx2) as inputs. If it is a function, make sure that it is symmetric and that it outputs 0 if called on the same two arrays.
+ n_jobs (int): number of jobs to use for the computation. This uses joblib.Parallel(prefer="threads"), so metrics that do not release the GIL may not scale unless run inside a `joblib.parallel_backend <https://joblib.readthedocs.io/en/latest/parallel.html#joblib.parallel_backend>`_ block.
**kwargs: optional keyword parameters. Any further parameters are passed directly to the distance function. See the docs of the various distance classes in this module.
Returns:
numpy array of shape (nxm): distance matrix
"""
XX = np.reshape(np.arange(len(X)), [-1,1])
- YY = None if Y is None else np.reshape(np.arange(len(Y)), [-1,1])
+ YY = None if Y is None or Y is X else np.reshape(np.arange(len(Y)), [-1,1])
if metric == "bottleneck":
try:
from .. import bottleneck_distance
- return pairwise_distances(XX, YY, metric=_sklearn_wrapper(bottleneck_distance, X, Y, **kwargs))
+ return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(bottleneck_distance, X, Y, **kwargs), n_jobs=n_jobs)
except ImportError:
print("Gudhi built without CGAL")
raise
elif metric == "pot_wasserstein":
try:
from gudhi.wasserstein import wasserstein_distance as pot_wasserstein_distance
- return pairwise_distances(XX, YY, metric=_sklearn_wrapper(pot_wasserstein_distance, X, Y, **kwargs))
+ return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(pot_wasserstein_distance, X, Y, **kwargs), n_jobs=n_jobs)
except ImportError:
print("POT (Python Optimal Transport) is not installed. Please install POT or use metric='wasserstein' or metric='hera_wasserstein'")
raise
elif metric == "sliced_wasserstein":
Xproj = _compute_persistence_diagram_projections(X, **kwargs)
Yproj = None if Y is None else _compute_persistence_diagram_projections(Y, **kwargs)
- return pairwise_distances(XX, YY, metric=_sklearn_wrapper(_sliced_wasserstein_distance_on_projections, Xproj, Yproj))
+ return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(_sliced_wasserstein_distance_on_projections, Xproj, Yproj), n_jobs=n_jobs)
elif type(metric) == str:
- return pairwise_distances(XX, YY, metric=_sklearn_wrapper(PAIRWISE_DISTANCE_FUNCTIONS[metric], X, Y, **kwargs))
+ return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(PAIRWISE_DISTANCE_FUNCTIONS[metric], X, Y, **kwargs), n_jobs=n_jobs)
else:
- return pairwise_distances(XX, YY, metric=_sklearn_wrapper(metric, X, Y, **kwargs))
+ return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(metric, X, Y, **kwargs), n_jobs=n_jobs)
class SlicedWassersteinDistance(BaseEstimator, TransformerMixin):
"""
This is a class for computing the sliced Wasserstein distance matrix from a list of persistence diagrams. The Sliced Wasserstein distance is computed by projecting the persistence diagrams onto lines, comparing the projections with the 1-norm, and finally integrating over all possible lines. See http://proceedings.mlr.press/v70/carriere17a.html for more details.
"""
- def __init__(self, num_directions=10):
+ def __init__(self, num_directions=10, n_jobs=None):
"""
Constructor for the SlicedWassersteinDistance class.
Parameters:
num_directions (int): number of lines evenly sampled from [-pi/2,pi/2] in order to approximate and speed up the distance computation (default 10).
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details.
"""
self.num_directions = num_directions
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -206,7 +224,7 @@ class SlicedWassersteinDistance(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise sliced Wasserstein distances.
"""
- return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="sliced_wasserstein", num_directions=self.num_directions)
+ return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="sliced_wasserstein", num_directions=self.num_directions, n_jobs=self.n_jobs)
def __call__(self, diag1, diag2):
"""
@@ -227,14 +245,16 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
:Requires: `CGAL <installation.html#cgal>`_ :math:`\geq` 4.11.0
"""
- def __init__(self, epsilon=None):
+ def __init__(self, epsilon=None, n_jobs=None):
"""
Constructor for the BottleneckDistance class.
Parameters:
epsilon (double): absolute (additive) error tolerated on the distance (default is the smallest positive float), see :func:`gudhi.bottleneck_distance`.
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details.
"""
self.epsilon = epsilon
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -257,7 +277,7 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise bottleneck distances.
"""
- Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric="bottleneck", e=self.epsilon)
+ Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric="bottleneck", e=self.epsilon, n_jobs=self.n_jobs)
return Xfit
def __call__(self, diag1, diag2):
@@ -282,15 +302,17 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
"""
This is a class for computing the persistence Fisher distance matrix from a list of persistence diagrams. The persistence Fisher distance is obtained by computing the original Fisher distance between the probability distributions associated to the persistence diagrams given by convolving them with a Gaussian kernel. See http://papers.nips.cc/paper/8205-persistence-fisher-kernel-a-riemannian-manifold-kernel-for-persistence-diagrams for more details.
"""
- def __init__(self, bandwidth=1., kernel_approx=None):
+ def __init__(self, bandwidth=1., kernel_approx=None, n_jobs=None):
"""
Constructor for the PersistenceFisherDistance class.
Parameters:
bandwidth (double): bandwidth of the Gaussian kernel used to turn persistence diagrams into probability distributions (default 1.).
kernel_approx (class): kernel approximation class used to speed up computation (default None). Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details.
"""
self.bandwidth, self.kernel_approx = bandwidth, kernel_approx
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -313,7 +335,7 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence Fisher distances.
"""
- return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="persistence_fisher", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx)
+ return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="persistence_fisher", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx, n_jobs=self.n_jobs)
def __call__(self, diag1, diag2):
"""
@@ -333,7 +355,8 @@ class WassersteinDistance(BaseEstimator, TransformerMixin):
"""
This is a class for computing the Wasserstein distance matrix from a list of persistence diagrams.
"""
- def __init__(self, order=1, internal_p=np.inf, mode="hera", delta=0.01):
+
+ def __init__(self, order=1, internal_p=np.inf, mode="hera", delta=0.01, n_jobs=None):
"""
Constructor for the WassersteinDistance class.
@@ -342,6 +365,7 @@ class WassersteinDistance(BaseEstimator, TransformerMixin):
internal_p (int): ground metric on the (upper-half) plane (i.e. norm l_p in R^2), default value is `np.inf`, see :func:`gudhi.wasserstein.wasserstein_distance`.
mode (str): method for computing Wasserstein distance. Either "pot" or "hera". Default set to "hera".
delta (float): relative error 1+delta. Used only if mode == "hera".
+ n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details.
"""
self.order, self.internal_p, self.mode = order, internal_p, mode
if mode == "pot":
@@ -351,6 +375,7 @@ class WassersteinDistance(BaseEstimator, TransformerMixin):
else:
raise NameError("Unknown mode. Current available values for mode are 'hera' and 'pot'")
self.delta = delta
+ self.n_jobs = n_jobs
def fit(self, X, y=None):
"""
@@ -374,9 +399,9 @@ class WassersteinDistance(BaseEstimator, TransformerMixin):
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise Wasserstein distances.
"""
if self.metric == "hera_wasserstein":
- Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, delta=self.delta)
+ Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, delta=self.delta, n_jobs=self.n_jobs)
else:
- Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, matching=False)
+ Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, matching=False, n_jobs=self.n_jobs)
return Xfit
def __call__(self, diag1, diag2):