From cc42bcdf3323f2eb6edeaca105d29b32b394ca66 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 2 Jun 2020 07:33:31 +0200 Subject: Parallelism in pairwise_distances --- src/python/gudhi/representations/metrics.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'src/python/gudhi/representations/metrics.py') diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py index 8a32f7e9..23bccd68 100644 --- a/src/python/gudhi/representations/metrics.py +++ b/src/python/gudhi/representations/metrics.py @@ -12,6 +12,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.metrics import pairwise_distances from gudhi.hera import wasserstein_distance as hera_wasserstein_distance from .preprocessing import Padding +from joblib import Parallel, delayed, effective_n_jobs ############################################# # Metrics ################################### @@ -116,6 +117,20 @@ def _persistence_fisher_distance(D1, D2, kernel_approx=None, bandwidth=1.): vectorj = vectorj/vectorj_sum return np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) ) +def _pairwise(fallback, skipdiag, X, Y, metric, n_jobs): + if Y is not None: + return fallback(X, Y, metric=metric, n_jobs=n_jobs) + triu = np.triu_indices(len(X), k=skipdiag) + tril = (triu[1], triu[0]) + par = Parallel(n_jobs=n_jobs, prefer="threads") + d = par(delayed(metric)([triu[0][i]], [triu[1][i]]) for i in range(len(triu[0]))) + m = np.empty((len(X), len(X))) + m[triu] = d + m[tril] = d + if skipdiag: + np.fill_diagonal(m, 0) + return m + def _sklearn_wrapper(metric, X, Y, **kwargs): """ This function is a wrapper for any metric between two persistence diagrams that takes two numpy arrays of shapes (nx2) and (mx2) as arguments. @@ -134,7 +149,7 @@ PAIRWISE_DISTANCE_FUNCTIONS = { "persistence_fisher": _persistence_fisher_distance, } -def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", **kwargs): +def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", n_jobs=None, **kwargs): """ This function computes the distance matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2). @@ -152,25 +167,25 @@ def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", **kwa if metric == "bottleneck": try: from .. import bottleneck_distance - return pairwise_distances(XX, YY, metric=_sklearn_wrapper(bottleneck_distance, X, Y, **kwargs)) + return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(bottleneck_distance, X, Y, **kwargs), n_jobs=n_jobs) except ImportError: print("Gudhi built without CGAL") raise elif metric == "pot_wasserstein": try: from gudhi.wasserstein import wasserstein_distance as pot_wasserstein_distance - return pairwise_distances(XX, YY, metric=_sklearn_wrapper(pot_wasserstein_distance, X, Y, **kwargs)) + return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(pot_wasserstein_distance, X, Y, **kwargs), n_jobs=n_jobs) except ImportError: print("POT (Python Optimal Transport) is not installed. Please install POT or use metric='wasserstein' or metric='hera_wasserstein'") raise elif metric == "sliced_wasserstein": Xproj = _compute_persistence_diagram_projections(X, **kwargs) Yproj = None if Y is None else _compute_persistence_diagram_projections(Y, **kwargs) - return pairwise_distances(XX, YY, metric=_sklearn_wrapper(_sliced_wasserstein_distance_on_projections, Xproj, Yproj)) + return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(_sliced_wasserstein_distance_on_projections, Xproj, Yproj), n_jobs=n_jobs) elif type(metric) == str: - return pairwise_distances(XX, YY, metric=_sklearn_wrapper(PAIRWISE_DISTANCE_FUNCTIONS[metric], X, Y, **kwargs)) + return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(PAIRWISE_DISTANCE_FUNCTIONS[metric], X, Y, **kwargs), n_jobs=n_jobs) else: - return pairwise_distances(XX, YY, metric=_sklearn_wrapper(metric, X, Y, **kwargs)) + return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(metric, X, Y, **kwargs), n_jobs=n_jobs) class SlicedWassersteinDistance(BaseEstimator, TransformerMixin): """ -- cgit v1.2.3 From aebfa07217f58943753967cfd3d7daa9f28d5650 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 2 Jun 2020 17:56:18 +0200 Subject: More n_jobs in metrics --- src/python/gudhi/representations/metrics.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'src/python/gudhi/representations/metrics.py') diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py index 23bccd68..84907160 100644 --- a/src/python/gudhi/representations/metrics.py +++ b/src/python/gudhi/representations/metrics.py @@ -12,7 +12,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.metrics import pairwise_distances from gudhi.hera import wasserstein_distance as hera_wasserstein_distance from .preprocessing import Padding -from joblib import Parallel, delayed, effective_n_jobs +from joblib import Parallel, delayed ############################################# # Metrics ################################### @@ -157,6 +157,7 @@ def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", n_job X (list of n numpy arrays of shape (numx2)): first list of persistence diagrams. Y (list of m numpy arrays of shape (numx2)): second list of persistence diagrams (optional). If None, pairwise distances are computed from the first list only. metric: distance to use. It can be either a string ("sliced_wasserstein", "wasserstein", "hera_wasserstein" (Wasserstein distance computed with Hera---note that Hera is also used for the default option "wasserstein"), "pot_wasserstein" (Wasserstein distance computed with POT), "bottleneck", "persistence_fisher") or a function taking two numpy arrays of shape (nx2) and (mx2) as inputs. If it is a function, make sure that it is symmetric and that it outputs 0 if called on the same two arrays. + n_jobs (int): number of jobs to use for the computation. This uses joblib.Parallel(prefer="threads"), so metrics that do not release the GIL may not scale unless run inside a `joblib.parallel_backend `_ block. **kwargs: optional keyword parameters. Any further parameters are passed directly to the distance function. See the docs of the various distance classes in this module. Returns: @@ -191,14 +192,16 @@ class SlicedWassersteinDistance(BaseEstimator, TransformerMixin): """ This is a class for computing the sliced Wasserstein distance matrix from a list of persistence diagrams. The Sliced Wasserstein distance is computed by projecting the persistence diagrams onto lines, comparing the projections with the 1-norm, and finally integrating over all possible lines. See http://proceedings.mlr.press/v70/carriere17a.html for more details. """ - def __init__(self, num_directions=10): + def __init__(self, num_directions=10, n_jobs=None): """ Constructor for the SlicedWassersteinDistance class. Parameters: num_directions (int): number of lines evenly sampled from [-pi/2,pi/2] in order to approximate and speed up the distance computation (default 10). + n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details. """ self.num_directions = num_directions + self.n_jobs = n_jobs def fit(self, X, y=None): """ @@ -221,7 +224,7 @@ class SlicedWassersteinDistance(BaseEstimator, TransformerMixin): Returns: numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise sliced Wasserstein distances. """ - return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="sliced_wasserstein", num_directions=self.num_directions) + return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="sliced_wasserstein", num_directions=self.num_directions, n_jobs=self.n_jobs) def __call__(self, diag1, diag2): """ @@ -242,14 +245,16 @@ class BottleneckDistance(BaseEstimator, TransformerMixin): :Requires: `CGAL `_ :math:`\geq` 4.11.0 """ - def __init__(self, epsilon=None): + def __init__(self, epsilon=None, n_jobs=None): """ Constructor for the BottleneckDistance class. Parameters: epsilon (double): absolute (additive) error tolerated on the distance (default is the smallest positive float), see :func:`gudhi.bottleneck_distance`. + n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details. """ self.epsilon = epsilon + self.n_jobs = n_jobs def fit(self, X, y=None): """ @@ -272,7 +277,7 @@ class BottleneckDistance(BaseEstimator, TransformerMixin): Returns: numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise bottleneck distances. """ - Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric="bottleneck", e=self.epsilon) + Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric="bottleneck", e=self.epsilon, n_jobs=self.n_jobs) return Xfit def __call__(self, diag1, diag2): @@ -297,15 +302,17 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin): """ This is a class for computing the persistence Fisher distance matrix from a list of persistence diagrams. The persistence Fisher distance is obtained by computing the original Fisher distance between the probability distributions associated to the persistence diagrams given by convolving them with a Gaussian kernel. See http://papers.nips.cc/paper/8205-persistence-fisher-kernel-a-riemannian-manifold-kernel-for-persistence-diagrams for more details. """ - def __init__(self, bandwidth=1., kernel_approx=None): + def __init__(self, bandwidth=1., kernel_approx=None, n_jobs=None): """ Constructor for the PersistenceFisherDistance class. Parameters: bandwidth (double): bandwidth of the Gaussian kernel used to turn persistence diagrams into probability distributions (default 1.). kernel_approx (class): kernel approximation class used to speed up computation (default None). Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance). + n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details. """ self.bandwidth, self.kernel_approx = bandwidth, kernel_approx + self.n_jobs = n_jobs def fit(self, X, y=None): """ @@ -328,7 +335,7 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin): Returns: numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence Fisher distances. """ - return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="persistence_fisher", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx) + return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="persistence_fisher", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx, n_jobs=self.n_jobs) def __call__(self, diag1, diag2): """ @@ -347,7 +354,7 @@ class WassersteinDistance(BaseEstimator, TransformerMixin): """ This is a class for computing the Wasserstein distance matrix from a list of persistence diagrams. """ - def __init__(self, order=2, internal_p=2, mode="pot", delta=0.01): + def __init__(self, order=2, internal_p=2, mode="pot", delta=0.01, n_jobs=None): """ Constructor for the WassersteinDistance class. @@ -356,10 +363,12 @@ class WassersteinDistance(BaseEstimator, TransformerMixin): internal_p (int): ground metric on the (upper-half) plane (i.e. norm l_p in R^2), default value is 2 (euclidean norm), see :func:`gudhi.wasserstein.wasserstein_distance`. mode (str): method for computing Wasserstein distance. Either "pot" or "hera". delta (float): relative error 1+delta. Used only if mode == "hera". + n_jobs (int): number of jobs to use for the computation. See :func:`pairwise_persistence_diagram_distances` for details. """ self.order, self.internal_p, self.mode = order, internal_p, mode self.metric = "pot_wasserstein" if mode == "pot" else "hera_wasserstein" self.delta = delta + self.n_jobs = n_jobs def fit(self, X, y=None): """ @@ -383,9 +392,9 @@ class WassersteinDistance(BaseEstimator, TransformerMixin): numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise Wasserstein distances. """ if self.metric == "hera_wasserstein": - Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, delta=self.delta) + Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, delta=self.delta, n_jobs=self.n_jobs) else: - Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, matching=False) + Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric=self.metric, order=self.order, internal_p=self.internal_p, matching=False, n_jobs=self.n_jobs) return Xfit def __call__(self, diag1, diag2): -- cgit v1.2.3 From 69852030a6d1b68f3283b5727c6b944a9c7f5e73 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 2 Jun 2020 21:07:29 +0200 Subject: Some test --- src/python/gudhi/representations/kernel_methods.py | 2 +- src/python/gudhi/representations/metrics.py | 2 +- src/python/test/test_representations.py | 33 +++++++++++++++++++++- 3 files changed, 34 insertions(+), 3 deletions(-) (limited to 'src/python/gudhi/representations/metrics.py') diff --git a/src/python/gudhi/representations/kernel_methods.py b/src/python/gudhi/representations/kernel_methods.py index 6e4f0619..23fd23c7 100644 --- a/src/python/gudhi/representations/kernel_methods.py +++ b/src/python/gudhi/representations/kernel_methods.py @@ -75,7 +75,7 @@ def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein", numpy array of shape (nxm): kernel matrix. """ XX = np.reshape(np.arange(len(X)), [-1,1]) - YY = None if Y is None else np.reshape(np.arange(len(Y)), [-1,1]) + YY = None if Y is None or Y is X else np.reshape(np.arange(len(Y)), [-1,1]) if kernel == "sliced_wasserstein": return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="sliced_wasserstein", num_directions=kwargs["num_directions"], n_jobs=n_jobs) / kwargs["bandwidth"]) elif kernel == "persistence_fisher": diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py index 84907160..cf2e0879 100644 --- a/src/python/gudhi/representations/metrics.py +++ b/src/python/gudhi/representations/metrics.py @@ -164,7 +164,7 @@ def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", n_job numpy array of shape (nxm): distance matrix """ XX = np.reshape(np.arange(len(X)), [-1,1]) - YY = None if Y is None else np.reshape(np.arange(len(Y)), [-1,1]) + YY = None if Y is None or Y is X else np.reshape(np.arange(len(Y)), [-1,1]) if metric == "bottleneck": try: from .. import bottleneck_distance diff --git a/src/python/test/test_representations.py b/src/python/test/test_representations.py index dba7f952..589cee00 100755 --- a/src/python/test/test_representations.py +++ b/src/python/test/test_representations.py @@ -1,12 +1,43 @@ import os import sys import matplotlib.pyplot as plt +import numpy as np +import pytest + def test_representations_examples(): # Disable graphics for testing purposes - plt.show = lambda:None + plt.show = lambda: None here = os.path.dirname(os.path.realpath(__file__)) sys.path.append(here + "/../example") import diagram_vectorizations_distances_kernels return None + + +from gudhi.representations.metrics import * +from gudhi.representations.kernel_methods import * + + +def _n_diags(n): + l = [] + for _ in range(n): + a = np.random.rand(50, 2) + a[:, 1] += a[:, 0] # So that y >= x + l.append(a) + return l + + +def test_multiple(): + l1 = _n_diags(9) + l2 = _n_diags(11) + l1b = l1.copy() + d1 = pairwise_persistence_diagram_distances(l1, e=0.00001, n_jobs=4) + d2 = BottleneckDistance(epsilon=0.00001).fit_transform(l1) + d3 = pairwise_persistence_diagram_distances(l1, l1b, e=0.00001, n_jobs=4) + assert d1 == pytest.approx(d2) + assert d3 == pytest.approx(d2, abs=1e-5) # Because of 0 entries (on the diagonal) + d1 = pairwise_persistence_diagram_distances(l1, l2, metric="wasserstein", order=2, internal_p=2) + d2 = WassersteinDistance(order=2, internal_p=2, n_jobs=4).fit(l2).transform(l1) + print(d1.shape, d2.shape) + assert d1 == pytest.approx(d2, rel=.02) -- cgit v1.2.3 From b0c4b1920faa512b25e7c34cbe83c8e625ac3613 Mon Sep 17 00:00:00 2001 From: tlacombe Date: Mon, 29 Jun 2020 09:53:31 +0200 Subject: update default param for representation.metrics --- src/python/gudhi/representations/metrics.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi/representations/metrics.py') diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py index 8a32f7e9..3e8487c4 100644 --- a/src/python/gudhi/representations/metrics.py +++ b/src/python/gudhi/representations/metrics.py @@ -328,22 +328,28 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin): """ return _persistence_fisher_distance(diag1, diag2, bandwidth=self.bandwidth, kernel_approx=self.kernel_approx) + class WassersteinDistance(BaseEstimator, TransformerMixin): """ This is a class for computing the Wasserstein distance matrix from a list of persistence diagrams. """ - def __init__(self, order=2, internal_p=2, mode="pot", delta=0.01): + def __init__(self, order=1, internal_p=np.inf, mode="hera", delta=0.01): """ Constructor for the WassersteinDistance class. Parameters: - order (int): exponent for Wasserstein, default value is 2., see :func:`gudhi.wasserstein.wasserstein_distance`. - internal_p (int): ground metric on the (upper-half) plane (i.e. norm l_p in R^2), default value is 2 (euclidean norm), see :func:`gudhi.wasserstein.wasserstein_distance`. - mode (str): method for computing Wasserstein distance. Either "pot" or "hera". + order (int): exponent for Wasserstein, default value is 1., see :func:`gudhi.wasserstein.wasserstein_distance`. + internal_p (int): ground metric on the (upper-half) plane (i.e. norm l_p in R^2), default value is `np.inf`, see :func:`gudhi.wasserstein.wasserstein_distance`. + mode (str): method for computing Wasserstein distance. Either "pot" or "hera". Default set to "hera". delta (float): relative error 1+delta. Used only if mode == "hera". """ self.order, self.internal_p, self.mode = order, internal_p, mode - self.metric = "pot_wasserstein" if mode == "pot" else "hera_wasserstein" + if mode == "pot": + self.metric = "pot_wasserstein" + elif mode == "hera": + self.metric = "hera_wasserstein" + else: + raise NameError("Unknown mode. Current available values for mode are 'hera' and 'pot'") self.delta = delta def fit(self, X, y=None): -- cgit v1.2.3