From cc42bcdf3323f2eb6edeaca105d29b32b394ca66 Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Tue, 2 Jun 2020 07:33:31 +0200
Subject: Parallelism in pairwise_distances

---
 src/python/gudhi/representations/kernel_methods.py | 14 +++++------
 src/python/gudhi/representations/metrics.py        | 27 +++++++++++++++++-----
 2 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'src')

diff --git a/src/python/gudhi/representations/kernel_methods.py b/src/python/gudhi/representations/kernel_methods.py
index 596f4f07..c9bd9d01 100644
--- a/src/python/gudhi/representations/kernel_methods.py
+++ b/src/python/gudhi/representations/kernel_methods.py
@@ -10,7 +10,7 @@
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.metrics import pairwise_distances, pairwise_kernels
-from .metrics import SlicedWassersteinDistance, PersistenceFisherDistance, _sklearn_wrapper, pairwise_persistence_diagram_distances, _sliced_wasserstein_distance, _persistence_fisher_distance
+from .metrics import SlicedWassersteinDistance, PersistenceFisherDistance, _sklearn_wrapper, _pairwise, pairwise_persistence_diagram_distances, _sliced_wasserstein_distance, _persistence_fisher_distance
 from .preprocessing import Padding
 
 #############################################
@@ -60,7 +60,7 @@ def _persistence_scale_space_kernel(D1, D2, kernel_approx=None, bandwidth=1.):
     weight_pss = lambda x: 1 if x[1] >= x[0] else -1
     return 0.5 * _persistence_weighted_gaussian_kernel(DD1, DD2, weight=weight_pss, kernel_approx=kernel_approx, bandwidth=bandwidth)
 
-def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein", **kwargs):
+def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein", n_jobs=None, **kwargs):
     """
     This function computes the kernel matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2).
 
@@ -76,15 +76,15 @@ def pairwise_persistence_diagram_kernels(X, Y=None, kernel="sliced_wasserstein",
     XX = np.reshape(np.arange(len(X)), [-1,1])
     YY = None if Y is None else np.reshape(np.arange(len(Y)), [-1,1])
     if kernel == "sliced_wasserstein":
-        return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="sliced_wasserstein", num_directions=kwargs["num_directions"]) / kwargs["bandwidth"])
+        return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="sliced_wasserstein", num_directions=kwargs["num_directions"], n_jobs=n_jobs) / kwargs["bandwidth"])
     elif kernel == "persistence_fisher":
-        return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="persistence_fisher", kernel_approx=kwargs["kernel_approx"], bandwidth=kwargs["bandwidth"]) / kwargs["bandwidth_fisher"])
+        return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="persistence_fisher", kernel_approx=kwargs["kernel_approx"], bandwidth=kwargs["bandwidth"], n_jobs=n_jobs) / kwargs["bandwidth_fisher"])
     elif kernel == "persistence_scale_space":
-        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_scale_space_kernel, X, Y, **kwargs))
+        return _pairwise(pairwise_kernels, False, XX, YY, metric=_sklearn_wrapper(_persistence_scale_space_kernel, X, Y, **kwargs), n_jobs=n_jobs)
     elif kernel == "persistence_weighted_gaussian":
-        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(_persistence_weighted_gaussian_kernel, X, Y, **kwargs))
+        return _pairwise(pairwise_kernels, False, XX, YY, metric=_sklearn_wrapper(_persistence_weighted_gaussian_kernel, X, Y, **kwargs), n_jobs=n_jobs)
     else:
-        return pairwise_kernels(XX, YY, metric=_sklearn_wrapper(metric, **kwargs))
+        return _pairwise(pairwise_kernels, False, XX, YY, metric=_sklearn_wrapper(metric, **kwargs), n_jobs=n_jobs)
 
 class SlicedWassersteinKernel(BaseEstimator, TransformerMixin):
     """
diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py
index 8a32f7e9..23bccd68 100644
--- a/src/python/gudhi/representations/metrics.py
+++ b/src/python/gudhi/representations/metrics.py
@@ -12,6 +12,7 @@ from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.metrics import pairwise_distances
 from gudhi.hera import wasserstein_distance as hera_wasserstein_distance
 from .preprocessing import Padding
+from joblib import Parallel, delayed, effective_n_jobs
 
 #############################################
 # Metrics ###################################
@@ -116,6 +117,20 @@ def _persistence_fisher_distance(D1, D2, kernel_approx=None, bandwidth=1.):
         vectorj = vectorj/vectorj_sum
         return np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
 
+def _pairwise(fallback, skipdiag, X, Y, metric, n_jobs):
+    if Y is not None:
+        return fallback(X, Y, metric=metric, n_jobs=n_jobs)
+    triu = np.triu_indices(len(X), k=skipdiag)
+    tril = (triu[1], triu[0])
+    par = Parallel(n_jobs=n_jobs, prefer="threads")
+    d = par(delayed(metric)([triu[0][i]], [triu[1][i]]) for i in range(len(triu[0])))
+    m = np.empty((len(X), len(X)))
+    m[triu] = d
+    m[tril] = d
+    if skipdiag:
+        np.fill_diagonal(m, 0)
+    return m
+
 def _sklearn_wrapper(metric, X, Y, **kwargs):
     """
     This function is a wrapper for any metric between two persistence diagrams that takes two numpy arrays of shapes (nx2) and (mx2) as arguments.
@@ -134,7 +149,7 @@ PAIRWISE_DISTANCE_FUNCTIONS = {
     "persistence_fisher": _persistence_fisher_distance,
 }
 
-def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", **kwargs):
+def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", n_jobs=None, **kwargs):
     """
     This function computes the distance matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2).
 
@@ -152,25 +167,25 @@ def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", **kwa
     if metric == "bottleneck":
         try:
             from .. import bottleneck_distance
-            return pairwise_distances(XX, YY, metric=_sklearn_wrapper(bottleneck_distance, X, Y, **kwargs))
+            return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(bottleneck_distance, X, Y, **kwargs), n_jobs=n_jobs)
         except ImportError:
             print("Gudhi built without CGAL")
             raise
     elif metric == "pot_wasserstein":
         try:
             from gudhi.wasserstein import wasserstein_distance as pot_wasserstein_distance
-            return pairwise_distances(XX, YY, metric=_sklearn_wrapper(pot_wasserstein_distance, X, Y, **kwargs))
+            return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(pot_wasserstein_distance, X, Y, **kwargs), n_jobs=n_jobs)
         except ImportError:
             print("POT (Python Optimal Transport) is not installed. Please install POT or use metric='wasserstein' or metric='hera_wasserstein'")
             raise
     elif metric == "sliced_wasserstein":
         Xproj = _compute_persistence_diagram_projections(X, **kwargs)
         Yproj = None if Y is None else _compute_persistence_diagram_projections(Y, **kwargs)
-        return pairwise_distances(XX, YY, metric=_sklearn_wrapper(_sliced_wasserstein_distance_on_projections, Xproj, Yproj))
+        return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(_sliced_wasserstein_distance_on_projections, Xproj, Yproj), n_jobs=n_jobs)
     elif type(metric) == str:
-        return pairwise_distances(XX, YY, metric=_sklearn_wrapper(PAIRWISE_DISTANCE_FUNCTIONS[metric], X, Y, **kwargs))
+        return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(PAIRWISE_DISTANCE_FUNCTIONS[metric], X, Y, **kwargs), n_jobs=n_jobs)
     else:
-        return pairwise_distances(XX, YY, metric=_sklearn_wrapper(metric, X, Y, **kwargs))
+        return _pairwise(pairwise_distances, True, XX, YY, metric=_sklearn_wrapper(metric, X, Y, **kwargs), n_jobs=n_jobs)
 
 class SlicedWassersteinDistance(BaseEstimator, TransformerMixin):
     """
-- 
cgit v1.2.3