From 80aa14d1b92d1a61366d798b07073289d4db4fda Mon Sep 17 00:00:00 2001 From: tlacombe Date: Thu, 5 Dec 2019 18:42:48 +0100 Subject: first version of barycenter for persistence diagrams --- src/python/doc/barycenter_sum.inc | 22 +++ src/python/doc/barycenter_user.rst | 51 ++++++ src/python/gudhi/barycenter.py | 322 +++++++++++++++++++++++++------------ 3 files changed, 292 insertions(+), 103 deletions(-) create mode 100644 src/python/doc/barycenter_sum.inc create mode 100644 src/python/doc/barycenter_user.rst diff --git a/src/python/doc/barycenter_sum.inc b/src/python/doc/barycenter_sum.inc new file mode 100644 index 00000000..7801a845 --- /dev/null +++ b/src/python/doc/barycenter_sum.inc @@ -0,0 +1,22 @@ +.. table:: + :widths: 30 50 20 + + +-----------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------+ + | .. figure:: | A Frechet mean (or barycenter) is a generalization of the arithmetic | :Author: Theo Lacombe | + | ../../doc/Barycenter/barycenter.png | mean in a non linear space such as the one of persistence diagrams. | | + | :figclass: align-center | Given a set of persistence diagrams :math:`\mu_1 \dots \mu_n`, it is | :Introduced in: GUDHI 3.1.0 | + | | defined as a minimizer of the variance functional, that is of | | + | Illustration of Frechet mean between persistence | :math:`\mu \mapsto \sum_{i=1}^n d_2(\mu,\mu_i)^2`. | :Copyright: MIT | + | diagrams. | where :math:`d_2` denotes the Wasserstein-2 distance between persis- | | + | | tence diagrams. | | + | | It is known to exist and is generically unique. However, an exact | | + | | computation is in general untractable. Current implementation avai- | :Requires: Python Optimal Transport (POT) :math:`\geq` 0.5.1 | + | | -lable is based on [Turner et al, 2014], and uses an EM-scheme to | | + | | provide a local minimum of the variance functional (somewhat similar | | + | | to the Lloyd algorithm to estimate a solution to the k-means | | + | | problem). The combinatorial structure of the algorithm limits its | | + | | scaling on large scale problems (thousands of diagrams and of points | | + | | per diagram). | | + +-----------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------+ + | * :doc:`barycenter_user` | | + +-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/src/python/doc/barycenter_user.rst b/src/python/doc/barycenter_user.rst new file mode 100644 index 00000000..fae2854a --- /dev/null +++ b/src/python/doc/barycenter_user.rst @@ -0,0 +1,51 @@ +:orphan: + +.. To get rid of WARNING: document isn't included in any toctree + +Wasserstein distance user manual +================================ +Definition +---------- + +.. include:: wasserstein_distance_sum.inc + +This implementation is based on ideas from "Large Scale Computation of Means and Cluster for Persistence Diagrams via Optimal Transport". + +Function +-------- +.. autofunction:: gudhi.barycenter.lagrangian_barycenter + + +Basic example +------------- + +This example computes the Frechet mean (aka Wasserstein barycenter) between four persistence diagrams. +It is initialized on the 4th diagram, which is the empty diagram. It is encoded by np.array([]). +Note that persistence diagrams must be submitted as (n x 2) numpy arrays and must not contain inf values. + +.. testcode:: + + import gudhi.barycenter + import numpy as np + + dg1 = np.array([[0.2, 0.5]]) + dg2 = np.array([[0.2, 0.7]]) + dg3 = np.array([[0.3, 0.6], [0.7, 0.8], [0.2, 0.3]]) + dg4 = np.array([]) + + bary = gudhi.barycenter.lagrangian_barycenter(pdiagset=[dg1, dg2, dg3, dg4],init=3)) + + message = "Wasserstein barycenter estimated:" + print(message) + print(bary) + +The output is: + +.. testoutput:: + + Wasserstein barycenter estimated: + [[0.27916667 0.55416667] + [0.7375 0.7625 ] + [0.2375 0.2625 ]] + + diff --git a/src/python/gudhi/barycenter.py b/src/python/gudhi/barycenter.py index 85666631..3cd214a7 100644 --- a/src/python/gudhi/barycenter.py +++ b/src/python/gudhi/barycenter.py @@ -1,75 +1,105 @@ import ot import numpy as np -import matplotlib.pyplot as plt -from matplotlib.patches import Polygon +import scipy.spatial.distance as sc -def _proj_on_diag(x): - """ - :param x: numpy.array of length 2, encoding a point on the upper half plane. - :returns: numpy.array of length 2, orthogonal projection of the point onto - the diagonal. - """ - return np.array([(x[0] + x[1]) / 2, (x[0] + x[1]) / 2]) +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Theo Lacombe +# +# Copyright (C) 2019 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification -def _norm2(x, y): - """ - :param x: numpy.array of length 2, encoding a point on the upper half plane. - :param y: numpy.array of length 2, encoding a point on the upper half plane. - :returns: distance between the two points for the euclidean norm. - """ - return (y[0] - x[0])**2 + (y[1] - x[1])**2 +def _proj_on_diag(w): + ''' + Util function to project a point on the diag. + ''' + return np.array([(w[0] + w[1])/2 , (w[0] + w[1])/2]) -def _cost_matrix(X, Y): - """ - :param X: (n x 2) numpy.array encoding the first diagram - :param Y: (m x 2) numpy.array encoding the second diagram - :return: numpy.array with size (k x k) where k = |X| + |Y|, encoding the - cost matrix between points (including the diagonal, with repetition to - ensure one-to-one matchings. - """ - n, m = len(X), len(Y) - k = n + m - M = np.zeros((k, k)) - for i in range(n): # go throught X points - x_i = X[i] - p_x_i = _proj_on_diag(x_i) # proj of x_i on the diagonal - dist_x_delta = _norm2(x_i, p_x_i) # distance to the diagonal regarding the ground norm - for j in range(m): # go throught d_2 points - y_j = Y[j] - p_y_j = _proj_on_diag(y_j) - M[i, j] = _norm2(x_i, y_j) - dist_y_delta = _norm2(y_j, p_y_j) - for it in range(m): - M[n + it, j] = dist_y_delta - for it in range(n): - M[i, m + it] = dist_x_delta - - return M - - -def _optimal_matching(M): + +def _proj_on_diag_array(X): + ''' + :param X: (n x 2) array encoding the points of a persistent diagram. + :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal + ''' + Z = (X[:,0] + X[:,1]) / 2. + return np.array([Z , Z]).T + + +def _build_dist_matrix(X, Y, p=2., q=2.): + ''' + :param X: (n x 2) numpy.array encoding the (points of the) first diagram. + :param Y: (m x 2) numpy.array encoding the second diagram. + :param q: Ground metric (i.e. norm l_q). + :param p: exponent for the Wasserstein metric. + :returns: (n+1) x (m+1) np.array encoding the cost matrix C. + For 1 <= i <= n, 1 <= j <= m, C[i,j] encodes the distance between X[i] and Y[j], while C[i, m+1] (resp. C[n+1, j]) encodes the distance (to the p) between X[i] (resp Y[j]) and its orthogonal proj onto the diagonal. + note also that C[n+1, m+1] = 0 (it costs nothing to move from the diagonal to the diagonal). + Note that for lagrangian_barycenter, one must use p=q=2. + ''' + Xdiag = _proj_on_diag_array(X) + Ydiag = _proj_on_diag_array(Y) + if np.isinf(q): + C = sc.cdist(X, Y, metric='chebyshev')**p + Cxd = np.linalg.norm(X - Xdiag, ord=q, axis=1)**p + Cdy = np.linalg.norm(Y - Ydiag, ord=q, axis=1)**p + else: + C = sc.cdist(X,Y, metric='minkowski', p=q)**p + Cxd = np.linalg.norm(X - Xdiag, ord=q, axis=1)**p + Cdy = np.linalg.norm(Y - Ydiag, ord=q, axis=1)**p + Cf = np.hstack((C, Cxd[:,None])) + Cdy = np.append(Cdy, 0) + + Cf = np.vstack((Cf, Cdy[None,:])) + + return Cf + + +def _optimal_matching(X, Y): """ - :param M: numpy.array of size (k x k), encoding the cost matrix between the - points of two diagrams. - :returns: list of length (k) such that L[i] = j if and only if P[i,j]=1 - where P is a bi-stochastic matrix that minimize . + :param X: numpy.array of size (n x 2) + :param Y: numpy.array of size (m x 2) + :returns: numpy.array of shape (k x 2) encoding the list of edges in the optimal matching. + That is, [[(i, j) ...], where (i,j) indicates that X[i] is matched to Y[j] + if i > len(X) or j > len(Y), it means they represent the diagonal. + """ - n = len(M) - # if input weights are empty lists, pot treats the uniform assignement - # problem and returns a bistochastic matrix (up to *n). - P = ot.emd(a=[], b=[], M=M) * n - # return the list of indices j such that L[i] = j iff P[i,j] = 1 - return np.nonzero(P)[1] + + n = len(X) + m = len(Y) + if X.size == 0: # X is empty + if Y.size == 0: # Y is empty + return np.array([[0,0]]) # the diagonal is matched to the diagonal and that's it... + else: + return np.column_stack([np.zeros(m+1, dtype=int), np.arange(m+1, dtype=int)]) # TO BE CORRECTED + elif Y.size == 0: # X is not empty but Y is empty + return np.column_stack([np.zeros(n+1, dtype=int), np.arange(n+1, dtype=int)]) # TO BE CORRECTED + + # we know X, Y are not empty diags now + M = _build_dist_matrix(X, Y) + + a = np.full(n+1, 1. / (n + m) ) # weight vector of the input diagram. Uniform here. + a[-1] = a[-1] * m # normalized so that we have a probability measure, required by POT + b = np.full(m+1, 1. / (n + m) ) # weight vector of the input diagram. Uniform here. + b[-1] = b[-1] * n # so that we have a probability measure, required by POT + P = ot.emd(a=a, b=b, M=M)*(n+m) + # Note : it seems POT return a permutation matrix in this situation, + # ...guarantee...? + # It should be enough to check that the algorithm only iterates on vertices of the transportation polytope. + P[P < 0.5] = 0 # dirty trick to avoid some numerical issues... to be improved. + # return the list of (i,j) such that P[i,j] > 0, i.e. x_i is matched to y_j (should it be the diag). + res = np.nonzero(P) + return np.column_stack(res) def _mean(x, m): """ - :param x: a list of 2D-points, of diagonal, x_0... x_{k-1} + :param x: a list of 2D-points, off diagonal, x_0... x_{k-1} :param m: total amount of points taken into account, that is we have (m-k) copies of diagonal - :returns: the weighted mean of x with (m-k) copies of Delta taken into - account. + :returns: the weighted mean of x with (m-k) copies of the diagonal """ k = len(x) if k > 0: @@ -88,44 +118,54 @@ def lagrangian_barycenter(pdiagset, init=None, verbose=False): :param pdiagset: a list of size N containing numpy.array of shape (n x 2) (n can variate), encoding a set of persistence diagrams with only finite coordinates. - :param init: The initial value for barycenter estimate. - If None, init is made on a random diagram from the dataset. - Otherwise, it must be a (n x 2) numpy.array enconding a persistence diagram with n points. - :param verbose: if True, returns additional information about the - barycenters (assignment and energy). - :returns: If not verbose (default), a numpy.array encoding + :param init: The initial value for barycenter estimate. + If None, init is made on a random diagram from the dataset. + Otherwise, it must be an int (then we init with diagset[init]) + or a (n x 2) numpy.array enconding a persistence diagram with n points. + :param verbose: if True, returns additional information about the + barycenters (assignment and energy). + :returns: If not verbose (default), a numpy.array encoding the barycenter estimate (local minima of the energy function). If verbose, returns a triplet (Y, a, e) where Y is the barycenter estimate, a is the assignments between the points of Y and thoses of the diagrams, and e is the energy value reached by the estimate. """ - m = len(pdiagset) # number of diagrams we are averaging - X = pdiagset # to shorten notations + X = pdiagset # to shorten notations, not a copy + m = len(X) # number of diagrams we are averaging + if m == 0: + print("Warning: computing barycenter of empty diag set. Returns None") + return None + nb_off_diag = np.array([len(X_i) for X_i in X]) # store the number of off-diagonal point for each of the X_i # Initialisation of barycenter if init is None: i0 = np.random.randint(m) # Index of first state for the barycenter - Y = X[i0].copy() + Y = X[i0].copy() #copy() ensure that we do not modify X[i0] else: - Y = init.copy() + if type(init)==int: + Y = X[init].copy() + else: + Y = init.copy() - not_converged = True # stoping criterion - while not_converged: + converged = False # stoping criterion + while not converged: K = len(Y) # current nb of points in Y (some might be on diagonal) - G = np.zeros((K, m)) # will store for each j, the (index) point matched in each other diagram (might be the diagonal). + G = np.zeros((K, m), dtype=int)-1 # will store for each j, the (index) point matched in each other diagram (might be the diagonal). + # that is G[j, i] = k <=> y_j is matched to + # x_k in the diagram i-th diagram X[i] updated_points = np.zeros((K, 2)) # will store the new positions of # the points of Y. # If points disappear, there thrown # on [0,0] by default. - new_created_points = [] # will store eventual new points. + new_created_points = [] # will store potential new points. # Step 1 : compute optimal matching (Y, X_i) for each X_i + # and create new points in Y if needed for i in range(m): - M = _cost_matrix(Y, X[i]) - indices = _optimal_matching(M) - for y_j, x_i_j in enumerate(indices): + indices = _optimal_matching(Y, X[i]) + for y_j, x_i_j in indices: if y_j < K: # we matched an off diagonal point to x_i_j... if x_i_j < nb_off_diag[i]: # ...which is also an off-diagonal point G[y_j, i] = x_i_j @@ -136,32 +176,40 @@ def lagrangian_barycenter(pdiagset, init=None, verbose=False): new_y = _mean(np.array([X[i][x_i_j]]), m) # Average this point with (m-1) copies of Delta new_created_points.append(new_y) - # Step 2 : Compute new points (mean) + # Step 2 : Update current point position thanks to the groupings computed + + to_delete = [] for j in range(K): - matched_points = [X[i][int(G[j, i])] for i in range(m) if G[j, i] > -1] - updated_points[j] = _mean(matched_points, m) + matched_points = [X[i][G[j, i]] for i in range(m) if G[j, i] > -1] + new_y_j = _mean(matched_points, m) + if not np.array_equal(new_y_j, np.array([0,0])): + updated_points[j] = new_y_j + else: # this points is no longer of any use. + to_delete.append(j) + # we remove the point to be deleted now. + updated_points = np.delete(updated_points, to_delete, axis=0) # cannot be done in-place. - if new_created_points: + + if new_created_points: # we cannot converge if there have been new created points. Y = np.concatenate((updated_points, new_created_points)) else: + # Step 3 : we check convergence + if np.array_equal(updated_points, Y): + converged = True Y = updated_points - # Step 3 : we update our estimation of the barycenter - if len(new_created_points) == 0 and np.array_equal(updated_points, Y): - not_converged = False if verbose: matchings = [] - energy = 0 + #energy = 0 n_y = len(Y) for i in range(m): - M = _cost_matrix(Y, X[i]) - edges = _optimal_matching(M) + edges = _optimal_matching(Y, X[i]) matchings.append([x_i_j for (y_j, x_i_j) in enumerate(edges) if y_j < n_y]) - energy += sum([M[i,j] for i,j in enumerate(edges)]) + # energy += sum([M[i,j] for i,j in enumerate(edges)]) - energy = energy/m - return Y, matchings, energy + # energy = energy/m + return Y, matchings #, energy else: return Y @@ -174,6 +222,11 @@ def _plot_barycenter(X, Y, matchings): the i-th point of the barycenter is grouped with the j-th point of the k-th diagram. """ + # import matplotlib now to avoid useless dependancies + + import matplotlib.pyplot as plt + from matplotlib.patches import Polygon + fig = plt.figure() ax = fig.add_subplot(111) @@ -182,7 +235,7 @@ def _plot_barycenter(X, Y, matchings): indices = matchings[i] n_i = len(X[i]) - for (y_j, x_i_j) in enumerate(indices): + for (y_j, x_i_j) in indices: y = Y[y_j] if y[0] != y[1]: if x_i_j < n_i: # not mapped with the diag @@ -192,16 +245,20 @@ def _plot_barycenter(X, Y, matchings): ax.plot([y[0], x[0]], [y[1], x[1]], c='black', linestyle="dashed") - ax.scatter(Y[:,0], Y[:,1], color='purple', marker='d') + ax.scatter(Y[:,0], Y[:,1], color='purple', marker='d', zorder=2) - for dgm in X: - ax.scatter(dgm[:,0], dgm[:,1], marker ='o') + for X_i in X: + if X_i.size > 0: + ax.scatter(X_i[:,0], X_i[:,1], marker ='o', zorder=2) shift = 0.1 # for improved rendering - xmin = min([np.min(x[:,0]) for x in X]) - shift - xmax = max([np.max(x[:,0]) for x in X]) + shift - ymin = min([np.max(x[:,1]) for x in X]) - shift - ymax = max([np.max(x[:,1]) for x in X]) + shift + try: + xmin = np.min(np.array([np.min(x[:,0]) for x in X if len(x) > 0]) - shift) + xmax = np.max(np.array([np.max(x[:,0]) for x in X if len(x) > 0]) + shift) + ymin = np.min(np.array([np.max(x[:,1]) for x in X if len(x) > 0]) - shift) + ymax = np.max(np.array([np.max(x[:,1]) for x in X if len(x) > 0]) + shift) + except ValueError: # to handle the pecular case where we only average empty diagrams. + xmin, xmax, ymin, ymax = 0, 1, 0, 1 themin = min(xmin, ymin) themax = max(xmax, ymax) ax.set_xlim(themin, themax) @@ -212,12 +269,71 @@ def _plot_barycenter(X, Y, matchings): ax.set_aspect('equal', adjustable='box') ax.set_title("Estimated barycenter") + plt.show() + + +def _test_perf(): + nb_repeat = 10 + nb_points_in_dgm = [5, 10, 20, 50, 100] + nb_dmg = [3, 5, 10, 20] + + from time import time + for m in nb_dmg: + for n in nb_points_in_dgm: + tstart = time() + for _ in range(nb_repeat): + X = [np.random.rand(n, 2) for _ in range(m)] + for diag in X: + #enforce having diagrams + diag[:,1] = diag[:,1] + diag[:,0] + _ = lagrangian_barycenter(X) + tend = time() + print("Computation of barycenter in %s sec, with k = %s diags and n = %s points per diag."%(np.round((tend - tstart)/nb_repeat, 2), m, n)) + print("********************") + + +def _sanity_check(verbose): + #dg1 = np.array([[0.2, 0.5]]) + #dg2 = np.array([[0.2, 0.7]]) + #dg3 = np.array([[0.3, 0.6], [0.7, 0.8], [0.2, 0.3]]) + #dg4 = np.array([[0.72, 0.82]]) + #X = [dg1, dg2, dg3, dg4] + #Y, a = lagrangian_barycenter(X, verbose=verbose) + #_plot_barycenter(X, Y, a) + + #dg1 = np.array([[0.2, 0.5]]) + #dg2 = np.array([]) # The empty diagram + #dg3 = np.array([[0.4, 0.8]]) + #X = [dg1, dg2, dg3] + #Y, a = lagrangian_barycenter(X, verbose=verbose) + #_plot_barycenter(X, Y, a) + + #dg1 = np.array([]) + #dg2 = np.array([]) # The empty diagram + #dg3 = np.array([]) + #X = [dg1, dg2, dg3] + #Y, a = lagrangian_barycenter(X, verbose=verbose) + #_plot_barycenter(X, Y, a) + + #dg1 = np.array([[0.1, 0.12], [0.21, 0.7], [0.4, 0.5], [0.3, 0.4], [0.35, 0.7], [0.5, 0.55], [0.32, 0.42], [0.1, 0.4], [0.2, 0.4]]) + #dg2 = np.array([[0.09, 0.11], [0.3, 0.43], [0.5, 0.61], [0.3, 0.7], [0.42, 0.5], [0.35, 0.41], [0.74, 0.9], [0.5, 0.95], [0.35, 0.45], [0.13, 0.48], [0.32, 0.45]]) + #dg3 = np.array([[0.1, 0.15], [0.1, 0.7], [0.2, 0.22], [0.55, 0.84], [0.11, 0.91], [0.61, 0.75], [0.33, 0.46], [0.12, 0.41], [0.32, 0.48]]) + #X = [dg1, dg2, dg3] + #Y, a = lagrangian_barycenter(X, init=1, verbose=verbose) + #_plot_barycenter(X, Y, a) + + + dg1 = np.array([[0.2, 0.5]]) + dg2 = np.array([[0.2, 0.7]]) + dg3 = np.array([[0.3, 0.6], [0.7, 0.8], [0.2, 0.3]]) + dg4 = np.array([]) + + bary = lagrangian_barycenter(pdiagset=[dg1, dg2, dg3, dg4],init=3) + + message = "Wasserstein barycenter estimated:" + print(message) + print(bary) if __name__=="__main__": - dg1 = np.array([[0.1, 0.12], [0.21, 0.7], [0.4, 0.5], [0.3, 0.4], [0.35, 0.7], [0.5, 0.55], [0.32, 0.42], [0.1, 0.4], [0.2, 0.4]]) - dg2 = np.array([[0.09, 0.11], [0.3, 0.43], [0.5, 0.61], [0.3, 0.7], [0.42, 0.5], [0.35, 0.41], [0.74, 0.9], [0.5, 0.95], [0.35, 0.45], [0.13, 0.48], [0.32, 0.45]]) - dg3 = np.array([[0.1, 0.15], [0.1, 0.7], [0.2, 0.22], [0.55, 0.84], [0.11, 0.91], [0.61, 0.75], [0.33, 0.46], [0.12, 0.41], [0.32, 0.48]]) - X = [dg1, dg2, dg3] - Y, a, e = lagrangian_barycenter(X, verbose=True) - _plot_barycenter(X, Y, a) - plt.show() + _sanity_check(verbose = True) + #_test_perf() -- cgit v1.2.3