summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authortlacombe <lacombe1993@gmail.com>2019-12-05 18:42:48 +0100
committertlacombe <lacombe1993@gmail.com>2019-12-05 18:42:48 +0100
commit80aa14d1b92d1a61366d798b07073289d4db4fda (patch)
treed6fbe5c7bc404e91517154b3067f414f39bcb820 /src
parent48f7e17c5e9d4f6936bfdf6384015fe833e30c74 (diff)
first version of barycenter for persistence diagrams
Diffstat (limited to 'src')
-rw-r--r--src/python/doc/barycenter_sum.inc22
-rw-r--r--src/python/doc/barycenter_user.rst51
-rw-r--r--src/python/gudhi/barycenter.py322
3 files changed, 292 insertions, 103 deletions
diff --git a/src/python/doc/barycenter_sum.inc b/src/python/doc/barycenter_sum.inc
new file mode 100644
index 00000000..7801a845
--- /dev/null
+++ b/src/python/doc/barycenter_sum.inc
@@ -0,0 +1,22 @@
+.. table::
+ :widths: 30 50 20
+
+ +-----------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------+
+ | .. figure:: | A Frechet mean (or barycenter) is a generalization of the arithmetic | :Author: Theo Lacombe |
+ | ../../doc/Barycenter/barycenter.png | mean in a non linear space such as the one of persistence diagrams. | |
+ | :figclass: align-center | Given a set of persistence diagrams :math:`\mu_1 \dots \mu_n`, it is | :Introduced in: GUDHI 3.1.0 |
+ | | defined as a minimizer of the variance functional, that is of | |
+ | Illustration of Frechet mean between persistence | :math:`\mu \mapsto \sum_{i=1}^n d_2(\mu,\mu_i)^2`. | :Copyright: MIT |
+ | diagrams. | where :math:`d_2` denotes the Wasserstein-2 distance between persis- | |
+ | | tence diagrams. | |
+ | | It is known to exist and is generically unique. However, an exact | |
+ | | computation is in general untractable. Current implementation avai- | :Requires: Python Optimal Transport (POT) :math:`\geq` 0.5.1 |
+ | | -lable is based on [Turner et al, 2014], and uses an EM-scheme to | |
+ | | provide a local minimum of the variance functional (somewhat similar | |
+ | | to the Lloyd algorithm to estimate a solution to the k-means | |
+ | | problem). The combinatorial structure of the algorithm limits its | |
+ | | scaling on large scale problems (thousands of diagrams and of points | |
+ | | per diagram). | |
+ +-----------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------+
+ | * :doc:`barycenter_user` | |
+ +-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
diff --git a/src/python/doc/barycenter_user.rst b/src/python/doc/barycenter_user.rst
new file mode 100644
index 00000000..fae2854a
--- /dev/null
+++ b/src/python/doc/barycenter_user.rst
@@ -0,0 +1,51 @@
+:orphan:
+
+.. To get rid of WARNING: document isn't included in any toctree
+
+Wasserstein distance user manual
+================================
+Definition
+----------
+
+.. include:: wasserstein_distance_sum.inc
+
+This implementation is based on ideas from "Large Scale Computation of Means and Cluster for Persistence Diagrams via Optimal Transport".
+
+Function
+--------
+.. autofunction:: gudhi.barycenter.lagrangian_barycenter
+
+
+Basic example
+-------------
+
+This example computes the Frechet mean (aka Wasserstein barycenter) between four persistence diagrams.
+It is initialized on the 4th diagram, which is the empty diagram. It is encoded by np.array([]).
+Note that persistence diagrams must be submitted as (n x 2) numpy arrays and must not contain inf values.
+
+.. testcode::
+
+ import gudhi.barycenter
+ import numpy as np
+
+ dg1 = np.array([[0.2, 0.5]])
+ dg2 = np.array([[0.2, 0.7]])
+ dg3 = np.array([[0.3, 0.6], [0.7, 0.8], [0.2, 0.3]])
+ dg4 = np.array([])
+
+ bary = gudhi.barycenter.lagrangian_barycenter(pdiagset=[dg1, dg2, dg3, dg4],init=3))
+
+ message = "Wasserstein barycenter estimated:"
+ print(message)
+ print(bary)
+
+The output is:
+
+.. testoutput::
+
+ Wasserstein barycenter estimated:
+ [[0.27916667 0.55416667]
+ [0.7375 0.7625 ]
+ [0.2375 0.2625 ]]
+
+
diff --git a/src/python/gudhi/barycenter.py b/src/python/gudhi/barycenter.py
index 85666631..3cd214a7 100644
--- a/src/python/gudhi/barycenter.py
+++ b/src/python/gudhi/barycenter.py
@@ -1,75 +1,105 @@
import ot
import numpy as np
-import matplotlib.pyplot as plt
-from matplotlib.patches import Polygon
+import scipy.spatial.distance as sc
-def _proj_on_diag(x):
- """
- :param x: numpy.array of length 2, encoding a point on the upper half plane.
- :returns: numpy.array of length 2, orthogonal projection of the point onto
- the diagonal.
- """
- return np.array([(x[0] + x[1]) / 2, (x[0] + x[1]) / 2])
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Theo Lacombe
+#
+# Copyright (C) 2019 Inria
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
-def _norm2(x, y):
- """
- :param x: numpy.array of length 2, encoding a point on the upper half plane.
- :param y: numpy.array of length 2, encoding a point on the upper half plane.
- :returns: distance between the two points for the euclidean norm.
- """
- return (y[0] - x[0])**2 + (y[1] - x[1])**2
+def _proj_on_diag(w):
+ '''
+ Util function to project a point on the diag.
+ '''
+ return np.array([(w[0] + w[1])/2 , (w[0] + w[1])/2])
-def _cost_matrix(X, Y):
- """
- :param X: (n x 2) numpy.array encoding the first diagram
- :param Y: (m x 2) numpy.array encoding the second diagram
- :return: numpy.array with size (k x k) where k = |X| + |Y|, encoding the
- cost matrix between points (including the diagonal, with repetition to
- ensure one-to-one matchings.
- """
- n, m = len(X), len(Y)
- k = n + m
- M = np.zeros((k, k))
- for i in range(n): # go throught X points
- x_i = X[i]
- p_x_i = _proj_on_diag(x_i) # proj of x_i on the diagonal
- dist_x_delta = _norm2(x_i, p_x_i) # distance to the diagonal regarding the ground norm
- for j in range(m): # go throught d_2 points
- y_j = Y[j]
- p_y_j = _proj_on_diag(y_j)
- M[i, j] = _norm2(x_i, y_j)
- dist_y_delta = _norm2(y_j, p_y_j)
- for it in range(m):
- M[n + it, j] = dist_y_delta
- for it in range(n):
- M[i, m + it] = dist_x_delta
-
- return M
-
-
-def _optimal_matching(M):
+
+def _proj_on_diag_array(X):
+ '''
+ :param X: (n x 2) array encoding the points of a persistent diagram.
+ :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal
+ '''
+ Z = (X[:,0] + X[:,1]) / 2.
+ return np.array([Z , Z]).T
+
+
+def _build_dist_matrix(X, Y, p=2., q=2.):
+ '''
+ :param X: (n x 2) numpy.array encoding the (points of the) first diagram.
+ :param Y: (m x 2) numpy.array encoding the second diagram.
+ :param q: Ground metric (i.e. norm l_q).
+ :param p: exponent for the Wasserstein metric.
+ :returns: (n+1) x (m+1) np.array encoding the cost matrix C.
+ For 1 <= i <= n, 1 <= j <= m, C[i,j] encodes the distance between X[i] and Y[j], while C[i, m+1] (resp. C[n+1, j]) encodes the distance (to the p) between X[i] (resp Y[j]) and its orthogonal proj onto the diagonal.
+ note also that C[n+1, m+1] = 0 (it costs nothing to move from the diagonal to the diagonal).
+ Note that for lagrangian_barycenter, one must use p=q=2.
+ '''
+ Xdiag = _proj_on_diag_array(X)
+ Ydiag = _proj_on_diag_array(Y)
+ if np.isinf(q):
+ C = sc.cdist(X, Y, metric='chebyshev')**p
+ Cxd = np.linalg.norm(X - Xdiag, ord=q, axis=1)**p
+ Cdy = np.linalg.norm(Y - Ydiag, ord=q, axis=1)**p
+ else:
+ C = sc.cdist(X,Y, metric='minkowski', p=q)**p
+ Cxd = np.linalg.norm(X - Xdiag, ord=q, axis=1)**p
+ Cdy = np.linalg.norm(Y - Ydiag, ord=q, axis=1)**p
+ Cf = np.hstack((C, Cxd[:,None]))
+ Cdy = np.append(Cdy, 0)
+
+ Cf = np.vstack((Cf, Cdy[None,:]))
+
+ return Cf
+
+
+def _optimal_matching(X, Y):
"""
- :param M: numpy.array of size (k x k), encoding the cost matrix between the
- points of two diagrams.
- :returns: list of length (k) such that L[i] = j if and only if P[i,j]=1
- where P is a bi-stochastic matrix that minimize <P, M>.
+ :param X: numpy.array of size (n x 2)
+ :param Y: numpy.array of size (m x 2)
+ :returns: numpy.array of shape (k x 2) encoding the list of edges in the optimal matching.
+ That is, [[(i, j) ...], where (i,j) indicates that X[i] is matched to Y[j]
+ if i > len(X) or j > len(Y), it means they represent the diagonal.
+
"""
- n = len(M)
- # if input weights are empty lists, pot treats the uniform assignement
- # problem and returns a bistochastic matrix (up to *n).
- P = ot.emd(a=[], b=[], M=M) * n
- # return the list of indices j such that L[i] = j iff P[i,j] = 1
- return np.nonzero(P)[1]
+
+ n = len(X)
+ m = len(Y)
+ if X.size == 0: # X is empty
+ if Y.size == 0: # Y is empty
+ return np.array([[0,0]]) # the diagonal is matched to the diagonal and that's it...
+ else:
+ return np.column_stack([np.zeros(m+1, dtype=int), np.arange(m+1, dtype=int)]) # TO BE CORRECTED
+ elif Y.size == 0: # X is not empty but Y is empty
+ return np.column_stack([np.zeros(n+1, dtype=int), np.arange(n+1, dtype=int)]) # TO BE CORRECTED
+
+ # we know X, Y are not empty diags now
+ M = _build_dist_matrix(X, Y)
+
+ a = np.full(n+1, 1. / (n + m) ) # weight vector of the input diagram. Uniform here.
+ a[-1] = a[-1] * m # normalized so that we have a probability measure, required by POT
+ b = np.full(m+1, 1. / (n + m) ) # weight vector of the input diagram. Uniform here.
+ b[-1] = b[-1] * n # so that we have a probability measure, required by POT
+ P = ot.emd(a=a, b=b, M=M)*(n+m)
+ # Note : it seems POT return a permutation matrix in this situation,
+ # ...guarantee...?
+ # It should be enough to check that the algorithm only iterates on vertices of the transportation polytope.
+ P[P < 0.5] = 0 # dirty trick to avoid some numerical issues... to be improved.
+ # return the list of (i,j) such that P[i,j] > 0, i.e. x_i is matched to y_j (should it be the diag).
+ res = np.nonzero(P)
+ return np.column_stack(res)
def _mean(x, m):
"""
- :param x: a list of 2D-points, of diagonal, x_0... x_{k-1}
+ :param x: a list of 2D-points, off diagonal, x_0... x_{k-1}
:param m: total amount of points taken into account, that is we have (m-k) copies of diagonal
- :returns: the weighted mean of x with (m-k) copies of Delta taken into
- account.
+ :returns: the weighted mean of x with (m-k) copies of the diagonal
"""
k = len(x)
if k > 0:
@@ -88,44 +118,54 @@ def lagrangian_barycenter(pdiagset, init=None, verbose=False):
:param pdiagset: a list of size N containing numpy.array of shape (n x 2)
(n can variate), encoding a set of
persistence diagrams with only finite coordinates.
- :param init: The initial value for barycenter estimate.
- If None, init is made on a random diagram from the dataset.
- Otherwise, it must be a (n x 2) numpy.array enconding a persistence diagram with n points.
- :param verbose: if True, returns additional information about the
- barycenters (assignment and energy).
- :returns: If not verbose (default), a numpy.array encoding
+ :param init: The initial value for barycenter estimate.
+ If None, init is made on a random diagram from the dataset.
+ Otherwise, it must be an int (then we init with diagset[init])
+ or a (n x 2) numpy.array enconding a persistence diagram with n points.
+ :param verbose: if True, returns additional information about the
+ barycenters (assignment and energy).
+ :returns: If not verbose (default), a numpy.array encoding
the barycenter estimate (local minima of the energy function).
If verbose, returns a triplet (Y, a, e)
where Y is the barycenter estimate, a is the assignments between the
points of Y and thoses of the diagrams,
and e is the energy value reached by the estimate.
"""
- m = len(pdiagset) # number of diagrams we are averaging
- X = pdiagset # to shorten notations
+ X = pdiagset # to shorten notations, not a copy
+ m = len(X) # number of diagrams we are averaging
+ if m == 0:
+ print("Warning: computing barycenter of empty diag set. Returns None")
+ return None
+
nb_off_diag = np.array([len(X_i) for X_i in X]) # store the number of off-diagonal point for each of the X_i
# Initialisation of barycenter
if init is None:
i0 = np.random.randint(m) # Index of first state for the barycenter
- Y = X[i0].copy()
+ Y = X[i0].copy() #copy() ensure that we do not modify X[i0]
else:
- Y = init.copy()
+ if type(init)==int:
+ Y = X[init].copy()
+ else:
+ Y = init.copy()
- not_converged = True # stoping criterion
- while not_converged:
+ converged = False # stoping criterion
+ while not converged:
K = len(Y) # current nb of points in Y (some might be on diagonal)
- G = np.zeros((K, m)) # will store for each j, the (index) point matched in each other diagram (might be the diagonal).
+ G = np.zeros((K, m), dtype=int)-1 # will store for each j, the (index) point matched in each other diagram (might be the diagonal).
+ # that is G[j, i] = k <=> y_j is matched to
+ # x_k in the diagram i-th diagram X[i]
updated_points = np.zeros((K, 2)) # will store the new positions of
# the points of Y.
# If points disappear, there thrown
# on [0,0] by default.
- new_created_points = [] # will store eventual new points.
+ new_created_points = [] # will store potential new points.
# Step 1 : compute optimal matching (Y, X_i) for each X_i
+ # and create new points in Y if needed
for i in range(m):
- M = _cost_matrix(Y, X[i])
- indices = _optimal_matching(M)
- for y_j, x_i_j in enumerate(indices):
+ indices = _optimal_matching(Y, X[i])
+ for y_j, x_i_j in indices:
if y_j < K: # we matched an off diagonal point to x_i_j...
if x_i_j < nb_off_diag[i]: # ...which is also an off-diagonal point
G[y_j, i] = x_i_j
@@ -136,32 +176,40 @@ def lagrangian_barycenter(pdiagset, init=None, verbose=False):
new_y = _mean(np.array([X[i][x_i_j]]), m) # Average this point with (m-1) copies of Delta
new_created_points.append(new_y)
- # Step 2 : Compute new points (mean)
+ # Step 2 : Update current point position thanks to the groupings computed
+
+ to_delete = []
for j in range(K):
- matched_points = [X[i][int(G[j, i])] for i in range(m) if G[j, i] > -1]
- updated_points[j] = _mean(matched_points, m)
+ matched_points = [X[i][G[j, i]] for i in range(m) if G[j, i] > -1]
+ new_y_j = _mean(matched_points, m)
+ if not np.array_equal(new_y_j, np.array([0,0])):
+ updated_points[j] = new_y_j
+ else: # this points is no longer of any use.
+ to_delete.append(j)
+ # we remove the point to be deleted now.
+ updated_points = np.delete(updated_points, to_delete, axis=0) # cannot be done in-place.
- if new_created_points:
+
+ if new_created_points: # we cannot converge if there have been new created points.
Y = np.concatenate((updated_points, new_created_points))
else:
+ # Step 3 : we check convergence
+ if np.array_equal(updated_points, Y):
+ converged = True
Y = updated_points
- # Step 3 : we update our estimation of the barycenter
- if len(new_created_points) == 0 and np.array_equal(updated_points, Y):
- not_converged = False
if verbose:
matchings = []
- energy = 0
+ #energy = 0
n_y = len(Y)
for i in range(m):
- M = _cost_matrix(Y, X[i])
- edges = _optimal_matching(M)
+ edges = _optimal_matching(Y, X[i])
matchings.append([x_i_j for (y_j, x_i_j) in enumerate(edges) if y_j < n_y])
- energy += sum([M[i,j] for i,j in enumerate(edges)])
+ # energy += sum([M[i,j] for i,j in enumerate(edges)])
- energy = energy/m
- return Y, matchings, energy
+ # energy = energy/m
+ return Y, matchings #, energy
else:
return Y
@@ -174,6 +222,11 @@ def _plot_barycenter(X, Y, matchings):
the i-th point of the barycenter is grouped with the j-th point of the k-th
diagram.
"""
+ # import matplotlib now to avoid useless dependancies
+
+ import matplotlib.pyplot as plt
+ from matplotlib.patches import Polygon
+
fig = plt.figure()
ax = fig.add_subplot(111)
@@ -182,7 +235,7 @@ def _plot_barycenter(X, Y, matchings):
indices = matchings[i]
n_i = len(X[i])
- for (y_j, x_i_j) in enumerate(indices):
+ for (y_j, x_i_j) in indices:
y = Y[y_j]
if y[0] != y[1]:
if x_i_j < n_i: # not mapped with the diag
@@ -192,16 +245,20 @@ def _plot_barycenter(X, Y, matchings):
ax.plot([y[0], x[0]], [y[1], x[1]], c='black',
linestyle="dashed")
- ax.scatter(Y[:,0], Y[:,1], color='purple', marker='d')
+ ax.scatter(Y[:,0], Y[:,1], color='purple', marker='d', zorder=2)
- for dgm in X:
- ax.scatter(dgm[:,0], dgm[:,1], marker ='o')
+ for X_i in X:
+ if X_i.size > 0:
+ ax.scatter(X_i[:,0], X_i[:,1], marker ='o', zorder=2)
shift = 0.1 # for improved rendering
- xmin = min([np.min(x[:,0]) for x in X]) - shift
- xmax = max([np.max(x[:,0]) for x in X]) + shift
- ymin = min([np.max(x[:,1]) for x in X]) - shift
- ymax = max([np.max(x[:,1]) for x in X]) + shift
+ try:
+ xmin = np.min(np.array([np.min(x[:,0]) for x in X if len(x) > 0]) - shift)
+ xmax = np.max(np.array([np.max(x[:,0]) for x in X if len(x) > 0]) + shift)
+ ymin = np.min(np.array([np.max(x[:,1]) for x in X if len(x) > 0]) - shift)
+ ymax = np.max(np.array([np.max(x[:,1]) for x in X if len(x) > 0]) + shift)
+ except ValueError: # to handle the pecular case where we only average empty diagrams.
+ xmin, xmax, ymin, ymax = 0, 1, 0, 1
themin = min(xmin, ymin)
themax = max(xmax, ymax)
ax.set_xlim(themin, themax)
@@ -212,12 +269,71 @@ def _plot_barycenter(X, Y, matchings):
ax.set_aspect('equal', adjustable='box')
ax.set_title("Estimated barycenter")
+ plt.show()
+
+
+def _test_perf():
+ nb_repeat = 10
+ nb_points_in_dgm = [5, 10, 20, 50, 100]
+ nb_dmg = [3, 5, 10, 20]
+
+ from time import time
+ for m in nb_dmg:
+ for n in nb_points_in_dgm:
+ tstart = time()
+ for _ in range(nb_repeat):
+ X = [np.random.rand(n, 2) for _ in range(m)]
+ for diag in X:
+ #enforce having diagrams
+ diag[:,1] = diag[:,1] + diag[:,0]
+ _ = lagrangian_barycenter(X)
+ tend = time()
+ print("Computation of barycenter in %s sec, with k = %s diags and n = %s points per diag."%(np.round((tend - tstart)/nb_repeat, 2), m, n))
+ print("********************")
+
+
+def _sanity_check(verbose):
+ #dg1 = np.array([[0.2, 0.5]])
+ #dg2 = np.array([[0.2, 0.7]])
+ #dg3 = np.array([[0.3, 0.6], [0.7, 0.8], [0.2, 0.3]])
+ #dg4 = np.array([[0.72, 0.82]])
+ #X = [dg1, dg2, dg3, dg4]
+ #Y, a = lagrangian_barycenter(X, verbose=verbose)
+ #_plot_barycenter(X, Y, a)
+
+ #dg1 = np.array([[0.2, 0.5]])
+ #dg2 = np.array([]) # The empty diagram
+ #dg3 = np.array([[0.4, 0.8]])
+ #X = [dg1, dg2, dg3]
+ #Y, a = lagrangian_barycenter(X, verbose=verbose)
+ #_plot_barycenter(X, Y, a)
+
+ #dg1 = np.array([])
+ #dg2 = np.array([]) # The empty diagram
+ #dg3 = np.array([])
+ #X = [dg1, dg2, dg3]
+ #Y, a = lagrangian_barycenter(X, verbose=verbose)
+ #_plot_barycenter(X, Y, a)
+
+ #dg1 = np.array([[0.1, 0.12], [0.21, 0.7], [0.4, 0.5], [0.3, 0.4], [0.35, 0.7], [0.5, 0.55], [0.32, 0.42], [0.1, 0.4], [0.2, 0.4]])
+ #dg2 = np.array([[0.09, 0.11], [0.3, 0.43], [0.5, 0.61], [0.3, 0.7], [0.42, 0.5], [0.35, 0.41], [0.74, 0.9], [0.5, 0.95], [0.35, 0.45], [0.13, 0.48], [0.32, 0.45]])
+ #dg3 = np.array([[0.1, 0.15], [0.1, 0.7], [0.2, 0.22], [0.55, 0.84], [0.11, 0.91], [0.61, 0.75], [0.33, 0.46], [0.12, 0.41], [0.32, 0.48]])
+ #X = [dg1, dg2, dg3]
+ #Y, a = lagrangian_barycenter(X, init=1, verbose=verbose)
+ #_plot_barycenter(X, Y, a)
+
+
+ dg1 = np.array([[0.2, 0.5]])
+ dg2 = np.array([[0.2, 0.7]])
+ dg3 = np.array([[0.3, 0.6], [0.7, 0.8], [0.2, 0.3]])
+ dg4 = np.array([])
+
+ bary = lagrangian_barycenter(pdiagset=[dg1, dg2, dg3, dg4],init=3)
+
+ message = "Wasserstein barycenter estimated:"
+ print(message)
+ print(bary)
if __name__=="__main__":
- dg1 = np.array([[0.1, 0.12], [0.21, 0.7], [0.4, 0.5], [0.3, 0.4], [0.35, 0.7], [0.5, 0.55], [0.32, 0.42], [0.1, 0.4], [0.2, 0.4]])
- dg2 = np.array([[0.09, 0.11], [0.3, 0.43], [0.5, 0.61], [0.3, 0.7], [0.42, 0.5], [0.35, 0.41], [0.74, 0.9], [0.5, 0.95], [0.35, 0.45], [0.13, 0.48], [0.32, 0.45]])
- dg3 = np.array([[0.1, 0.15], [0.1, 0.7], [0.2, 0.22], [0.55, 0.84], [0.11, 0.91], [0.61, 0.75], [0.33, 0.46], [0.12, 0.41], [0.32, 0.48]])
- X = [dg1, dg2, dg3]
- Y, a, e = lagrangian_barycenter(X, verbose=True)
- _plot_barycenter(X, Y, a)
- plt.show()
+ _sanity_check(verbose = True)
+ #_test_perf()