created wasserstein repo

author: tlacombe <lacombe1993@gmail.com> 2020-03-31 15:02:32 +0200
committer: tlacombe <lacombe1993@gmail.com> 2020-03-31 15:02:32 +0200
commit: 842475615841f864b4ce41a2a4b69f1e189a2946 (patch)
tree: 3102227e3be0bb019b3baabaf79b575b260ff21c /src/python/gudhi/wasserstein
parent: 1aaffd2e1fab45988d92f5e51a9d294696ff5492 (diff)
3 files changed, 284 insertions, 0 deletions
diff --git a/src/python/gudhi/wasserstein/__init__.py b/src/python/gudhi/wasserstein/__init__.py
new file mode 100644
index 00000000..ed225ba4
--- /dev/null
+++ b/src/python/gudhi/wasserstein/__init__.py
@@ -0,0 +1 @@
+from .wasserstein import wasserstein_distance
diff --git a/src/python/gudhi/wasserstein/barycenter.py b/src/python/gudhi/wasserstein/barycenter.py
new file mode 100644
index 00000000..079bcc57
--- /dev/null
+++ b/src/python/gudhi/wasserstein/barycenter.py
@@ -0,0 +1,158 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s):       Theo Lacombe
+#
+# Copyright (C) 2019 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+
+import ot
+import numpy as np
+import scipy.spatial.distance as sc
+
+from gudhi.wasserstein import wasserstein_distance
+
+
+def _mean(x, m):
+    '''
+    :param x: a list of 2D-points, off diagonal, x_0... x_{k-1}
+    :param m: total amount of points taken into account, 
+                that is we have (m-k) copies of diagonal
+    :returns: the weighted mean of x with (m-k) copies of the diagonal
+    '''
+    k = len(x)
+    if k > 0:
+        w = np.mean(x, axis=0)
+        w_delta = (w[0] + w[1]) / 2 * np.ones(2)
+        return (k * w + (m-k) * w_delta) / m
+    else:
+        return np.array([0, 0])
+
+
+def lagrangian_barycenter(pdiagset, init=None, verbose=False):
+    '''
+    :param pdiagset: a list of size m containing numpy.array of shape (n x 2) 
+                    (n can variate), encoding a set of 
+                    persistence diagrams with only finite coordinates. 
+    :param init: The initial value for barycenter estimate. 
+                    If None, init is made on a random diagram from the dataset. 
+                    Otherwise, it must be an int 
+                    (then we init with diagset[init])
+                    or a (n x 2) numpy.array enconding 
+                    a persistence diagram with n points.
+    :param verbose: if True, returns additional information about the
+                    barycenter.
+    :returns: If not verbose (default), a numpy.array encoding
+                    the barycenter estimate of pdiagset
+                    (local minima of the energy function). 
+                    If pdiagset is empty, returns None.
+                    If verbose, returns a couple (Y, log)
+                    where Y is the barycenter estimate,
+                    and log is a dict that contains additional informations:
+                    - groupings, a list of list of pairs (i,j),
+                    That is, G[k] = [(i, j) ...], where (i,j) indicates 
+                    that X[i] is matched to Y[j]
+                    if i = -1 or j = -1, it means they 
+                    represent the diagonal.
+                    - energy, a float representing the Frechet 
+                    energy value obtained,
+                    that is the mean of squared distances 
+                    of observations to the output.
+                    - nb_iter, integer representing the number of iterations 
+                    performed before convergence of the algorithm.
+    '''
+    X = pdiagset  # to shorten notations, not a copy
+    m = len(X)  # number of diagrams we are averaging
+    if m == 0:
+        print("Warning: computing barycenter of empty diag set. Returns None")
+        return None
+    
+    # store the number of off-diagonal point for each of the X_i
+    nb_off_diag = np.array([len(X_i) for X_i in X])  
+    # Initialisation of barycenter
+    if init is None:
+        i0 = np.random.randint(m)  # Index of first state for the barycenter
+        Y = X[i0].copy() 
+    else:
+        if type(init)==int:
+            Y = X[init].copy()
+        else:
+            Y = init.copy()
+
+    nb_iter = 0
+
+    converged = False  # stoping criterion
+    while not converged:
+        nb_iter += 1
+        K = len(Y)  # current nb of points in Y (some might be on diagonal)
+        G = np.full((K, m), -1, dtype=int)  # will store for each j, the (index)
+                              # point matched in each other diagram 
+                              #(might be the diagonal). 
+                              # that is G[j, i] = k <=> y_j is matched to
+                              # x_k in the diagram i-th diagram X[i]
+        updated_points = np.zeros((K, 2))  # will store the new positions of
+                                           # the points of Y.
+                                           # If points disappear, there thrown
+                                           # on [0,0] by default.
+        new_created_points = []  # will store potential new points.
+
+        # Step 1 : compute optimal matching (Y, X_i) for each X_i
+        #          and create new points in Y if needed
+        for i in range(m):
+            _, indices = wasserstein_distance(Y, X[i], matching=True, order=2., internal_p=2.)
+            for y_j, x_i_j in indices:
+                if y_j >= 0:  # we matched an off diagonal point to x_i_j...
+                    if x_i_j >= 0:  # ...which is also an off-diagonal point.
+                        G[y_j, i] = x_i_j
+                    else:  # ...which is a diagonal point
+                        G[y_j, i] = -1  # -1 stands for the diagonal (mask)
+                else:  # We matched a diagonal point to x_i_j...
+                    if x_i_j >= 0:  # which is a off-diag point ! 
+                                                # need to create new point in Y
+                        new_y = _mean(np.array([X[i][x_i_j]]), m)
+                        # Average this point with (m-1) copies of Delta
+                        new_created_points.append(new_y)
+
+        # Step 2 : Update current point position thanks to groupings computed
+        to_delete = []
+        for j in range(K):
+            matched_points = [X[i][G[j, i]] for i in range(m) if G[j, i] > -1]
+            new_y_j = _mean(matched_points, m)
+            if not np.array_equal(new_y_j, np.array([0,0])):
+                updated_points[j] = new_y_j 
+            else: # this points is no longer of any use.
+                to_delete.append(j)
+        # we remove the point to be deleted now.
+        updated_points = np.delete(updated_points, to_delete, axis=0)  
+
+        # we cannot converge if there have been new created points.
+        if new_created_points: 
+            Y = np.concatenate((updated_points, new_created_points))
+        else:
+            # Step 3 : we check convergence
+            if np.array_equal(updated_points, Y):
+                converged = True 
+            Y = updated_points
+
+
+    if verbose:
+        groupings = []
+        energy = 0
+        log = {}
+        n_y = len(Y)
+        for i in range(m):
+            cost, edges = wasserstein_distance(Y, X[i], matching=True, order=2., internal_p=2.)
+            groupings.append(edges)
+            energy += cost
+            log["groupings"] = groupings
+        energy = energy/m
+        print(energy)
+        log["energy"] = energy
+        log["nb_iter"] = nb_iter
+
+        return Y, log
+    else:
+        return Y
+
diff --git a/src/python/gudhi/wasserstein/wasserstein.py b/src/python/gudhi/wasserstein/wasserstein.py
new file mode 100644
index 00000000..e1233eec
--- /dev/null
+++ b/src/python/gudhi/wasserstein/wasserstein.py
@@ -0,0 +1,125 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s):       Theo Lacombe
+#
+# Copyright (C) 2019 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+import numpy as np
+import scipy.spatial.distance as sc
+
+try:
+    import ot
+except ImportError:
+    print("POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT")
+
+def _proj_on_diag(X):
+    '''
+    :param X: (n x 2) array encoding the points of a persistent diagram.
+    :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal
+    '''
+    Z = (X[:,0] + X[:,1]) / 2.
+    return np.array([Z , Z]).T
+
+
+def _build_dist_matrix(X, Y, order=2., internal_p=2.):
+    '''
+    :param X: (n x 2) numpy.array encoding the (points of the) first diagram.
+    :param Y: (m x 2) numpy.array encoding the second diagram.
+    :param order: exponent for the Wasserstein metric.
+    :param internal_p: Ground metric (i.e. norm L^p).
+    :returns: (n+1) x (m+1) np.array encoding the cost matrix C. 
+                For 0 <= i < n, 0 <= j < m, C[i,j] encodes the distance between X[i] and Y[j], 
+                while C[i, m] (resp. C[n, j]) encodes the distance (to the p) between X[i] (resp Y[j]) 
+                and its orthogonal projection onto the diagonal.
+                note also that C[n, m] = 0  (it costs nothing to move from the diagonal to the diagonal).
+    '''
+    Xdiag = _proj_on_diag(X)
+    Ydiag = _proj_on_diag(Y)
+    if np.isinf(internal_p):
+        C = sc.cdist(X,Y, metric='chebyshev')**order
+        Cxd = np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order
+        Cdy = np.linalg.norm(Y - Ydiag, ord=internal_p, axis=1)**order
+    else:
+        C = sc.cdist(X,Y, metric='minkowski', p=internal_p)**order
+        Cxd = np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order
+        Cdy = np.linalg.norm(Y - Ydiag, ord=internal_p, axis=1)**order
+    Cf = np.hstack((C, Cxd[:,None]))
+    Cdy = np.append(Cdy, 0)
+
+    Cf = np.vstack((Cf, Cdy[None,:]))
+
+    return Cf
+
+
+def _perstot(X, order, internal_p):
+    '''
+    :param X: (n x 2) numpy.array (points of a given diagram).
+    :param order: exponent for Wasserstein. Default value is 2.
+    :param internal_p: Ground metric on the (upper-half) plane (i.e. norm L^p in R^2); Default value is 2 (Euclidean norm).
+    :returns: float, the total persistence of the diagram (that is, its distance to the empty diagram).    
+    '''
+    Xdiag = _proj_on_diag(X)
+    return (np.sum(np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order))**(1./order)
+
+
+def wasserstein_distance(X, Y, matching=False, order=2., internal_p=2.):
+    '''
+    :param X: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points 
+                (i.e. with infinite coordinate).
+    :param Y: (m x 2) numpy.array encoding the second diagram.
+    :param matching: if True, computes and returns the optimal matching between X and Y, encoded as
+                     a (n x 2) np.array  [...[i,j]...], meaning the i-th point in X is matched to
+                     the j-th point in Y, with the convention (-1) represents the diagonal.
+    :param order: exponent for Wasserstein; Default value is 2.
+    :param internal_p: Ground metric on the (upper-half) plane (i.e. norm L^p in R^2); 
+                       Default value is 2 (Euclidean norm).
+    :returns: the Wasserstein distance of order q (1 <= q < infinity) between persistence diagrams with 
+              respect to the internal_p-norm as ground metric.
+              If matching is set to True, also returns the optimal matching between X and Y.
+    '''
+    n = len(X)
+    m = len(Y)
+
+    # handle empty diagrams
+    if X.size == 0:
+        if Y.size == 0:
+            if not matching:
+                return 0.
+            else:
+                return 0., np.array([])
+        else:
+            if not matching:
+                return _perstot(Y, order, internal_p)
+            else:
+                return _perstot(Y, order, internal_p), np.array([[-1, j] for j in range(m)])
+    elif Y.size == 0:
+        if not matching:
+            return _perstot(X, order, internal_p)
+        else:
+            return _perstot(X, order, internal_p), np.array([[i, -1] for i in range(n)])
+
+    M = _build_dist_matrix(X, Y, order=order, internal_p=internal_p)
+    a = np.ones(n+1) # weight vector of the input diagram. Uniform here.
+    a[-1] = m
+    b = np.ones(m+1) # weight vector of the input diagram. Uniform here.
+    b[-1] = n
+
+    if matching:
+        P = ot.emd(a=a,b=b,M=M, numItermax=2000000)
+        ot_cost = np.sum(np.multiply(P,M))
+        P[-1, -1] = 0  # Remove matching corresponding to the diagonal
+        match = np.argwhere(P)
+        # Now we turn to -1 points encoding the diagonal
+        match[:,0][match[:,0] >= n] = -1
+        match[:,1][match[:,1] >= m] = -1
+        return ot_cost ** (1./order) , match
+
+    # Comptuation of the otcost using the ot.emd2 library.
+    # Note: it is the Wasserstein distance to the power q.
+    # The default numItermax=100000 is not sufficient for some examples with 5000 points, what is a good value?
+    ot_cost = ot.emd2(a, b, M, numItermax=2000000)
+
+    return ot_cost ** (1./order)
author	tlacombe <lacombe1993@gmail.com>	2020-03-31 15:02:32 +0200
committer	tlacombe <lacombe1993@gmail.com>	2020-03-31 15:02:32 +0200
commit	842475615841f864b4ce41a2a4b69f1e189a2946 (patch)
tree	3102227e3be0bb019b3baabaf79b575b260ff21c /src/python/gudhi/wasserstein
parent	1aaffd2e1fab45988d92f5e51a9d294696ff5492 (diff)