From 842475615841f864b4ce41a2a4b69f1e189a2946 Mon Sep 17 00:00:00 2001 From: tlacombe Date: Tue, 31 Mar 2020 15:02:32 +0200 Subject: created wasserstein repo --- src/python/gudhi/barycenter.py | 158 ---------------------------- src/python/gudhi/wasserstein.py | 125 ---------------------- src/python/gudhi/wasserstein/__init__.py | 1 + src/python/gudhi/wasserstein/barycenter.py | 158 ++++++++++++++++++++++++++++ src/python/gudhi/wasserstein/wasserstein.py | 125 ++++++++++++++++++++++ 5 files changed, 284 insertions(+), 283 deletions(-) delete mode 100644 src/python/gudhi/barycenter.py delete mode 100644 src/python/gudhi/wasserstein.py create mode 100644 src/python/gudhi/wasserstein/__init__.py create mode 100644 src/python/gudhi/wasserstein/barycenter.py create mode 100644 src/python/gudhi/wasserstein/wasserstein.py (limited to 'src/python') diff --git a/src/python/gudhi/barycenter.py b/src/python/gudhi/barycenter.py deleted file mode 100644 index 079bcc57..00000000 --- a/src/python/gudhi/barycenter.py +++ /dev/null @@ -1,158 +0,0 @@ -# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. -# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. -# Author(s): Theo Lacombe -# -# Copyright (C) 2019 Inria -# -# Modification(s): -# - YYYY/MM Author: Description of the modification - - -import ot -import numpy as np -import scipy.spatial.distance as sc - -from gudhi.wasserstein import wasserstein_distance - - -def _mean(x, m): - ''' - :param x: a list of 2D-points, off diagonal, x_0... x_{k-1} - :param m: total amount of points taken into account, - that is we have (m-k) copies of diagonal - :returns: the weighted mean of x with (m-k) copies of the diagonal - ''' - k = len(x) - if k > 0: - w = np.mean(x, axis=0) - w_delta = (w[0] + w[1]) / 2 * np.ones(2) - return (k * w + (m-k) * w_delta) / m - else: - return np.array([0, 0]) - - -def lagrangian_barycenter(pdiagset, init=None, verbose=False): - ''' - :param pdiagset: a list of size m containing numpy.array of shape (n x 2) - (n can variate), encoding a set of - persistence diagrams with only finite coordinates. - :param init: The initial value for barycenter estimate. - If None, init is made on a random diagram from the dataset. - Otherwise, it must be an int - (then we init with diagset[init]) - or a (n x 2) numpy.array enconding - a persistence diagram with n points. - :param verbose: if True, returns additional information about the - barycenter. - :returns: If not verbose (default), a numpy.array encoding - the barycenter estimate of pdiagset - (local minima of the energy function). - If pdiagset is empty, returns None. - If verbose, returns a couple (Y, log) - where Y is the barycenter estimate, - and log is a dict that contains additional informations: - - groupings, a list of list of pairs (i,j), - That is, G[k] = [(i, j) ...], where (i,j) indicates - that X[i] is matched to Y[j] - if i = -1 or j = -1, it means they - represent the diagonal. - - energy, a float representing the Frechet - energy value obtained, - that is the mean of squared distances - of observations to the output. - - nb_iter, integer representing the number of iterations - performed before convergence of the algorithm. - ''' - X = pdiagset # to shorten notations, not a copy - m = len(X) # number of diagrams we are averaging - if m == 0: - print("Warning: computing barycenter of empty diag set. Returns None") - return None - - # store the number of off-diagonal point for each of the X_i - nb_off_diag = np.array([len(X_i) for X_i in X]) - # Initialisation of barycenter - if init is None: - i0 = np.random.randint(m) # Index of first state for the barycenter - Y = X[i0].copy() - else: - if type(init)==int: - Y = X[init].copy() - else: - Y = init.copy() - - nb_iter = 0 - - converged = False # stoping criterion - while not converged: - nb_iter += 1 - K = len(Y) # current nb of points in Y (some might be on diagonal) - G = np.full((K, m), -1, dtype=int) # will store for each j, the (index) - # point matched in each other diagram - #(might be the diagonal). - # that is G[j, i] = k <=> y_j is matched to - # x_k in the diagram i-th diagram X[i] - updated_points = np.zeros((K, 2)) # will store the new positions of - # the points of Y. - # If points disappear, there thrown - # on [0,0] by default. - new_created_points = [] # will store potential new points. - - # Step 1 : compute optimal matching (Y, X_i) for each X_i - # and create new points in Y if needed - for i in range(m): - _, indices = wasserstein_distance(Y, X[i], matching=True, order=2., internal_p=2.) - for y_j, x_i_j in indices: - if y_j >= 0: # we matched an off diagonal point to x_i_j... - if x_i_j >= 0: # ...which is also an off-diagonal point. - G[y_j, i] = x_i_j - else: # ...which is a diagonal point - G[y_j, i] = -1 # -1 stands for the diagonal (mask) - else: # We matched a diagonal point to x_i_j... - if x_i_j >= 0: # which is a off-diag point ! - # need to create new point in Y - new_y = _mean(np.array([X[i][x_i_j]]), m) - # Average this point with (m-1) copies of Delta - new_created_points.append(new_y) - - # Step 2 : Update current point position thanks to groupings computed - to_delete = [] - for j in range(K): - matched_points = [X[i][G[j, i]] for i in range(m) if G[j, i] > -1] - new_y_j = _mean(matched_points, m) - if not np.array_equal(new_y_j, np.array([0,0])): - updated_points[j] = new_y_j - else: # this points is no longer of any use. - to_delete.append(j) - # we remove the point to be deleted now. - updated_points = np.delete(updated_points, to_delete, axis=0) - - # we cannot converge if there have been new created points. - if new_created_points: - Y = np.concatenate((updated_points, new_created_points)) - else: - # Step 3 : we check convergence - if np.array_equal(updated_points, Y): - converged = True - Y = updated_points - - - if verbose: - groupings = [] - energy = 0 - log = {} - n_y = len(Y) - for i in range(m): - cost, edges = wasserstein_distance(Y, X[i], matching=True, order=2., internal_p=2.) - groupings.append(edges) - energy += cost - log["groupings"] = groupings - energy = energy/m - print(energy) - log["energy"] = energy - log["nb_iter"] = nb_iter - - return Y, log - else: - return Y - diff --git a/src/python/gudhi/wasserstein.py b/src/python/gudhi/wasserstein.py deleted file mode 100644 index 51d1d83c..00000000 --- a/src/python/gudhi/wasserstein.py +++ /dev/null @@ -1,125 +0,0 @@ -# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. -# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. -# Author(s): Theo Lacombe -# -# Copyright (C) 2019 Inria -# -# Modification(s): -# - YYYY/MM Author: Description of the modification - -import numpy as np -import scipy.spatial.distance as sc -import gudhi.barycenter as barycenter -try: - import ot -except ImportError: - print("POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT") - -def _proj_on_diag(X): - ''' - :param X: (n x 2) array encoding the points of a persistent diagram. - :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal - ''' - Z = (X[:,0] + X[:,1]) / 2. - return np.array([Z , Z]).T - - -def _build_dist_matrix(X, Y, order=2., internal_p=2.): - ''' - :param X: (n x 2) numpy.array encoding the (points of the) first diagram. - :param Y: (m x 2) numpy.array encoding the second diagram. - :param order: exponent for the Wasserstein metric. - :param internal_p: Ground metric (i.e. norm L^p). - :returns: (n+1) x (m+1) np.array encoding the cost matrix C. - For 0 <= i < n, 0 <= j < m, C[i,j] encodes the distance between X[i] and Y[j], - while C[i, m] (resp. C[n, j]) encodes the distance (to the p) between X[i] (resp Y[j]) - and its orthogonal projection onto the diagonal. - note also that C[n, m] = 0 (it costs nothing to move from the diagonal to the diagonal). - ''' - Xdiag = _proj_on_diag(X) - Ydiag = _proj_on_diag(Y) - if np.isinf(internal_p): - C = sc.cdist(X,Y, metric='chebyshev')**order - Cxd = np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order - Cdy = np.linalg.norm(Y - Ydiag, ord=internal_p, axis=1)**order - else: - C = sc.cdist(X,Y, metric='minkowski', p=internal_p)**order - Cxd = np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order - Cdy = np.linalg.norm(Y - Ydiag, ord=internal_p, axis=1)**order - Cf = np.hstack((C, Cxd[:,None])) - Cdy = np.append(Cdy, 0) - - Cf = np.vstack((Cf, Cdy[None,:])) - - return Cf - - -def _perstot(X, order, internal_p): - ''' - :param X: (n x 2) numpy.array (points of a given diagram). - :param order: exponent for Wasserstein. Default value is 2. - :param internal_p: Ground metric on the (upper-half) plane (i.e. norm L^p in R^2); Default value is 2 (Euclidean norm). - :returns: float, the total persistence of the diagram (that is, its distance to the empty diagram). - ''' - Xdiag = _proj_on_diag(X) - return (np.sum(np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order))**(1./order) - - -def wasserstein_distance(X, Y, matching=False, order=2., internal_p=2.): - ''' - :param X: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points - (i.e. with infinite coordinate). - :param Y: (m x 2) numpy.array encoding the second diagram. - :param matching: if True, computes and returns the optimal matching between X and Y, encoded as - a (n x 2) np.array [...[i,j]...], meaning the i-th point in X is matched to - the j-th point in Y, with the convention (-1) represents the diagonal. - :param order: exponent for Wasserstein; Default value is 2. - :param internal_p: Ground metric on the (upper-half) plane (i.e. norm L^p in R^2); - Default value is 2 (Euclidean norm). - :returns: the Wasserstein distance of order q (1 <= q < infinity) between persistence diagrams with - respect to the internal_p-norm as ground metric. - If matching is set to True, also returns the optimal matching between X and Y. - ''' - n = len(X) - m = len(Y) - - # handle empty diagrams - if X.size == 0: - if Y.size == 0: - if not matching: - return 0. - else: - return 0., np.array([]) - else: - if not matching: - return _perstot(Y, order, internal_p) - else: - return _perstot(Y, order, internal_p), np.array([[-1, j] for j in range(m)]) - elif Y.size == 0: - if not matching: - return _perstot(X, order, internal_p) - else: - return _perstot(X, order, internal_p), np.array([[i, -1] for i in range(n)]) - - M = _build_dist_matrix(X, Y, order=order, internal_p=internal_p) - a = np.ones(n+1) # weight vector of the input diagram. Uniform here. - a[-1] = m - b = np.ones(m+1) # weight vector of the input diagram. Uniform here. - b[-1] = n - - if matching: - P = ot.emd(a=a,b=b,M=M, numItermax=2000000) - ot_cost = np.sum(np.multiply(P,M)) - P[-1, -1] = 0 # Remove matching corresponding to the diagonal - match = np.argwhere(P) - # Now we turn to -1 points encoding the diagonal - match[:,0][match[:,0] >= n] = -1 - match[:,1][match[:,1] >= m] = -1 - return ot_cost ** (1./order) , match - - # Comptuation of the otcost using the ot.emd2 library. - # Note: it is the Wasserstein distance to the power q. - # The default numItermax=100000 is not sufficient for some examples with 5000 points, what is a good value? - ot_cost = ot.emd2(a, b, M, numItermax=2000000) - - return ot_cost ** (1./order) diff --git a/src/python/gudhi/wasserstein/__init__.py b/src/python/gudhi/wasserstein/__init__.py new file mode 100644 index 00000000..ed225ba4 --- /dev/null +++ b/src/python/gudhi/wasserstein/__init__.py @@ -0,0 +1 @@ +from .wasserstein import wasserstein_distance diff --git a/src/python/gudhi/wasserstein/barycenter.py b/src/python/gudhi/wasserstein/barycenter.py new file mode 100644 index 00000000..079bcc57 --- /dev/null +++ b/src/python/gudhi/wasserstein/barycenter.py @@ -0,0 +1,158 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Theo Lacombe +# +# Copyright (C) 2019 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + + +import ot +import numpy as np +import scipy.spatial.distance as sc + +from gudhi.wasserstein import wasserstein_distance + + +def _mean(x, m): + ''' + :param x: a list of 2D-points, off diagonal, x_0... x_{k-1} + :param m: total amount of points taken into account, + that is we have (m-k) copies of diagonal + :returns: the weighted mean of x with (m-k) copies of the diagonal + ''' + k = len(x) + if k > 0: + w = np.mean(x, axis=0) + w_delta = (w[0] + w[1]) / 2 * np.ones(2) + return (k * w + (m-k) * w_delta) / m + else: + return np.array([0, 0]) + + +def lagrangian_barycenter(pdiagset, init=None, verbose=False): + ''' + :param pdiagset: a list of size m containing numpy.array of shape (n x 2) + (n can variate), encoding a set of + persistence diagrams with only finite coordinates. + :param init: The initial value for barycenter estimate. + If None, init is made on a random diagram from the dataset. + Otherwise, it must be an int + (then we init with diagset[init]) + or a (n x 2) numpy.array enconding + a persistence diagram with n points. + :param verbose: if True, returns additional information about the + barycenter. + :returns: If not verbose (default), a numpy.array encoding + the barycenter estimate of pdiagset + (local minima of the energy function). + If pdiagset is empty, returns None. + If verbose, returns a couple (Y, log) + where Y is the barycenter estimate, + and log is a dict that contains additional informations: + - groupings, a list of list of pairs (i,j), + That is, G[k] = [(i, j) ...], where (i,j) indicates + that X[i] is matched to Y[j] + if i = -1 or j = -1, it means they + represent the diagonal. + - energy, a float representing the Frechet + energy value obtained, + that is the mean of squared distances + of observations to the output. + - nb_iter, integer representing the number of iterations + performed before convergence of the algorithm. + ''' + X = pdiagset # to shorten notations, not a copy + m = len(X) # number of diagrams we are averaging + if m == 0: + print("Warning: computing barycenter of empty diag set. Returns None") + return None + + # store the number of off-diagonal point for each of the X_i + nb_off_diag = np.array([len(X_i) for X_i in X]) + # Initialisation of barycenter + if init is None: + i0 = np.random.randint(m) # Index of first state for the barycenter + Y = X[i0].copy() + else: + if type(init)==int: + Y = X[init].copy() + else: + Y = init.copy() + + nb_iter = 0 + + converged = False # stoping criterion + while not converged: + nb_iter += 1 + K = len(Y) # current nb of points in Y (some might be on diagonal) + G = np.full((K, m), -1, dtype=int) # will store for each j, the (index) + # point matched in each other diagram + #(might be the diagonal). + # that is G[j, i] = k <=> y_j is matched to + # x_k in the diagram i-th diagram X[i] + updated_points = np.zeros((K, 2)) # will store the new positions of + # the points of Y. + # If points disappear, there thrown + # on [0,0] by default. + new_created_points = [] # will store potential new points. + + # Step 1 : compute optimal matching (Y, X_i) for each X_i + # and create new points in Y if needed + for i in range(m): + _, indices = wasserstein_distance(Y, X[i], matching=True, order=2., internal_p=2.) + for y_j, x_i_j in indices: + if y_j >= 0: # we matched an off diagonal point to x_i_j... + if x_i_j >= 0: # ...which is also an off-diagonal point. + G[y_j, i] = x_i_j + else: # ...which is a diagonal point + G[y_j, i] = -1 # -1 stands for the diagonal (mask) + else: # We matched a diagonal point to x_i_j... + if x_i_j >= 0: # which is a off-diag point ! + # need to create new point in Y + new_y = _mean(np.array([X[i][x_i_j]]), m) + # Average this point with (m-1) copies of Delta + new_created_points.append(new_y) + + # Step 2 : Update current point position thanks to groupings computed + to_delete = [] + for j in range(K): + matched_points = [X[i][G[j, i]] for i in range(m) if G[j, i] > -1] + new_y_j = _mean(matched_points, m) + if not np.array_equal(new_y_j, np.array([0,0])): + updated_points[j] = new_y_j + else: # this points is no longer of any use. + to_delete.append(j) + # we remove the point to be deleted now. + updated_points = np.delete(updated_points, to_delete, axis=0) + + # we cannot converge if there have been new created points. + if new_created_points: + Y = np.concatenate((updated_points, new_created_points)) + else: + # Step 3 : we check convergence + if np.array_equal(updated_points, Y): + converged = True + Y = updated_points + + + if verbose: + groupings = [] + energy = 0 + log = {} + n_y = len(Y) + for i in range(m): + cost, edges = wasserstein_distance(Y, X[i], matching=True, order=2., internal_p=2.) + groupings.append(edges) + energy += cost + log["groupings"] = groupings + energy = energy/m + print(energy) + log["energy"] = energy + log["nb_iter"] = nb_iter + + return Y, log + else: + return Y + diff --git a/src/python/gudhi/wasserstein/wasserstein.py b/src/python/gudhi/wasserstein/wasserstein.py new file mode 100644 index 00000000..e1233eec --- /dev/null +++ b/src/python/gudhi/wasserstein/wasserstein.py @@ -0,0 +1,125 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Theo Lacombe +# +# Copyright (C) 2019 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + +import numpy as np +import scipy.spatial.distance as sc + +try: + import ot +except ImportError: + print("POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT") + +def _proj_on_diag(X): + ''' + :param X: (n x 2) array encoding the points of a persistent diagram. + :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal + ''' + Z = (X[:,0] + X[:,1]) / 2. + return np.array([Z , Z]).T + + +def _build_dist_matrix(X, Y, order=2., internal_p=2.): + ''' + :param X: (n x 2) numpy.array encoding the (points of the) first diagram. + :param Y: (m x 2) numpy.array encoding the second diagram. + :param order: exponent for the Wasserstein metric. + :param internal_p: Ground metric (i.e. norm L^p). + :returns: (n+1) x (m+1) np.array encoding the cost matrix C. + For 0 <= i < n, 0 <= j < m, C[i,j] encodes the distance between X[i] and Y[j], + while C[i, m] (resp. C[n, j]) encodes the distance (to the p) between X[i] (resp Y[j]) + and its orthogonal projection onto the diagonal. + note also that C[n, m] = 0 (it costs nothing to move from the diagonal to the diagonal). + ''' + Xdiag = _proj_on_diag(X) + Ydiag = _proj_on_diag(Y) + if np.isinf(internal_p): + C = sc.cdist(X,Y, metric='chebyshev')**order + Cxd = np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order + Cdy = np.linalg.norm(Y - Ydiag, ord=internal_p, axis=1)**order + else: + C = sc.cdist(X,Y, metric='minkowski', p=internal_p)**order + Cxd = np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order + Cdy = np.linalg.norm(Y - Ydiag, ord=internal_p, axis=1)**order + Cf = np.hstack((C, Cxd[:,None])) + Cdy = np.append(Cdy, 0) + + Cf = np.vstack((Cf, Cdy[None,:])) + + return Cf + + +def _perstot(X, order, internal_p): + ''' + :param X: (n x 2) numpy.array (points of a given diagram). + :param order: exponent for Wasserstein. Default value is 2. + :param internal_p: Ground metric on the (upper-half) plane (i.e. norm L^p in R^2); Default value is 2 (Euclidean norm). + :returns: float, the total persistence of the diagram (that is, its distance to the empty diagram). + ''' + Xdiag = _proj_on_diag(X) + return (np.sum(np.linalg.norm(X - Xdiag, ord=internal_p, axis=1)**order))**(1./order) + + +def wasserstein_distance(X, Y, matching=False, order=2., internal_p=2.): + ''' + :param X: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points + (i.e. with infinite coordinate). + :param Y: (m x 2) numpy.array encoding the second diagram. + :param matching: if True, computes and returns the optimal matching between X and Y, encoded as + a (n x 2) np.array [...[i,j]...], meaning the i-th point in X is matched to + the j-th point in Y, with the convention (-1) represents the diagonal. + :param order: exponent for Wasserstein; Default value is 2. + :param internal_p: Ground metric on the (upper-half) plane (i.e. norm L^p in R^2); + Default value is 2 (Euclidean norm). + :returns: the Wasserstein distance of order q (1 <= q < infinity) between persistence diagrams with + respect to the internal_p-norm as ground metric. + If matching is set to True, also returns the optimal matching between X and Y. + ''' + n = len(X) + m = len(Y) + + # handle empty diagrams + if X.size == 0: + if Y.size == 0: + if not matching: + return 0. + else: + return 0., np.array([]) + else: + if not matching: + return _perstot(Y, order, internal_p) + else: + return _perstot(Y, order, internal_p), np.array([[-1, j] for j in range(m)]) + elif Y.size == 0: + if not matching: + return _perstot(X, order, internal_p) + else: + return _perstot(X, order, internal_p), np.array([[i, -1] for i in range(n)]) + + M = _build_dist_matrix(X, Y, order=order, internal_p=internal_p) + a = np.ones(n+1) # weight vector of the input diagram. Uniform here. + a[-1] = m + b = np.ones(m+1) # weight vector of the input diagram. Uniform here. + b[-1] = n + + if matching: + P = ot.emd(a=a,b=b,M=M, numItermax=2000000) + ot_cost = np.sum(np.multiply(P,M)) + P[-1, -1] = 0 # Remove matching corresponding to the diagonal + match = np.argwhere(P) + # Now we turn to -1 points encoding the diagonal + match[:,0][match[:,0] >= n] = -1 + match[:,1][match[:,1] >= m] = -1 + return ot_cost ** (1./order) , match + + # Comptuation of the otcost using the ot.emd2 library. + # Note: it is the Wasserstein distance to the power q. + # The default numItermax=100000 is not sufficient for some examples with 5000 points, what is a good value? + ot_cost = ot.emd2(a, b, M, numItermax=2000000) + + return ot_cost ** (1./order) -- cgit v1.2.3