From 9076f02903ba2fb9ea9fe704764a755cad8dcd63 Mon Sep 17 00:00:00 2001 From: Cédric Vincent-Cuaz Date: Mon, 12 Jun 2023 12:01:48 +0200 Subject: [FEAT] Entropic gw/fgw/srgw/srfgw solvers (#455) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add entropic fgw + fgw bary + srgw + srfgw with tests * add exemples for entropic srgw - srfgw solvers * add PPA solvers for GW/FGW + complete previous commits * update readme * add tests * add examples + tests + warning in entropic solvers + releases * reduce testing runtimes for test_gromov * fix conflicts * optional marginals * improve coverage * gromov doc harmonization * fix pep8 * complete optional marginal for entropic srfgw --------- Co-authored-by: Rémi Flamary --- ot/gromov/__init__.py | 37 ++- ot/gromov/_bregman.py | 782 ++++++++++++++++++++++++++++++++++++++++++---- ot/gromov/_gw.py | 318 ++++++++++--------- ot/gromov/_semirelaxed.py | 591 +++++++++++++++++++++++++++++++++-- ot/gromov/_utils.py | 43 +++ 5 files changed, 1513 insertions(+), 258 deletions(-) (limited to 'ot/gromov') diff --git a/ot/gromov/__init__.py b/ot/gromov/__init__.py index 6184edf..e39d906 100644 --- a/ot/gromov/__init__.py +++ b/ot/gromov/__init__.py @@ -11,38 +11,51 @@ Solvers related to Gromov-Wasserstein problems. # All submodules and packages from ._utils import (init_matrix, tensor_product, gwloss, gwggrad, - update_square_loss, update_kl_loss, + update_square_loss, update_kl_loss, update_feature_matrix, init_matrix_semirelaxed) + from ._gw import (gromov_wasserstein, gromov_wasserstein2, fused_gromov_wasserstein, fused_gromov_wasserstein2, - solve_gromov_linesearch, gromov_barycenters, fgw_barycenters, - update_structure_matrix, update_feature_matrix) + solve_gromov_linesearch, gromov_barycenters, fgw_barycenters) + from ._bregman import (entropic_gromov_wasserstein, entropic_gromov_wasserstein2, - entropic_gromov_barycenters) + entropic_gromov_barycenters, + entropic_fused_gromov_wasserstein, + entropic_fused_gromov_wasserstein2, + entropic_fused_gromov_barycenters) + from ._estimators import (GW_distance_estimation, pointwise_gromov_wasserstein, sampled_gromov_wasserstein) + from ._semirelaxed import (semirelaxed_gromov_wasserstein, semirelaxed_gromov_wasserstein2, semirelaxed_fused_gromov_wasserstein, semirelaxed_fused_gromov_wasserstein2, - solve_semirelaxed_gromov_linesearch) + solve_semirelaxed_gromov_linesearch, + entropic_semirelaxed_gromov_wasserstein, + entropic_semirelaxed_gromov_wasserstein2, + entropic_semirelaxed_fused_gromov_wasserstein, + entropic_semirelaxed_fused_gromov_wasserstein2) + from ._dictionary import (gromov_wasserstein_dictionary_learning, gromov_wasserstein_linear_unmixing, fused_gromov_wasserstein_dictionary_learning, fused_gromov_wasserstein_linear_unmixing) -__all__ = ['init_matrix', 'tensor_product', 'gwloss', 'gwggrad', - 'update_square_loss', 'update_kl_loss', 'init_matrix_semirelaxed', +__all__ = ['init_matrix', 'tensor_product', 'gwloss', 'gwggrad', 'update_square_loss', + 'update_kl_loss', 'update_feature_matrix', 'init_matrix_semirelaxed', 'gromov_wasserstein', 'gromov_wasserstein2', 'fused_gromov_wasserstein', 'fused_gromov_wasserstein2', 'solve_gromov_linesearch', 'gromov_barycenters', - 'fgw_barycenters', 'update_structure_matrix', 'update_feature_matrix', - 'entropic_gromov_wasserstein', 'entropic_gromov_wasserstein2', - 'entropic_gromov_barycenters', 'GW_distance_estimation', - 'pointwise_gromov_wasserstein', 'sampled_gromov_wasserstein', + 'fgw_barycenters', 'entropic_gromov_wasserstein', 'entropic_gromov_wasserstein2', + 'entropic_gromov_barycenters', 'entropic_fused_gromov_wasserstein', + 'entropic_fused_gromov_wasserstein2', 'entropic_fused_gromov_barycenters', + 'GW_distance_estimation', 'pointwise_gromov_wasserstein', 'sampled_gromov_wasserstein', 'semirelaxed_gromov_wasserstein', 'semirelaxed_gromov_wasserstein2', 'semirelaxed_fused_gromov_wasserstein', 'semirelaxed_fused_gromov_wasserstein2', - 'solve_semirelaxed_gromov_linesearch', 'gromov_wasserstein_dictionary_learning', + 'solve_semirelaxed_gromov_linesearch', 'entropic_semirelaxed_gromov_wasserstein', + 'entropic_semirelaxed_gromov_wasserstein2', 'entropic_semirelaxed_fused_gromov_wasserstein', + 'entropic_semirelaxed_fused_gromov_wasserstein2', 'gromov_wasserstein_dictionary_learning', 'gromov_wasserstein_linear_unmixing', 'fused_gromov_wasserstein_dictionary_learning', 'fused_gromov_wasserstein_linear_unmixing'] diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py index aa25f1f..18cef56 100644 --- a/ot/gromov/_bregman.py +++ b/ot/gromov/_bregman.py @@ -11,23 +11,29 @@ Bregman projections solvers for entropic Gromov-Wasserstein # # License: MIT License +import numpy as np +import warnings + from ..bregman import sinkhorn -from ..utils import dist, list_to_array, check_random_state +from ..utils import dist, list_to_array, check_random_state, unif from ..backend import get_backend from ._utils import init_matrix, gwloss, gwggrad -from ._utils import update_square_loss, update_kl_loss +from ._utils import update_square_loss, update_kl_loss, update_feature_matrix -def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, G0=None, - max_iter=1000, tol=1e-9, verbose=False, log=False): +def entropic_gromov_wasserstein( + C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, G0=None, max_iter=1000, + tol=1e-9, solver='PGD', warmstart=False, verbose=False, log=False, **kwargs): r""" - Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` + Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` + estimated using Sinkhorn projections. - The function solves the following optimization problem: + If `solver="PGD"`, the function solves the following entropic-regularized + Gromov-Wasserstein optimization problem using Projected Gradient Descent [12]: .. math:: - \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) + \mathbf{T}^* \in \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon H(\mathbf{T}) s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} @@ -35,6 +41,17 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, \mathbf{T} &\geq 0 + Else if `solver="PPA"`, the function solves the following Gromov-Wasserstein + optimization problem using Proximal Point Algorithm [51]: + + .. math:: + \mathbf{T}^* \in \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space @@ -58,13 +75,15 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space - p : array-like, shape (ns,) - Distribution in the source space - q : array-like, shape (nt,) - Distribution in the target space - loss_fun : string + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional Loss function used for the solver either 'square_loss' or 'kl_loss' - epsilon : float + epsilon : float, optional Regularization term >0 symmetric : bool, optional Either C1 and C2 are to be assumed symmetric or not. @@ -72,16 +91,28 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric). G0: array-like, shape (ns,nt), optional If None the initial transport plan of the solver is pq^T. - Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommand it + to correcly estimate the GW distance. max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0) + solver: string, optional + Solver to use either 'PGD' for Projected Gradient Descent or 'PPA' + for Proximal Point Algorithm. + Default value is 'PGD'. + warmstart: bool, optional + Either to perform warmstart of dual potentials in the successive + Sinkhorn projections. verbose : bool, optional Print information along iterations log : bool, optional Record log if True. - + **kwargs: dict + parameters can be directly passed to the ot.sinkhorn solver. + Such as `numItermax` and `stopThr` to control its estimation precision, + e.g [51] suggests to use `numItermax=1`. Returns ------- T : array-like, shape (`ns`, `nt`) @@ -96,22 +127,50 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, .. [47] Chowdhury, S., & Mémoli, F. (2019). The gromov–wasserstein distance between networks and stable network invariants. Information and Inference: A Journal of the IMA, 8(4), 757-787. + + .. [51] Xu, H., Luo, D., Zha, H., & Duke, L. C. (2019). Gromov-wasserstein + learning for graph matching and node embedding. In International + Conference on Machine Learning (ICML), 2019. """ - C1, C2, p, q = list_to_array(C1, C2, p, q) + if solver not in ['PGD', 'PPA']: + raise ValueError("Unknown solver '%s'. Pick one in ['PGD', 'PPA']." % solver) + + C1, C2 = list_to_array(C1, C2) + arr = [C1, C2] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + if q is not None: + arr.append(list_to_array(q)) + else: + q = unif(C2.shape[0], type_as=C2) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) + if G0 is None: - nx = get_backend(p, q, C1, C2) G0 = nx.outer(p, q) - else: - nx = get_backend(p, q, C1, C2, G0) + T = G0 constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun, nx) + if symmetric is None: symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) if not symmetric: constCt, hC1t, hC2t = init_matrix(C1.T, C2.T, p, q, loss_fun, nx) + cpt = 0 err = 1 + if warmstart: + # initialize potentials to cope with ot.sinkhorn initialization + N1, N2 = C1.shape[0], C2.shape[0] + mu = nx.zeros(N1, type_as=C1) - np.log(N1) + nu = nx.zeros(N2, type_as=C2) - np.log(N2) + if log: log = {'err': []} @@ -124,7 +183,17 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, tens = gwggrad(constC, hC1, hC2, T, nx) else: tens = 0.5 * (gwggrad(constC, hC1, hC2, T, nx) + gwggrad(constCt, hC1t, hC2t, T, nx)) - T = sinkhorn(p, q, tens, epsilon, method='sinkhorn') + + if solver == 'PPA': + tens = tens - epsilon * nx.log(T) + + if warmstart: + T, loginn = sinkhorn(p, q, tens, epsilon, method='sinkhorn', log=True, warmstart=(mu, nu), **kwargs) + mu = epsilon * nx.log(loginn['u']) + nu = epsilon * nx.log(loginn['v']) + + else: + T = sinkhorn(p, q, tens, epsilon, method='sinkhorn', **kwargs) if cpt % 10 == 0: # we can speed up the process by checking for the error only all @@ -142,6 +211,9 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, cpt += 1 + if abs(nx.sum(T) - 1) > 1e-5: + warnings.warn("Solver failed to produce a transport plan. You might " + "want to increase the regularization parameter `epsilon`.") if log: log['gw_dist'] = gwloss(constC, hC1, hC2, T, nx) return T, log @@ -149,17 +221,36 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, symmetric=None, return T -def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, symmetric=None, G0=None, - max_iter=1000, tol=1e-9, verbose=False, log=False): +def entropic_gromov_wasserstein2( + C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, G0=None, max_iter=1000, + tol=1e-9, solver='PGD', warmstart=False, verbose=False, log=False, **kwargs): r""" - Returns the entropic Gromov-Wasserstein discrepancy between the two measured similarity matrices :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` + Returns the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` + estimated using Sinkhorn projections. - The function solves the following optimization problem: + If `solver="PGD"`, the function solves the following entropic-regularized + Gromov-Wasserstein optimization problem using Projected Gradient Descent [12]: + + .. math:: + \mathbf{GW} = \mathop{\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon H(\mathbf{T}) + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + + Else if `solver="PPA"`, the function solves the following Gromov-Wasserstein + optimization problem using Proximal Point Algorithm [51]: .. math:: - GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) - \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) + \mathbf{GW} = \mathop{\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 Where : - :math:`\mathbf{C_1}`: Metric cost matrix in the source space @@ -183,13 +274,15 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, symmetric=None Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space - p : array-like, shape (ns,) - Distribution in the source space - q : array-like, shape (nt,) - Distribution in the target space - loss_fun : str + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional Loss function used for the solver either 'square_loss' or 'kl_loss' - epsilon : float + epsilon : float, optional Regularization term >0 symmetric : bool, optional Either C1 and C2 are to be assumed symmetric or not. @@ -197,16 +290,28 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, symmetric=None Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric). G0: array-like, shape (ns,nt), optional If None the initial transport plan of the solver is pq^T. - Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommand it + to correcly estimate the GW distance. max_iter : int, optional Max number of iterations tol : float, optional Stop threshold on error (>0) + solver: string, optional + Solver to use either 'PGD' for Projected Gradient Descent or 'PPA' + for Proximal Point Algorithm. + Default value is 'PGD'. + warmstart: bool, optional + Either to perform warmstart of dual potentials in the successive + Sinkhorn projections. verbose : bool, optional Print information along iterations log : bool, optional Record log if True. - + **kwargs: dict + parameters can be directly passed to the ot.sinkhorn solver. + Such as `numItermax` and `stopThr` to control its estimation precision, + e.g [51] suggests to use `numItermax=1`. Returns ------- gw_dist : float @@ -218,11 +323,15 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, symmetric=None "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. + .. [51] Xu, H., Luo, D., Zha, H., & Duke, L. C. (2019). Gromov-wasserstein + learning for graph matching and node embedding. In International + Conference on Machine Learning (ICML), 2019. """ - gw, logv = entropic_gromov_wasserstein( - C1, C2, p, q, loss_fun, epsilon, symmetric, G0, max_iter, tol, verbose, log=True) + T, logv = entropic_gromov_wasserstein( + C1, C2, p, q, loss_fun, epsilon, symmetric, G0, max_iter, + tol, solver, warmstart, verbose, log=True, **kwargs) - logv['T'] = gw + logv['T'] = T if log: return logv['gw_dist'], logv @@ -230,10 +339,13 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, symmetric=None return logv['gw_dist'] -def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, symmetric=True, - max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None): +def entropic_gromov_barycenters( + N, Cs, ps=None, p=None, lambdas=None, loss_fun='square_loss', + epsilon=0.1, symmetric=True, max_iter=1000, tol=1e-9, warmstartT=False, + verbose=False, log=False, init_C=None, random_state=None, **kwargs): r""" - Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}` + Returns the Gromov-Wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}` + estimated using Gromov-Wasserstein transports from Sinkhorn projections. The function solves the following optimization problem: @@ -252,19 +364,18 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, symmet Size of the targeted barycenter Cs : list of S array-like of shape (ns,ns) Metric cost matrices - ps : list of S array-like of shape (ns,) - Sample weights in the `S` spaces - p : array-like, shape(N,) - Weights in the targeted barycenter - lambdas : list of float + ps : list of S array-like of shape (ns,), optional + Sample weights in the `S` spaces. + If let to its default value None, uniform distributions are taken. + p : array-like, shape (N,), optional + Weights in the targeted barycenter. + If let to its default value None, uniform distribution is taken. + lambdas : list of float, optional List of the `S` spaces' weights. - loss_fun : callable - Tensor-matrix multiplication function based on specific loss function. - update : callable - function(:math:`\mathbf{p}`, lambdas, :math:`\mathbf{T}`, :math:`\mathbf{Cs}`) that updates - :math:`\mathbf{C}` according to a specific Kernel with the `S` :math:`\mathbf{T}_s` couplings - calculated at each iteration - epsilon : float + If let to its default value None, uniform weights are taken. + loss_fun : callable, optional + tensor-matrix multiplication function based on specific loss function + epsilon : float, optional Regularization term >0 symmetric : bool, optional. Either structures are to be assumed symmetric or not. Default value is True. @@ -273,6 +384,9 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, symmet Max number of iterations tol : float, optional Stop threshold on error (>0) + warmstartT: bool, optional + Either to perform warmstart of transport plans in the successive + gromov-wasserstein transport problems. verbose : bool, optional Print information along iterations. log : bool, optional @@ -281,6 +395,8 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, symmet Random initial value for the :math:`\mathbf{C}` matrix provided by user. random_state : int or RandomState instance, optional Fix the seed for reproducibility + **kwargs: dict + parameters can be directly passed to the `ot.entropic_gromov_wasserstein` solver. Returns ------- @@ -296,11 +412,21 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, symmet International Conference on Machine Learning (ICML). 2016. """ Cs = list_to_array(*Cs) - ps = list_to_array(*ps) - p = list_to_array(p) - nx = get_backend(*Cs, *ps, p) + arr = [*Cs] + if ps is not None: + arr += list_to_array(*ps) + else: + ps = [unif(C.shape[0], type_as=C) for C in Cs] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(N, type_as=Cs[0]) + + nx = get_backend(*arr) S = len(Cs) + if lambdas is None: + lambdas = [1. / S] * S # Initialization of C : random SPD matrix (if not provided by user) if init_C is None: @@ -317,11 +443,20 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, symmet error = [] + if warmstartT: + T = [None] * S + while (err > tol) and (cpt < max_iter): Cprev = C + if warmstartT: + T = [entropic_gromov_wasserstein( + Cs[s], C, ps[s], p, loss_fun, epsilon, symmetric, T[s], + max_iter, 1e-4, verbose=verbose, log=False, **kwargs) for s in range(S)] + else: + T = [entropic_gromov_wasserstein( + Cs[s], C, ps[s], p, loss_fun, epsilon, symmetric, None, + max_iter, 1e-4, verbose=verbose, log=False, **kwargs) for s in range(S)] - T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon, symmetric, None, - max_iter, 1e-4, verbose, log=False) for s in range(S)] if loss_fun == 'square_loss': C = update_square_loss(p, lambdas, T, Cs) @@ -346,3 +481,536 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, symmet return C, {"err": error} else: return C + + +def entropic_fused_gromov_wasserstein( + M, C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, + symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9, + solver='PGD', warmstart=False, verbose=False, log=False, **kwargs): + r""" + Returns the Fused Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})` + with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`, + estimated using Sinkhorn projections. + + If `solver="PGD"`, the function solves the following entropic-regularized + Fused Gromov-Wasserstein optimization problem using Projected Gradient Descent [12]: + + .. math:: + \mathbf{T}^* \in \mathop{\arg\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon H(\mathbf{T}) + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + + Else if `solver="PPA"`, the function solves the following Fused Gromov-Wasserstein + optimization problem using Proximal Point Algorithm [51]: + + .. math:: + \mathbf{T}^* \in\mathop{\arg\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{M}`: metric cost matrix between features across domains + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + - :math:`\mathbf{q}`: distribution in the target space + - `L`: loss function to account for the misfit between the similarity and feature matrices + - `H`: entropy + - :math:`\alpha`: trade-off parameter + + .. note:: If the inner solver `ot.sinkhorn` did not convergence, the + optimal coupling :math:`\mathbf{T}` returned by this function does not + necessarily satisfy the marginal constraints + :math:`\mathbf{T}\mathbf{1}=\mathbf{p}` and + :math:`\mathbf{T}^T\mathbf{1}=\mathbf{q}`. So the returned + Fused Gromov-Wasserstein loss does not necessarily satisfy distance + properties and may be negative. + + Parameters + ---------- + M : array-like, shape (ns, nt) + Metric cost matrix between features across domains + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional + Loss function used for the solver either 'square_loss' or 'kl_loss' + epsilon : float, optional + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymetric). + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommand it + to correcly estimate the GW distance. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + solver: string, optional + Solver to use either 'PGD' for Projected Gradient Descent or 'PPA' + for Proximal Point Algorithm. + Default value is 'PGD'. + warmstart: bool, optional + Either to perform warmstart of dual potentials in the successive + Sinkhorn projections. + verbose : bool, optional + Print information along iterations + log : bool, optional + Record log if True. + **kwargs: dict + parameters can be directly passed to the ot.sinkhorn solver. + Such as `numItermax` and `stopThr` to control its estimation precision, + e.g [51] suggests to use `numItermax=1`. + Returns + ------- + T : array-like, shape (`ns`, `nt`) + Optimal coupling between the two joint spaces + + References + ---------- + .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, + "Gromov-Wasserstein averaging of kernel and distance matrices." + International Conference on Machine Learning (ICML). 2016. + + .. [47] Chowdhury, S., & Mémoli, F. (2019). The gromov–wasserstein + distance between networks and stable network invariants. + Information and Inference: A Journal of the IMA, 8(4), 757-787. + + .. [51] Xu, H., Luo, D., Zha, H., & Duke, L. C. (2019). Gromov-wasserstein + learning for graph matching and node embedding. In International + Conference on Machine Learning (ICML), 2019. + + .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain + and Courty Nicolas "Optimal Transport for structured data with + application on graphs", International Conference on Machine Learning + (ICML). 2019. + """ + if solver not in ['PGD', 'PPA']: + raise ValueError("Unknown solver '%s'. Pick one in ['PGD', 'PPA']." % solver) + + M, C1, C2 = list_to_array(M, C1, C2) + arr = [M, C1, C2] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + if q is not None: + arr.append(list_to_array(q)) + else: + q = unif(C2.shape[0], type_as=C2) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) + + if G0 is None: + G0 = nx.outer(p, q) + + T = G0 + constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun, nx) + if symmetric is None: + symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) + if not symmetric: + constCt, hC1t, hC2t = init_matrix(C1.T, C2.T, p, q, loss_fun, nx) + cpt = 0 + err = 1 + + if warmstart: + # initialize potentials to cope with ot.sinkhorn initialization + N1, N2 = C1.shape[0], C2.shape[0] + mu = nx.zeros(N1, type_as=C1) - np.log(N1) + nu = nx.zeros(N2, type_as=C2) - np.log(N2) + + if log: + log = {'err': []} + + while (err > tol and cpt < max_iter): + + Tprev = T + + # compute the gradient + if symmetric: + tens = alpha * gwggrad(constC, hC1, hC2, T, nx) + (1 - alpha) * M + else: + tens = (alpha * 0.5) * (gwggrad(constC, hC1, hC2, T, nx) + gwggrad(constCt, hC1t, hC2t, T, nx)) + (1 - alpha) * M + + if solver == 'PPA': + tens = tens - epsilon * nx.log(T) + + if warmstart: + T, loginn = sinkhorn(p, q, tens, epsilon, method='sinkhorn', log=True, warmstart=(mu, nu), **kwargs) + mu = epsilon * nx.log(loginn['u']) + nu = epsilon * nx.log(loginn['v']) + + else: + T = sinkhorn(p, q, tens, epsilon, method='sinkhorn', **kwargs) + + if cpt % 10 == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = nx.norm(T - Tprev) + + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format( + 'It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt += 1 + + if abs(nx.sum(T) - 1) > 1e-5: + warnings.warn("Solver failed to produce a transport plan. You might " + "want to increase the regularization parameter `epsilon`.") + if log: + log['fgw_dist'] = (1 - alpha) * nx.sum(M * T) + alpha * gwloss(constC, hC1, hC2, T, nx) + return T, log + else: + return T + + +def entropic_fused_gromov_wasserstein2( + M, C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, + symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9, + solver='PGD', warmstart=False, verbose=False, log=False, **kwargs): + r""" + Returns the Fused Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})` + with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`, + estimated using Sinkhorn projections. + + If `solver="PGD"`, the function solves the following entropic-regularized + Fused Gromov-Wasserstein optimization problem using Projected Gradient Descent [12]: + + .. math:: + \mathbf{FGW} = \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon H(\mathbf{T}) + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + + Else if `solver="PPA"`, the function solves the following Fused Gromov-Wasserstein + optimization problem using Proximal Point Algorithm [51]: + + .. math:: + \mathbf{FGW} = \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T}^T \mathbf{1} &= \mathbf{q} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{M}`: metric cost matrix between features across domains + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + - :math:`\mathbf{q}`: distribution in the target space + - `L`: loss function to account for the misfit between the similarity and feature matrices + - `H`: entropy + - :math:`\alpha`: trade-off parameter + + .. note:: If the inner solver `ot.sinkhorn` did not convergence, the + optimal coupling :math:`\mathbf{T}` returned by this function does not + necessarily satisfy the marginal constraints + :math:`\mathbf{T}\mathbf{1}=\mathbf{p}` and + :math:`\mathbf{T}^T\mathbf{1}=\mathbf{q}`. So the returned + Fused Gromov-Wasserstein loss does not necessarily satisfy distance + properties and may be negative. + + Parameters + ---------- + M : array-like, shape (ns, nt) + Metric cost matrix between features across domains + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : string, optional + Loss function used for the solver either 'square_loss' or 'kl_loss' + epsilon : float, optional + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymetric). + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 will be used as initial transport of the solver. G0 is not + required to satisfy marginal constraints but we strongly recommand it + to correcly estimate the GW distance. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + Record log if True. + + Returns + ------- + fgw_dist : float + Fused Gromov-Wasserstein distance + + References + ---------- + .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, + "Gromov-Wasserstein averaging of kernel and distance matrices." + International Conference on Machine Learning (ICML). 2016. + + .. [51] Xu, H., Luo, D., Zha, H., & Duke, L. C. (2019). Gromov-wasserstein + learning for graph matching and node embedding. In International + Conference on Machine Learning (ICML), 2019. + + .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain + and Courty Nicolas "Optimal Transport for structured data with + application on graphs", International Conference on Machine Learning + (ICML). 2019. + + """ + T, logv = entropic_fused_gromov_wasserstein( + M, C1, C2, p, q, loss_fun, epsilon, symmetric, alpha, G0, max_iter, + tol, solver, warmstart, verbose, log=True, **kwargs) + + logv['T'] = T + + if log: + return logv['fgw_dist'], logv + else: + return logv['fgw_dist'] + + +def entropic_fused_gromov_barycenters( + N, Ys, Cs, ps=None, p=None, lambdas=None, loss_fun='square_loss', + epsilon=0.1, symmetric=True, alpha=0.5, max_iter=1000, tol=1e-9, + warmstartT=False, verbose=False, log=False, init_C=None, init_Y=None, + random_state=None, **kwargs): + r""" + Returns the Fused Gromov-Wasserstein barycenters of `S` measurable networks with node features :math:`(\mathbf{C}_s, \mathbf{Y}_s, \mathbf{p}_s)_{1 \leq s \leq S}` + estimated using Fused Gromov-Wasserstein transports from Sinkhorn projections. + + The function solves the following optimization problem: + + .. math:: + + \mathbf{C}, \mathbf{Y} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}, \mathbf{Y}\in \mathbb{Y}^{N \times d}} \quad \sum_s \lambda_s \mathrm{FGW}_{\alpha}(\mathbf{C}, \mathbf{C}_s, \mathbf{Y}, \mathbf{Y}_s, \mathbf{p}, \mathbf{p}_s) + + Where : + + - :math:`\mathbf{Y}_s`: feature matrix + - :math:`\mathbf{C}_s`: metric cost matrix + - :math:`\mathbf{p}_s`: distribution + + Parameters + ---------- + N : int + Size of the targeted barycenter + Ys: list of array-like, each element has shape (ns,d) + Features of all samples + Cs : list of S array-like of shape (ns,ns) + Metric cost matrices + ps : list of S array-like of shape (ns,), optional + Sample weights in the `S` spaces. + If let to its default value None, uniform distributions are taken. + p : array-like, shape (N,), optional + Weights in the targeted barycenter. + If let to its default value None, uniform distribution is taken. + lambdas : list of float, optional + List of the `S` spaces' weights. + If let to its default value None, uniform weights are taken. + loss_fun : callable, optional + tensor-matrix multiplication function based on specific loss function + epsilon : float, optional + Regularization term >0 + symmetric : bool, optional. + Either structures are to be assumed symmetric or not. Default value is True. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymmetric). + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + warmstartT: bool, optional + Either to perform warmstart of transport plans in the successive + fused gromov-wasserstein transport problems. + verbose : bool, optional + Print information along iterations. + log : bool, optional + Record log if True. + init_C : bool | array-like, shape (N, N) + Random initial value for the :math:`\mathbf{C}` matrix provided by user. + init_Y : array-like, shape (N,d), optional + Initialization for the barycenters' features. If not set a + random init is used. + random_state : int or RandomState instance, optional + Fix the seed for reproducibility + **kwargs: dict + parameters can be directly passed to the `ot.entropic_fused_gromov_wasserstein` solver. + + Returns + ------- + Y : array-like, shape (`N`, `d`) + Feature matrix in the barycenter space (permutated arbitrarily) + C : array-like, shape (`N`, `N`) + Similarity matrix in the barycenter space (permutated as Y's rows) + log : dict + Log dictionary of error during iterations. Return only if `log=True` in parameters. + + References + ---------- + .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon, + "Gromov-Wasserstein averaging of kernel and distance matrices." + International Conference on Machine Learning (ICML). 2016. + + .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain + and Courty Nicolas + "Optimal Transport for structured data with application on graphs" + International Conference on Machine Learning (ICML). 2019. + """ + Cs = list_to_array(*Cs) + Ys = list_to_array(*Ys) + arr = [*Cs, *Ys] + if ps is not None: + arr += list_to_array(*ps) + else: + ps = [unif(C.shape[0], type_as=C) for C in Cs] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(N, type_as=Cs[0]) + + nx = get_backend(*arr) + S = len(Cs) + if lambdas is None: + lambdas = [1. / S] * S + + d = Ys[0].shape[1] # dimension on the node features + + # Initialization of C : random SPD matrix (if not provided by user) + if init_C is None: + generator = check_random_state(random_state) + xalea = generator.randn(N, 2) + C = dist(xalea, xalea) + C /= C.max() + C = nx.from_numpy(C, type_as=p) + else: + C = init_C + + # Initialization of Y + if init_Y is None: + Y = nx.zeros((N, d), type_as=ps[0]) + else: + Y = init_Y + + T = [nx.outer(p_, p) for p_ in ps] + + Ms = [dist(Ys[s], Y) for s in range(len(Ys))] + + cpt = 0 + err = 1 + + err_feature = 1 + err_structure = 1 + + if warmstartT: + T = [None] * S + + if log: + log_ = {} + log_['err_feature'] = [] + log_['err_structure'] = [] + log_['Ts_iter'] = [] + + while (err > tol) and (cpt < max_iter): + Cprev = C + Yprev = Y + + if warmstartT: + T = [entropic_fused_gromov_wasserstein( + Ms[s], Cs[s], C, ps[s], p, loss_fun, epsilon, symmetric, alpha, + None, max_iter, 1e-4, verbose=verbose, log=False, **kwargs) for s in range(S)] + + else: + T = [entropic_fused_gromov_wasserstein( + Ms[s], Cs[s], C, ps[s], p, loss_fun, epsilon, symmetric, alpha, + None, max_iter, 1e-4, verbose=verbose, log=False, **kwargs) for s in range(S)] + + if loss_fun == 'square_loss': + C = update_square_loss(p, lambdas, T, Cs) + + elif loss_fun == 'kl_loss': + C = update_kl_loss(p, lambdas, T, Cs) + + Ys_temp = [y.T for y in Ys] + T_temp = [Ts.T for Ts in T] + Y = update_feature_matrix(lambdas, Ys_temp, T_temp, p) + Ms = [dist(Ys[s], Y) for s in range(len(Ys))] + + if cpt % 10 == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err_feature = nx.norm(Y - nx.reshape(Yprev, (N, d))) + err_structure = nx.norm(C - Cprev) + if log: + log_['err_feature'].append(err_feature) + log_['err_structure'].append(err_structure) + log_['Ts_iter'].append(T) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format( + 'It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err_structure)) + print('{:5d}|{:8e}|'.format(cpt, err_feature)) + + cpt += 1 + print('Y type:', type(Y)) + if log: + log_['T'] = T # from target to Ys + log_['p'] = p + log_['Ms'] = Ms + + if log: + return Y, C, log_ + else: + return Y, C diff --git a/ot/gromov/_gw.py b/ot/gromov/_gw.py index cdfa9a3..adf6b82 100644 --- a/ot/gromov/_gw.py +++ b/ot/gromov/_gw.py @@ -16,14 +16,14 @@ import numpy as np from ..utils import dist, UndefinedParameter, list_to_array from ..optim import cg, line_search_armijo, solve_1d_linesearch_quad -from ..utils import check_random_state +from ..utils import check_random_state, unif from ..backend import get_backend, NumpyBackend from ._utils import init_matrix, gwloss, gwggrad -from ._utils import update_square_loss, update_kl_loss +from ._utils import update_square_loss, update_kl_loss, update_feature_matrix -def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None, +def gromov_wasserstein(C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` @@ -31,7 +31,7 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', symmetric=None, log The function solves the following optimization problem: .. math:: - \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} @@ -60,11 +60,13 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', symmetric=None, log Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space - p : array-like, shape (ns,) - Distribution in the source space - q : array-like, shape (nt,) - Distribution in the target space - loss_fun : str + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. + loss_fun : str, optional loss function used for the solver either 'square_loss' or 'kl_loss' symmetric : bool, optional Either C1 and C2 are to be assumed symmetric or not. @@ -112,15 +114,24 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', symmetric=None, log distance between networks and stable network invariants. Information and Inference: A Journal of the IMA, 8(4), 757-787. """ - p, q = list_to_array(p, q) - p0, q0, C10, C20 = p, q, C1, C2 - if G0 is None: - nx = get_backend(p0, q0, C10, C20) + arr = [C1, C2] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + if q is not None: + arr.append(list_to_array(q)) else: + q = unif(C2.shape[0], type_as=C2) + if G0 is not None: G0_ = G0 - nx = get_backend(p0, q0, C10, C20, G0_) - p = nx.to_numpy(p) - q = nx.to_numpy(q) + arr.append(G0) + + nx = get_backend(*arr) + p0, q0, C10, C20 = p, q, C1, C2 + + p = nx.to_numpy(p0) + q = nx.to_numpy(q0) C1 = nx.to_numpy(C10) C2 = nx.to_numpy(C20) if symmetric is None: @@ -168,7 +179,7 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', symmetric=None, log return nx.from_numpy(cg(p, q, 0., 1., f, df, G0, line_search, log=False, numItermax=max_iter, stopThr=tol_rel, stopThr2=tol_abs, **kwargs), type_as=C10) -def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None, +def gromov_wasserstein2(C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Returns the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` @@ -176,7 +187,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', symmetric=None, lo The function solves the following optimization problem: .. math:: - GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} + \mathbf{GW} = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} @@ -209,10 +220,12 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', symmetric=None, lo Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space - p : array-like, shape (ns,) + p : array-like, shape (ns,), optional Distribution in the source space. - q : array-like, shape (nt,) + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional Distribution in the target space. + If let to its default value None, uniform distribution is taken. loss_fun : str loss function used for the solver either 'square_loss' or 'kl_loss' symmetric : bool, optional @@ -266,6 +279,12 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', symmetric=None, lo # simple get_backend as the full one will be handled in gromov_wasserstein nx = get_backend(C1, C2) + # init marginals if set as None + if p is None: + p = unif(C1.shape[0], type_as=C1) + if q is None: + q = unif(C2.shape[0], type_as=C2) + T, log_gw = gromov_wasserstein( C1, C2, p, q, loss_fun, symmetric, log=True, armijo=armijo, G0=G0, max_iter=max_iter, tol_rel=tol_rel, tol_abs=tol_abs, **kwargs) @@ -286,20 +305,20 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', symmetric=None, lo return gw -def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', symmetric=None, alpha=0.5, +def fused_gromov_wasserstein(M, C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, alpha=0.5, armijo=False, G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Computes the FGW transport between two graphs (see :ref:`[24] `) .. math:: - \gamma = \mathop{\arg \min}_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + + \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} - \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q} + \mathbf{T}^T \mathbf{1} &= \mathbf{q} - \mathbf{\gamma} &\geq 0 + \mathbf{T} &\geq 0 where : @@ -323,10 +342,12 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', symmetric= Metric cost matrix representative of the structure in the source space C2 : array-like, shape (nt, nt) Metric cost matrix representative of the structure in the target space - p : array-like, shape (ns,) - Distribution in the source space - q : array-like, shape (nt,) - Distribution in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional + Distribution in the target space. + If let to its default value None, uniform distribution is taken. loss_fun : str, optional Loss function used for the solver symmetric : bool, optional @@ -354,7 +375,7 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', symmetric= Returns ------- - gamma : array-like, shape (`ns`, `nt`) + T : array-like, shape (`ns`, `nt`) Optimal transportation matrix for the given parameters. log : dict Log dictionary return only if log==True in parameters. @@ -372,16 +393,24 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', symmetric= distance between networks and stable network invariants. Information and Inference: A Journal of the IMA, 8(4), 757-787. """ - p, q = list_to_array(p, q) - p0, q0, C10, C20, M0, alpha0 = p, q, C1, C2, M, alpha - if G0 is None: - nx = get_backend(p0, q0, C10, C20, M0) + arr = [C1, C2, M] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + if q is not None: + arr.append(list_to_array(q)) else: + q = unif(C2.shape[0], type_as=C2) + if G0 is not None: G0_ = G0 - nx = get_backend(p0, q0, C10, C20, M0, G0_) + arr.append(G0) - p = nx.to_numpy(p) - q = nx.to_numpy(q) + nx = get_backend(*arr) + p0, q0, C10, C20, M0, alpha0 = p, q, C1, C2, M, alpha + + p = nx.to_numpy(p0) + q = nx.to_numpy(q0) C1 = nx.to_numpy(C10) C2 = nx.to_numpy(C20) M = nx.to_numpy(M0) @@ -433,20 +462,20 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', symmetric= return nx.from_numpy(cg(p, q, (1 - alpha) * M, alpha, f, df, G0, line_search, log=False, numItermax=max_iter, stopThr=tol_rel, stopThr2=tol_abs, **kwargs), type_as=C10) -def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', symmetric=None, alpha=0.5, +def fused_gromov_wasserstein2(M, C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, alpha=0.5, armijo=False, G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Computes the FGW distance between two graphs see (see :ref:`[24] `) .. math:: - \min_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} + \mathbf{GW} = \min_\mathbf{T} \quad (1 - \alpha) \langle \mathbf(T), \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} + s.t. \ \mathbf(T)\mathbf{1} &= \mathbf{p} - \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q} + \mathbf(T)^T \mathbf{1} &= \mathbf{q} - \mathbf{\gamma} &\geq 0 + \mathbf(T) &\geq 0 where : @@ -474,10 +503,12 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', symmetric Metric cost matrix representative of the structure in the source space. C2 : array-like, shape (nt, nt) Metric cost matrix representative of the structure in the target space. - p : array-like, shape (ns,) + p : array-like, shape (ns,), optional Distribution in the source space. - q : array-like, shape (nt,) + If let to its default value None, uniform distribution is taken. + q : array-like, shape (nt,), optional Distribution in the target space. + If let to its default value None, uniform distribution is taken. loss_fun : str, optional Loss function used for the solver. symmetric : bool, optional @@ -529,6 +560,12 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', symmetric """ nx = get_backend(C1, C2, M) + # init marginals if set as None + if p is None: + p = unif(C1.shape[0], type_as=C1) + if q is None: + q = unif(C2.shape[0], type_as=C2) + T, log_fgw = fused_gromov_wasserstein( M, C1, C2, p, q, loss_fun, symmetric, alpha, armijo, G0, log=True, max_iter=max_iter, tol_rel=tol_rel, tol_abs=tol_abs, **kwargs) @@ -626,9 +663,10 @@ def solve_gromov_linesearch(G, deltaG, cost_G, C1, C2, M, reg, return alpha, 1, cost_G -def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, symmetric=True, armijo=False, - max_iter=1000, tol=1e-9, verbose=False, log=False, - init_C=None, random_state=None, **kwargs): +def gromov_barycenters( + N, Cs, ps=None, p=None, lambdas=None, loss_fun='square_loss', symmetric=True, armijo=False, + max_iter=1000, tol=1e-9, warmstartT=False, verbose=False, log=False, + init_C=None, random_state=None, **kwargs): r""" Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}` @@ -649,13 +687,16 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, symmetric=True, armijo=F Size of the targeted barycenter Cs : list of S array-like of shape (ns, ns) Metric cost matrices - ps : list of S array-like of shape (ns,) - Sample weights in the `S` spaces - p : array-like, shape (N,) - Weights in the targeted barycenter - lambdas : list of float - List of the `S` spaces' weights - loss_fun : callable + ps : list of S array-like of shape (ns,), optional + Sample weights in the `S` spaces. + If let to its default value None, uniform distributions are taken. + p : array-like, shape (N,), optional + Weights in the targeted barycenter. + If let to its default value None, uniform distribution is taken. + lambdas : list of float, optional + List of the `S` spaces' weights. + If let to its default value None, uniform weights are taken. + loss_fun : callable, optional tensor-matrix multiplication function based on specific loss function symmetric : bool, optional. Either structures are to be assumed symmetric or not. Default value is True. @@ -668,6 +709,9 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, symmetric=True, armijo=F Max number of iterations tol : float, optional Stop threshold on relative error (>0) + warmstartT: bool, optional + Either to perform warmstart of transport plans in the successive + fused gromov-wasserstein transport problems.s verbose : bool, optional Print information along iterations. log : bool, optional @@ -692,11 +736,21 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, symmetric=True, armijo=F """ Cs = list_to_array(*Cs) - ps = list_to_array(*ps) - p = list_to_array(p) - nx = get_backend(*Cs, *ps, p) + arr = [*Cs] + if ps is not None: + arr += list_to_array(*ps) + else: + ps = [unif(C.shape[0], type_as=C) for C in Cs] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(N, type_as=Cs[0]) + + nx = get_backend(*arr) S = len(Cs) + if lambdas is None: + lambdas = [1. / S] * S # Initialization of C : random SPD matrix (if not provided by user) if init_C is None: @@ -714,13 +768,19 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, symmetric=True, armijo=F cpt = 0 err = 1 + if warmstartT: + T = [None] * S error = [] while (err > tol and cpt < max_iter): Cprev = C - T = [gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, symmetric=symmetric, armijo=armijo, - max_iter=max_iter, tol_rel=1e-5, tol_abs=0., verbose=verbose, log=False, **kwargs) for s in range(S)] + if warmstartT: + T = [gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, symmetric=symmetric, armijo=armijo, G0=T[s], + max_iter=max_iter, tol_rel=1e-5, tol_abs=0., verbose=verbose, log=False, **kwargs) for s in range(S)] + else: + T = [gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, symmetric=symmetric, armijo=armijo, G0=None, + max_iter=max_iter, tol_rel=1e-5, tol_abs=0., verbose=verbose, log=False, **kwargs) for s in range(S)] if loss_fun == 'square_loss': C = update_square_loss(p, lambdas, T, Cs) @@ -747,9 +807,11 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, symmetric=True, armijo=F return C -def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False, - p=None, loss_fun='square_loss', armijo=False, symmetric=True, max_iter=100, tol=1e-9, - verbose=False, log=False, init_C=None, init_X=None, random_state=None, **kwargs): +def fgw_barycenters( + N, Ys, Cs, ps=None, lambdas=None, alpha=0.5, fixed_structure=False, + fixed_features=False, p=None, loss_fun='square_loss', armijo=False, + symmetric=True, max_iter=100, tol=1e-9, warmstartT=False, verbose=False, + log=False, init_C=None, init_X=None, random_state=None, **kwargs): r"""Compute the fgw barycenter as presented eq (5) in :ref:`[24] ` Parameters @@ -760,16 +822,21 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ Features of all samples Cs : list of array-like, each element has shape (ns,ns) Structure matrices of all samples - ps : list of array-like, each element has shape (ns,) + ps : list of array-like, each element has shape (ns,), optional Masses of all samples. - lambdas : list of float - List of the `S` spaces' weights - alpha : float - Alpha parameter for the fgw distance + If let to its default value None, uniform distributions are taken. + lambdas : list of float, optional + List of the `S` spaces' weights. + If let to its default value None, uniform weights are taken. + alpha : float, optional + Alpha parameter for the fgw distance. fixed_structure : bool Whether to fix the structure of the barycenter during the updates fixed_features : bool Whether to fix the feature of the barycenter during the updates + p : array-like, shape (N,), optional + Weights in the targeted barycenter. + If let to its default value None, uniform distribution is taken. loss_fun : str Loss function used for the solver either 'square_loss' or 'kl_loss' symmetric : bool, optional @@ -779,6 +846,9 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ Max number of iterations tol : float, optional Stop threshold on relative error (>0) + warmstartT: bool, optional + Either to perform warmstart of transport plans in the successive + fused gromov-wasserstein transport problems. verbose : bool, optional Print information along iterations. log : bool, optional @@ -814,15 +884,24 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ International Conference on Machine Learning (ICML). 2019. """ Cs = list_to_array(*Cs) - ps = list_to_array(*ps) Ys = list_to_array(*Ys) - p = list_to_array(p) - nx = get_backend(*Cs, *Ys, *ps) + arr = [*Cs, *Ys] + if ps is not None: + arr += list_to_array(*ps) + else: + ps = [unif(C.shape[0], type_as=C) for C in Cs] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(N, type_as=Cs[0]) + + nx = get_backend(*arr) S = len(Cs) + if lambdas is None: + lambdas = [1. / S] * S + d = Ys[0].shape[1] # dimension on the node features - if p is None: - p = nx.ones(N, type_as=Cs[0]) / N if fixed_structure: if init_C is None: @@ -877,13 +956,21 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ Ms = [dist(X, Ys[s]) for s in range(len(Ys))] if not fixed_structure: + T_temp = [t.T for t in T] if loss_fun == 'square_loss': - T_temp = [t.T for t in T] - C = update_structure_matrix(p, lambdas, T_temp, Cs) + C = update_square_loss(p, lambdas, T_temp, Cs) - T = [fused_gromov_wasserstein(Ms[s], C, Cs[s], p, ps[s], loss_fun=loss_fun, alpha=alpha, armijo=armijo, symmetric=symmetric, - max_iter=max_iter, tol_rel=1e-5, tol_abs=0., verbose=verbose, **kwargs) for s in range(S)] + elif loss_fun == 'kl_loss': + C = update_kl_loss(p, lambdas, T_temp, Cs) + if warmstartT: + T = [fused_gromov_wasserstein( + Ms[s], C, Cs[s], p, ps[s], loss_fun=loss_fun, alpha=alpha, armijo=armijo, symmetric=symmetric, + G0=T[s], max_iter=max_iter, tol_rel=1e-5, tol_abs=0., verbose=verbose, **kwargs) for s in range(S)] + else: + T = [fused_gromov_wasserstein( + Ms[s], C, Cs[s], p, ps[s], loss_fun=loss_fun, alpha=alpha, armijo=armijo, symmetric=symmetric, + G0=None, max_iter=max_iter, tol_rel=1e-5, tol_abs=0., verbose=verbose, **kwargs) for s in range(S)] # T is N,ns err_feature = nx.norm(X - nx.reshape(Xprev, (N, d))) err_structure = nx.norm(C - Cprev) @@ -910,82 +997,3 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ return X, C, log_ else: return X, C - - -def update_structure_matrix(p, lambdas, T, Cs): - r"""Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s` couplings. - - It is calculated at each iteration - - Parameters - ---------- - p : array-like, shape (N,) - Masses in the targeted barycenter. - lambdas : list of float - List of the `S` spaces' weights. - T : list of S array-like of shape (ns, N) - The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration. - Cs : list of S array-like, shape (ns, ns) - Metric cost matrices. - - Returns - ------- - C : array-like, shape (`nt`, `nt`) - Updated :math:`\mathbf{C}` matrix. - """ - p = list_to_array(p) - T = list_to_array(*T) - Cs = list_to_array(*Cs) - nx = get_backend(*Cs, *T, p) - - tmpsum = sum([ - lambdas[s] * nx.dot( - nx.dot(T[s].T, Cs[s]), - T[s] - ) for s in range(len(T)) - ]) - ppt = nx.outer(p, p) - return tmpsum / ppt - - -def update_feature_matrix(lambdas, Ys, Ts, p): - r"""Updates the feature with respect to the `S` :math:`\mathbf{T}_s` couplings. - - - See "Solving the barycenter problem with Block Coordinate Descent (BCD)" - in :ref:`[24] ` calculated at each iteration - - Parameters - ---------- - p : array-like, shape (N,) - masses in the targeted barycenter - lambdas : list of float - List of the `S` spaces' weights - Ts : list of S array-like, shape (ns,N) - The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration - Ys : list of S array-like, shape (d,ns) - The features. - - Returns - ------- - X : array-like, shape (`d`, `N`) - - - .. _references-update-feature-matrix: - References - ---------- - .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas - "Optimal Transport for structured data with application on graphs" - International Conference on Machine Learning (ICML). 2019. - """ - p = list_to_array(p) - Ts = list_to_array(*Ts) - Ys = list_to_array(*Ys) - nx = get_backend(*Ys, *Ts, p) - - p = 1. / p - tmpsum = sum([ - lambdas[s] * nx.dot(Ys[s], Ts[s].T) * p[None, :] - for s in range(len(Ts)) - ]) - return tmpsum diff --git a/ot/gromov/_semirelaxed.py b/ot/gromov/_semirelaxed.py index 94dc975..206329d 100644 --- a/ot/gromov/_semirelaxed.py +++ b/ot/gromov/_semirelaxed.py @@ -18,7 +18,7 @@ from ..backend import get_backend from ._utils import init_matrix_semirelaxed, gwloss, gwggrad -def semirelaxed_gromov_wasserstein(C1, C2, p, loss_fun='square_loss', symmetric=None, log=False, G0=None, +def semirelaxed_gromov_wasserstein(C1, C2, p=None, loss_fun='square_loss', symmetric=None, log=False, G0=None, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Returns the semi-relaxed Gromov-Wasserstein divergence transport from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}` @@ -26,12 +26,12 @@ def semirelaxed_gromov_wasserstein(C1, C2, p, loss_fun='square_loss', symmetric= The function solves the following optimization problem: .. math:: - \mathbf{srGW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + \mathbf{T}^^* \in \mathop{\arg \min}_{\mathbf{T}} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} - \mathbf{\gamma} &\geq 0 + \mathbf{T} &\geq 0 Where : @@ -51,8 +51,9 @@ def semirelaxed_gromov_wasserstein(C1, C2, p, loss_fun='square_loss', symmetric= Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space - p : array-like, shape (ns,) - Distribution in the source space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. loss_fun : str loss function used for the solver either 'square_loss' or 'kl_loss'. 'kl_loss' is not implemented yet and will raise an error. @@ -93,11 +94,16 @@ def semirelaxed_gromov_wasserstein(C1, C2, p, loss_fun='square_loss', symmetric= """ if loss_fun == 'kl_loss': raise NotImplementedError() - p = list_to_array(p) - if G0 is None: - nx = get_backend(p, C1, C2) + arr = [C1, C2] + if p is not None: + arr.append(list_to_array(p)) else: - nx = get_backend(p, C1, C2, G0) + p = unif(C1.shape[0], type_as=C1) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) if symmetric is None: symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) @@ -143,7 +149,7 @@ def semirelaxed_gromov_wasserstein(C1, C2, p, loss_fun='square_loss', symmetric= return semirelaxed_cg(p, q, 0., 1., f, df, G0, line_search, log=False, numItermax=max_iter, stopThr=tol_rel, stopThr2=tol_abs, **kwargs) -def semirelaxed_gromov_wasserstein2(C1, C2, p, loss_fun='square_loss', symmetric=None, log=False, G0=None, +def semirelaxed_gromov_wasserstein2(C1, C2, p=None, loss_fun='square_loss', symmetric=None, log=False, G0=None, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Returns the semi-relaxed gromov-wasserstein divergence from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}` @@ -151,12 +157,12 @@ def semirelaxed_gromov_wasserstein2(C1, C2, p, loss_fun='square_loss', symmetric The function solves the following optimization problem: .. math:: - srGW = \min_\mathbf{T} \quad \sum_{i,j,k,l} + \text{srGW} = \min_{\mathbf{T}} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} - \mathbf{\gamma} &\geq 0 + \mathbf{T} &\geq 0 Where : @@ -179,8 +185,9 @@ def semirelaxed_gromov_wasserstein2(C1, C2, p, loss_fun='square_loss', symmetric Metric cost matrix in the source space C2 : array-like, shape (nt, nt) Metric cost matrix in the target space - p : array-like, shape (ns,) + p : array-like, shape (ns,), optional Distribution in the source space. + If let to its default value None, uniform distribution is taken. loss_fun : str loss function used for the solver either 'square_loss' or 'kl_loss'. 'kl_loss' is not implemented yet and will raise an error. @@ -218,7 +225,12 @@ def semirelaxed_gromov_wasserstein2(C1, C2, p, loss_fun='square_loss', symmetric "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" International Conference on Learning Representations (ICLR), 2022. """ - nx = get_backend(p, C1, C2) + # partial get_backend as the full one will be handled in gromov_wasserstein + nx = get_backend(C1, C2) + + # init marginals if set as None + if p is None: + p = unif(C1.shape[0], type_as=C1) T, log_srgw = semirelaxed_gromov_wasserstein( C1, C2, p, loss_fun, symmetric, log=True, G0=G0, @@ -239,18 +251,19 @@ def semirelaxed_gromov_wasserstein2(C1, C2, p, loss_fun='square_loss', symmetric return srgw -def semirelaxed_fused_gromov_wasserstein(M, C1, C2, p, loss_fun='square_loss', symmetric=None, alpha=0.5, G0=None, log=False, - max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): +def semirelaxed_fused_gromov_wasserstein( + M, C1, C2, p=None, loss_fun='square_loss', symmetric=None, alpha=0.5, + G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Computes the semi-relaxed FGW transport between two graphs (see :ref:`[48] `) .. math:: - \gamma = \mathop{\arg \min}_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + - \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{T}^* \in \mathop{\arg \min}_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) T_{i,j} T_{k,l} - s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} - \mathbf{\gamma} &\geq 0 + \mathbf{T} &\geq 0 where : @@ -273,8 +286,9 @@ def semirelaxed_fused_gromov_wasserstein(M, C1, C2, p, loss_fun='square_loss', s Metric cost matrix representative of the structure in the source space C2 : array-like, shape (nt, nt) Metric cost matrix representative of the structure in the target space - p : array-like, shape (ns,) - Distribution in the source space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. loss_fun : str loss function used for the solver either 'square_loss' or 'kl_loss'. 'kl_loss' is not implemented yet and will raise an error. @@ -321,11 +335,16 @@ def semirelaxed_fused_gromov_wasserstein(M, C1, C2, p, loss_fun='square_loss', s if loss_fun == 'kl_loss': raise NotImplementedError() - p = list_to_array(p) - if G0 is None: - nx = get_backend(p, C1, C2, M) + arr = [M, C1, C2] + if p is not None: + arr.append(list_to_array(p)) else: - nx = get_backend(p, C1, C2, M, G0) + p = unif(C1.shape[0], type_as=C1) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) if symmetric is None: symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) @@ -373,18 +392,18 @@ def semirelaxed_fused_gromov_wasserstein(M, C1, C2, p, loss_fun='square_loss', s return semirelaxed_cg(p, q, (1 - alpha) * M, alpha, f, df, G0, line_search, log=False, numItermax=max_iter, stopThr=tol_rel, stopThr2=tol_abs, **kwargs) -def semirelaxed_fused_gromov_wasserstein2(M, C1, C2, p, loss_fun='square_loss', symmetric=None, alpha=0.5, G0=None, log=False, +def semirelaxed_fused_gromov_wasserstein2(M, C1, C2, p=None, loss_fun='square_loss', symmetric=None, alpha=0.5, G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs): r""" Computes the semi-relaxed FGW divergence between two graphs (see :ref:`[48] `) .. math:: - \min_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} - L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{srFGW} = \min_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) T_{i,j} T_{k,l} - s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} - \mathbf{\gamma} &\geq 0 + \mathbf{T} &\geq 0 where : @@ -412,6 +431,7 @@ def semirelaxed_fused_gromov_wasserstein2(M, C1, C2, p, loss_fun='square_loss', Metric cost matrix representative of the structure in the target space. p : array-like, shape (ns,) Distribution in the source space. + If let to its default value None, uniform distribution is taken. loss_fun : str, optional loss function used for the solver either 'square_loss' or 'kl_loss'. 'kl_loss' is not implemented yet and will raise an error. @@ -455,7 +475,12 @@ def semirelaxed_fused_gromov_wasserstein2(M, C1, C2, p, loss_fun='square_loss', "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" International Conference on Learning Representations (ICLR), 2022. """ - nx = get_backend(p, C1, C2, M) + # partial get_backend as the full one will be handled in gromov_wasserstein + nx = get_backend(C1, C2) + + # init marginals if set as None + if p is None: + p = unif(C1.shape[0], type_as=C1) T, log_fgw = semirelaxed_fused_gromov_wasserstein( M, C1, C2, p, loss_fun, symmetric, alpha, G0, log=True, @@ -551,3 +576,501 @@ def solve_semirelaxed_gromov_linesearch(G, deltaG, cost_G, C1, C2, ones_p, cost_G = cost_G + a * (alpha ** 2) + b * alpha return alpha, 1, cost_G + + +def entropic_semirelaxed_gromov_wasserstein( + C1, C2, p=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, + G0=None, max_iter=1e4, tol=1e-9, log=False, verbose=False, **kwargs): + r""" + Returns the entropic-regularized semi-relaxed gromov-wasserstein divergence + transport plan from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}` + estimated using a Mirror Descent algorithm following the KL geometry. + + The function solves the following optimization problem: + + .. math:: + \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + + - `L`: loss function to account for the misfit between the similarity matrices + + .. note:: This function is backend-compatible and will work on arrays + from all compatible backends. However all the steps in the conditional + gradient are not differentiable. + + Parameters + ---------- + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + loss_fun : str + loss function used for the solver either 'square_loss' or 'kl_loss'. + 'kl_loss' is not implemented yet and will raise an error. + epsilon : float + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymetric). + verbose : bool, optional + Print information along iterations + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error computed on transport plans + log : bool, optional + record log if True + verbose : bool, optional + Print information along iterations + Returns + ------- + G : array-like, shape (`ns`, `nt`) + Coupling between the two spaces that minimizes: + + :math:`\sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}` + log : dict + Convergence information and loss. + + References + ---------- + .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. + "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" + International Conference on Learning Representations (ICLR), 2022. + """ + if loss_fun == 'kl_loss': + raise NotImplementedError() + arr = [C1, C2] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) + + if symmetric is None: + symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) + if G0 is None: + q = unif(C2.shape[0], type_as=p) + G0 = nx.outer(p, q) + else: + q = nx.sum(G0, 0) + # Check first marginal of G0 + np.testing.assert_allclose(nx.sum(G0, 1), p, atol=1e-08) + + constC, hC1, hC2, fC2t = init_matrix_semirelaxed(C1, C2, p, loss_fun, nx) + + ones_p = nx.ones(p.shape[0], type_as=p) + + if symmetric: + def df(G): + qG = nx.sum(G, 0) + marginal_product = nx.outer(ones_p, nx.dot(qG, fC2t)) + return gwggrad(constC + marginal_product, hC1, hC2, G, nx) + else: + constCt, hC1t, hC2t, fC2 = init_matrix_semirelaxed(C1.T, C2.T, p, loss_fun, nx) + + def df(G): + qG = nx.sum(G, 0) + marginal_product_1 = nx.outer(ones_p, nx.dot(qG, fC2t)) + marginal_product_2 = nx.outer(ones_p, nx.dot(qG, fC2)) + return 0.5 * (gwggrad(constC + marginal_product_1, hC1, hC2, G, nx) + gwggrad(constCt + marginal_product_2, hC1t, hC2t, G, nx)) + + cpt = 0 + err = 1e15 + G = G0 + + if log: + log = {'err': []} + + while (err > tol and cpt < max_iter): + + Gprev = G + # compute the kernel + K = G * nx.exp(- df(G) / epsilon) + scaling = p / nx.sum(K, 1) + G = nx.reshape(scaling, (-1, 1)) * K + if cpt % 10 == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = nx.norm(G - Gprev) + + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format( + 'It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt += 1 + + if log: + qG = nx.sum(G, 0) + marginal_product = nx.outer(ones_p, nx.dot(qG, fC2t)) + log['srgw_dist'] = gwloss(constC + marginal_product, hC1, hC2, G, nx) + return G, log + else: + return G + + +def entropic_semirelaxed_gromov_wasserstein2( + C1, C2, p=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, + G0=None, max_iter=1e4, tol=1e-9, log=False, verbose=False, **kwargs): + r""" + Returns the entropic-regularized semi-relaxed gromov-wasserstein divergence + from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}` + estimated using a Mirror Descent algorithm following the KL geometry. + + The function solves the following optimization problem: + + .. math:: + \mathbf{srGW} = \min_{\mathbf{T}} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T} &\geq 0 + Where : + + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}`: distribution in the source space + - `L`: loss function to account for the misfit between the similarity + matrices + + Note that when using backends, this loss function is differentiable wrt the + matrices (C1, C2) but not yet for the weights p. + .. note:: This function is backend-compatible and will work on arrays + from all compatible backends. However all the steps in the conditional + gradient are not differentiable. + + Parameters + ---------- + C1 : array-like, shape (ns, ns) + Metric cost matrix in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + loss_fun : str + loss function used for the solver either 'square_loss' or 'kl_loss'. + 'kl_loss' is not implemented yet and will raise an error. + epsilon : float + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymetric). + verbose : bool, optional + Print information along iterations + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error computed on transport plans + log : bool, optional + record log if True + verbose : bool, optional + Print information along iterations + **kwargs : dict + parameters can be directly passed to the ot.optim.cg solver + + Returns + ------- + srgw : float + Semi-relaxed Gromov-Wasserstein divergence + log : dict + convergence information and Coupling matrix + + References + ---------- + + .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. + "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" + International Conference on Learning Representations (ICLR), 2022. + """ + T, log_srgw = entropic_semirelaxed_gromov_wasserstein( + C1, C2, p, loss_fun, epsilon, symmetric, G0, + max_iter, tol, log=True, verbose=verbose, **kwargs) + + log_srgw['T'] = T + + if log: + return log_srgw['srgw_dist'], log_srgw + else: + return log_srgw['srgw_dist'] + + +def entropic_semirelaxed_fused_gromov_wasserstein( + M, C1, C2, p=None, loss_fun='square_loss', symmetric=None, epsilon=0.1, + alpha=0.5, G0=None, max_iter=1e4, tol=1e-9, log=False, verbose=False, **kwargs): + r""" + Computes the entropic-regularized semi-relaxed FGW transport between two graphs (see :ref:`[48] `) + + .. math:: + \mathbf{T}^* \in \mathop{\arg \min}_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T} &\geq 0 + + where : + + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix between features + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}` source weights (sum to 1) + - `L` is a loss function to account for the misfit between the similarity matrices + + + .. note:: This function is backend-compatible and will work on arrays + from all compatible backends. However all the steps in the conditional + gradient are not differentiable. + + The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[48] ` + + Parameters + ---------- + M : array-like, shape (ns, nt) + Metric cost matrix between features across domains + C1 : array-like, shape (ns, ns) + Metric cost matrix representative of the structure in the source space + C2 : array-like, shape (nt, nt) + Metric cost matrix representative of the structure in the target space + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + loss_fun : str + loss function used for the solver either 'square_loss' or 'kl_loss'. + 'kl_loss' is not implemented yet and will raise an error. + epsilon : float + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymetric). + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error computed on transport plans + log : bool, optional + record log if True + verbose : bool, optional + Print information along iterations + **kwargs : dict + parameters can be directly passed to the ot.optim.cg solver + + Returns + ------- + G : array-like, shape (`ns`, `nt`) + Optimal transportation matrix for the given parameters. + log : dict + Log dictionary return only if log==True in parameters. + + + .. _references-semirelaxed-fused-gromov-wasserstein: + References + ---------- + .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. + "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" + International Conference on Learning Representations (ICLR), 2022. + """ + if loss_fun == 'kl_loss': + raise NotImplementedError() + arr = [M, C1, C2] + if p is not None: + arr.append(list_to_array(p)) + else: + p = unif(C1.shape[0], type_as=C1) + + if G0 is not None: + arr.append(G0) + + nx = get_backend(*arr) + + if symmetric is None: + symmetric = nx.allclose(C1, C1.T, atol=1e-10) and nx.allclose(C2, C2.T, atol=1e-10) + if G0 is None: + q = unif(C2.shape[0], type_as=p) + G0 = nx.outer(p, q) + else: + q = nx.sum(G0, 0) + # Check first marginal of G0 + np.testing.assert_allclose(nx.sum(G0, 1), p, atol=1e-08) + + constC, hC1, hC2, fC2t = init_matrix_semirelaxed(C1, C2, p, loss_fun, nx) + + ones_p = nx.ones(p.shape[0], type_as=p) + dM = (1 - alpha) * M + if symmetric: + def df(G): + qG = nx.sum(G, 0) + marginal_product = nx.outer(ones_p, nx.dot(qG, fC2t)) + return alpha * gwggrad(constC + marginal_product, hC1, hC2, G, nx) + dM + else: + constCt, hC1t, hC2t, fC2 = init_matrix_semirelaxed(C1.T, C2.T, p, loss_fun, nx) + + def df(G): + qG = nx.sum(G, 0) + marginal_product_1 = nx.outer(ones_p, nx.dot(qG, fC2t)) + marginal_product_2 = nx.outer(ones_p, nx.dot(qG, fC2)) + return 0.5 * alpha * (gwggrad(constC + marginal_product_1, hC1, hC2, G, nx) + gwggrad(constCt + marginal_product_2, hC1t, hC2t, G, nx)) + dM + + cpt = 0 + err = 1e15 + G = G0 + + if log: + log = {'err': []} + + while (err > tol and cpt < max_iter): + + Gprev = G + # compute the kernel + K = G * nx.exp(- df(G) / epsilon) + scaling = p / nx.sum(K, 1) + G = nx.reshape(scaling, (-1, 1)) * K + if cpt % 10 == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = nx.norm(G - Gprev) + + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format( + 'It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt += 1 + + if log: + qG = nx.sum(G, 0) + marginal_product = nx.outer(ones_p, nx.dot(qG, fC2t)) + log['srfgw_dist'] = alpha * gwloss(constC + marginal_product, hC1, hC2, G, nx) + (1 - alpha) * nx.sum(M * G) + return G, log + else: + return G + + +def entropic_semirelaxed_fused_gromov_wasserstein2( + M, C1, C2, p=None, loss_fun='square_loss', symmetric=None, epsilon=0.1, + alpha=0.5, G0=None, max_iter=1e4, tol=1e-9, log=False, verbose=False, **kwargs): + r""" + Computes the entropic-regularized semi-relaxed FGW transport between two graphs (see :ref:`[48] `) + + .. math:: + \mathbf{srFGW} = \min_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + + s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} + + \mathbf{T} &\geq 0 + + where : + + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix between features + - :math:`\mathbf{C_1}`: Metric cost matrix in the source space + - :math:`\mathbf{C_2}`: Metric cost matrix in the target space + - :math:`\mathbf{p}` source weights (sum to 1) + - `L` is a loss function to account for the misfit between the similarity matrices + + + .. note:: This function is backend-compatible and will work on arrays + from all compatible backends. However all the steps in the conditional + gradient are not differentiable. + + The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[48] ` + + Parameters + ---------- + M : array-like, shape (ns, nt) + Metric cost matrix between features across domains + C1 : array-like, shape (ns, ns) + Metric cost matrix representative of the structure in the source space. + C2 : array-like, shape (nt, nt) + Metric cost matrix representative of the structure in the target space. + p : array-like, shape (ns,), optional + Distribution in the source space. + If let to its default value None, uniform distribution is taken. + loss_fun : str, optional + loss function used for the solver either 'square_loss' or 'kl_loss'. + 'kl_loss' is not implemented yet and will raise an error. + epsilon : float + Regularization term >0 + symmetric : bool, optional + Either C1 and C2 are to be assumed symmetric or not. + If let to its default None value, a symmetry test will be conducted. + Else if set to True (resp. False), C1 and C2 will be assumed symmetric (resp. asymetric). + alpha : float, optional + Trade-off parameter (0 < alpha < 1) + G0: array-like, shape (ns,nt), optional + If None the initial transport plan of the solver is pq^T. + Otherwise G0 must satisfy marginal constraints and will be used as initial transport of the solver. + max_iter : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error computed on transport plans + log : bool, optional + record log if True + verbose : bool, optional + Print information along iterations + **kwargs : dict + Parameters can be directly passed to the ot.optim.cg solver. + + Returns + ------- + srfgw-divergence : float + Semi-relaxed Fused gromov wasserstein divergence for the given parameters. + log : dict + Log dictionary return only if log==True in parameters. + + + .. _references-semirelaxed-fused-gromov-wasserstein2: + References + ---------- + .. [48] Cédric Vincent-Cuaz, Rémi Flamary, Marco Corneli, Titouan Vayer, Nicolas Courty. + "Semi-relaxed Gromov-Wasserstein divergence and applications on graphs" + International Conference on Learning Representations (ICLR), 2022. + """ + T, log_srfgw = entropic_semirelaxed_fused_gromov_wasserstein( + M, C1, C2, p, loss_fun, symmetric, epsilon, alpha, G0, + max_iter, tol, log=True, verbose=verbose, **kwargs) + + log_srfgw['T'] = T + + if log: + return log_srfgw['srfgw_dist'], log_srfgw + else: + return log_srfgw['srfgw_dist'] diff --git a/ot/gromov/_utils.py b/ot/gromov/_utils.py index ef8cd88..0b8bb00 100644 --- a/ot/gromov/_utils.py +++ b/ot/gromov/_utils.py @@ -324,6 +324,49 @@ def update_kl_loss(p, lambdas, T, Cs): return nx.exp(tmpsum / ppt) +def update_feature_matrix(lambdas, Ys, Ts, p): + r"""Updates the feature with respect to the `S` :math:`\mathbf{T}_s` couplings. + + + See "Solving the barycenter problem with Block Coordinate Descent (BCD)" + in :ref:`[24] ` calculated at each iteration + + Parameters + ---------- + p : array-like, shape (N,) + masses in the targeted barycenter + lambdas : list of float + List of the `S` spaces' weights + Ts : list of S array-like, shape (ns,N) + The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration + Ys : list of S array-like, shape (d,ns) + The features. + + Returns + ------- + X : array-like, shape (`d`, `N`) + + + .. _references-update-feature-matrix: + References + ---------- + .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas + "Optimal Transport for structured data with application on graphs" + International Conference on Machine Learning (ICML). 2019. + """ + p = list_to_array(p) + Ts = list_to_array(*Ts) + Ys = list_to_array(*Ys) + nx = get_backend(*Ys, *Ts, p) + + p = 1. / p + tmpsum = sum([ + lambdas[s] * nx.dot(Ys[s], Ts[s].T) * p[None, :] + for s in range(len(Ts)) + ]) + return tmpsum + + def init_matrix_semirelaxed(C1, C2, p, loss_fun='square_loss', nx=None): r"""Return loss matrices and tensors for semi-relaxed Gromov-Wasserstein fast computation -- cgit v1.2.3