From 9c6ac880d426b7577918b0c77bd74b3b01930ef6 Mon Sep 17 00:00:00 2001 From: ncassereau-idris <84033440+ncassereau-idris@users.noreply.github.com> Date: Wed, 3 Nov 2021 17:29:16 +0100 Subject: [MRG] Docs updates (#298) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bregman docs * sliced docs * docs partial * unbalanced docs * stochastic docs * plot docs * datasets docs * utils docs * dr docs * dr docs corrected * smooth docs * docs da * pep8 * docs gromov * more space after min and argmin * docs lp * bregman docs * bregman docs mistake corrected * pep8 Co-authored-by: Rémi Flamary --- ot/bregman.py | 236 ++++++++++++++------------ ot/da.py | 499 ++++++++++++++++++++++++++++++------------------------ ot/datasets.py | 12 +- ot/dr.py | 40 +++-- ot/gromov.py | 35 ++-- ot/lp/__init__.py | 100 ++++++----- ot/optim.py | 8 +- ot/partial.py | 283 +++++++++++++++++-------------- ot/plot.py | 10 +- ot/sliced.py | 7 +- ot/smooth.py | 179 ++++++++++++-------- ot/stochastic.py | 192 ++++++++++----------- ot/unbalanced.py | 206 +++++++++++----------- ot/utils.py | 118 ++++++------- 14 files changed, 1048 insertions(+), 877 deletions(-) diff --git a/ot/bregman.py b/ot/bregman.py index 786f151..cce52e2 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -33,7 +33,8 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -45,9 +46,9 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -70,7 +71,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, version of the sinkhorn :py:func:`ot.bregman.screenkhorn` aim at providing a fast approximation of the Sinkhorn problem. For use of GPU and gradient computation with small number of iterations we strongly recommend the - :any:`ot.bregman.sinkhorn_log` solver that will no need to check for + :py:func:`ot.bregman.sinkhorn_log` solver that will no need to check for numerical problems. @@ -189,7 +190,8 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. 
\ \gamma \mathbf{1} &= \mathbf{a} @@ -201,9 +203,9 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -217,17 +219,17 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, By default and when using a regularization parameter that is not too small the default sinkhorn solver should be enough. If you need to use a small regularization to get sharper OT matrices, you should use the - :any:`ot.bregman.sinkhorn_log` solver that will avoid numerical + :py:func:`ot.bregman.sinkhorn_log` solver that will avoid numerical errors. This last solver can be very slow in practice and might not even converge to a reasonable OT matrix in a finite time. This is why - :any:`ot.bregman.sinkhorn_epsilon_scaling` that relies on iterating the value + :py:func:`ot.bregman.sinkhorn_epsilon_scaling` that relies on iterating the value of the regularization (and using warm start) sometimes leads to better solutions. Note that the greedy version of the sinkhorn - :any:`ot.bregman.greenkhorn` can also lead to a speedup and the screening - version of the sinkhorn :any:`ot.bregman.screenkhorn` aim a providing a + :py:func:`ot.bregman.greenkhorn` can also lead to a speedup and the screening + version of the sinkhorn :py:func:`ot.bregman.screenkhorn` aim a providing a fast approximation of the Sinkhorn problem. For use of GPU and gradient computation with small number of iterations we strongly recommend the - :any:`ot.bregman.sinkhorn_log` solver that will no need to check for + :py:func:`ot.bregman.sinkhorn_log` solver that will no need to check for numerical problems. Parameters @@ -301,15 +303,15 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, divergences. In The 22nd International Conference on Artificial Intelligence and Statistics (pp. 2681-2690). PMLR. + See Also -------- ot.lp.emd : Unregularized OT ot.optim.cg : General regularized OT ot.bregman.sinkhorn_knopp : Classic Sinkhorn :ref:`[2] ` ot.bregman.greenkhorn : Greenkhorn :ref:`[21] ` - ot.bregman.sinkhorn_stabilized: Stabilized sinkhorn :ref:`[9] ` - :ref:`[10] ` - + ot.bregman.sinkhorn_stabilized: Stabilized sinkhorn + :ref:`[9] ` :ref:`[10] ` """ M, a, b = list_to_array(M, a, b) @@ -362,7 +364,8 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. 
\ \gamma \mathbf{1} &= \mathbf{a} @@ -373,9 +376,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[2] ` @@ -543,7 +546,8 @@ def sinkhorn_log(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -553,12 +557,13 @@ def sinkhorn_log(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, where : - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\Omega` is the entropic regularization term + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix - scaling algorithm :ref:`[2] ` with the - implementation from :ref:`[34] ` + scaling algorithm :ref:`[2] ` with the + implementation from :ref:`[34] ` Parameters @@ -744,7 +749,8 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -755,9 +761,9 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) Parameters @@ -903,7 +909,8 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. 
\ \gamma \mathbf{1} &= \mathbf{a} @@ -914,9 +921,9 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix @@ -1145,20 +1152,24 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, Solve the entropic regularization optimal transport problem with log stabilization and epsilon scaling. The function solves the following optimization problem: + .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 + where : + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights - (histograms, both sum to 1) + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (histograms, both sum to 1) + The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[2] ` but with the log stabilization @@ -1340,17 +1351,17 @@ def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000, The function solves the following optimization problem: .. math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - - :math:`OT_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn`) - if `method` is `sinkhorn` or `sinkhorn_stabilized` or `sinkhorn_log`. + - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein + distance (see :py:func:`ot.bregman.sinkhorn`) + if `method` is `sinkhorn` or `sinkhorn_stabilized` or `sinkhorn_log`. - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[3] ` @@ -1424,16 +1435,16 @@ def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000, The function solves the following optimization problem: .. 
math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance - (see :py:func:`ot.bregman.sinkhorn`) + (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[3]`. @@ -1598,16 +1609,16 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000, The function solves the following optimization problem: .. math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn`) + distance (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[3] ` @@ -1736,24 +1747,24 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 The function solves the following optimization problem: .. math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`S_{reg}(\cdot,\cdot)` is the debiased Sinkhorn divergence - (see :py:func:`ot.bregman.emirical_sinkhorn_divergence`) + (see :py:func:`ot.bregman.empirical_sinkhorn_divergence`) - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the debiased Sinkhorn - algorithm as proposed in :ref:`[37] ` + algorithm as proposed in :ref:`[37] ` Parameters ---------- A : array-like, shape (dim, n_hists) - `n_hists` training distributions :math:`a_i` of size `dim` + `n_hists` training distributions :math:`\mathbf{a}_i` of size `dim` M : array-like, shape (dim, dim) loss matrix for OT reg : float @@ -1761,7 +1772,7 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 method : str (optional) method used for the solver either 'sinkhorn' or 'sinkhorn_log' weights : array-like, shape (n_hists,) - Weights of each histogram :math:`a_i` on the simplex (barycentric coodinates) + Weights of each histogram :math:`\mathbf{a}_i` on the simplex (barycentric coodinates) numItermax : int, optional Max number of iterations stopThr : float, optional @@ -1774,7 +1785,6 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 if True, raises a warning if the algorithm doesn't convergence. 
- Returns ------- a : (dim,) array-like @@ -1782,12 +1792,12 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 log : dict log dictionary return only if log==True in parameters - .. _references-sinkhorn-debiased: - References - ---------- - .. [37] Janati, H., Cuturi, M., Gramfort, A. Proceedings of the 37th International - Conference on Machine Learning, PMLR 119:4692-4701, 2020 + .. _references-barycenter-debiased: + References + ---------- + .. [37] Janati, H., Cuturi, M., Gramfort, A. Proceedings of the 37th International + Conference on Machine Learning, PMLR 119:4692-4701, 2020 """ if method.lower() == 'sinkhorn': @@ -1934,20 +1944,20 @@ def _barycenter_debiased_log(A, M, reg, weights=None, numItermax=1000, def convolutional_barycenter2d(A, reg, weights=None, method="sinkhorn", numItermax=10000, stopThr=1e-4, verbose=False, log=False, warn=True, **kwargs): - r"""Compute the entropic regularized wasserstein barycenter of distributions A - where A is a collection of 2D images. + r"""Compute the entropic regularized wasserstein barycenter of distributions :math:`\mathbf{A}` + where :math:`\mathbf{A}` is a collection of 2D images. The function solves the following optimization problem: .. math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn`) + distance (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{a}_i` are training distributions (2D images) in the mast two dimensions - of matrix :math:`\mathbf{A}` + of matrix :math:`\mathbf{A}` - `reg` is the regularization strength scalar value The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm @@ -2166,24 +2176,24 @@ def convolutional_barycenter2d_debiased(A, reg, weights=None, method="sinkhorn", numItermax=10000, stopThr=1e-3, verbose=False, log=False, warn=True, **kwargs): - r"""Compute the debiased sinkhorn barycenter of distributions A - where A is a collection of 2D images. + r"""Compute the debiased sinkhorn barycenter of distributions :math:`\mathbf{A}` + where :math:`\mathbf{A}` is a collection of 2D images. The function solves the following optimization problem: .. math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`S_{reg}(\cdot,\cdot)` is the debiased entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn_debiased`) + distance (see :py:func:`ot.bregman.barycenter_debiased`) - :math:`\mathbf{a}_i` are training distributions (2D images) in the mast two - dimensions of matrix :math:`\mathbf{A}` + dimensions of matrix :math:`\mathbf{A}` - `reg` is the regularization strength scalar value The algorithm used for solving the problem is the debiased Sinkhorn scaling - algorithm as proposed in :ref:`[37] ` + algorithm as proposed in :ref:`[37] ` Parameters ---------- @@ -2217,7 +2227,7 @@ def convolutional_barycenter2d_debiased(A, reg, weights=None, method="sinkhorn", log dictionary return only if log==True in parameters - .. _references-sinkhorn-debiased: + .. _references-convolutional-barycenter2d-debiased: References ---------- @@ -2406,23 +2416,25 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, .. 
math:: - \mathbf{h} = \mathop{\arg \min}_\mathbf{h} (1- \alpha) W_{\mathbf{M}, \mathrm{reg}}(\mathbf{a},\mathbf{Dh})+\alpha W_{\mathbf{M_0},\mathrm{reg}_0}(\mathbf{h}_0,\mathbf{h}) + \mathbf{h} = \mathop{\arg \min}_\mathbf{h} \quad + (1 - \alpha) W_{\mathbf{M}, \mathrm{reg}}(\mathbf{a}, \mathbf{Dh}) + + \alpha W_{\mathbf{M_0}, \mathrm{reg}_0}(\mathbf{h}_0, \mathbf{h}) where : - :math:`W_{M,reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance - with M loss matrix (see :py:func:`ot.bregman.sinkhorn`) + with :math:`\mathbf{M}` loss matrix (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{D}` is a dictionary of `n_atoms` atoms of dimension `dim_a`, - its expected shape is `(dim_a, n_atoms)` + its expected shape is `(dim_a, n_atoms)` - :math:`\mathbf{h}` is the estimated unmixing of dimension `n_atoms` - :math:`\mathbf{a}` is an observed distribution of dimension `dim_a` - :math:`\mathbf{h}_0` is a prior on :math:`\mathbf{h}` of dimension `dim_prior` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and the - cost matrix (`dim_a`, `dim_a`) for OT data fitting + cost matrix (`dim_a`, `dim_a`) for OT data fitting - `reg`:math:`_0` and :math:`\mathbf{M_0}` are respectively the regularization - term and the cost matrix (`dim_prior`, `n_atoms`) regularization - - :math:`\\alpha` weight data fitting and regularization + term and the cost matrix (`dim_prior`, `n_atoms`) regularization + - :math:`\alpha` weight data fitting and regularization The optimization problem is solved following the algorithm described in :ref:`[4] ` @@ -2535,7 +2547,7 @@ def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, .. math:: - \mathbf{h} = \mathop{\arg \min}_{\mathbf{h}} \sum_{k=1}^{K} \lambda_k + \mathbf{h} = \mathop{\arg \min}_{\mathbf{h}} \quad \sum_{k=1}^{K} \lambda_k W_{reg}((\mathbf{D}_2^{(k)} \mathbf{h})^T, \mathbf{a}) s.t. \ \forall k, \mathbf{D}_1^{(k)} \gamma_k \mathbf{1}_n= \mathbf{h} @@ -2544,15 +2556,15 @@ def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, - :math:`\lambda_k` is the weight of `k`-th source domain - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance - (see :py:func:`ot.bregman.sinkhorn`) + (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{D}_2^{(k)}` is a matrix of weights related to `k`-th source domain - defined as in [p. 5, :ref:`27 `], its expected shape - is :math:`(n_k, C)` where :math:`n_k` is the number of elements in the `k`-th source - domain and `C` is the number of classes + defined as in [p. 5, :ref:`27 `], its expected shape + is :math:`(n_k, C)` where :math:`n_k` is the number of elements in the `k`-th source + domain and `C` is the number of classes - :math:`\mathbf{h}` is a vector of estimated proportions in the target domain of size `C` - :math:`\mathbf{a}` is a uniform vector of weights in the target domain of size `n` - :math:`\mathbf{D}_1^{(k)}` is a matrix of class assignments defined as in - [p. 5, :ref:`27 `], its expected shape is :math:`(n_k, C)` + [p. 5, :ref:`27 `], its expected shape is :math:`(n_k, C)` The problem consist in solving a Wasserstein barycenter problem to estimate the proportions :math:`\mathbf{h}` in the target domain. @@ -2714,18 +2726,19 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', The function solves the following optimization problem: .. 
math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \ \gamma \mathbf{1} &= a + s.t. \ \gamma \mathbf{1} &= \mathbf{a} - \gamma^T \mathbf{1} &= b + \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 where : - :math:`\mathbf{M}` is the (`n_samples_a`, `n_samples_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) @@ -2900,18 +2913,19 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \ \gamma \mathbf{1} &= a + s.t. \ \gamma \mathbf{1} &= \mathbf{a} - \gamma^T \mathbf{1} &= b + \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 where : - :math:`\mathbf{M}` is the (`n_samples_a`, `n_samples_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) @@ -3055,18 +3069,21 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli .. math:: - W &= \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma) + W &= \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - W_a &= \min_{\gamma_a} <\gamma_a, \mathbf{M_a}>_F + \mathrm{reg} \cdot\Omega(\gamma_a) + W_a &= \min_{\gamma_a} \quad \langle \gamma_a, \mathbf{M_a} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma_a) - W_b &= \min_{\gamma_b} <\gamma_b, \mathbf{M_b}>_F + \mathrm{reg} \cdot\Omega(\gamma_b) + W_b &= \min_{\gamma_b} \quad \langle \gamma_b, \mathbf{M_b} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma_b) S &= W - \frac{W_a + W_b}{2} .. math:: - s.t. \ \gamma \mathbf{1} &= a + s.t. \ \gamma \mathbf{1} &= \mathbf{a} - \gamma^T \mathbf{1} &= b + \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 @@ -3084,10 +3101,10 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli where : - :math:`\mathbf{M}` (resp. :math:`\mathbf{M_a}`, :math:`\mathbf{M_b}`) - is the (`n_samples_a`, `n_samples_b`) metric cost matrix - (resp (`n_samples_a, n_samples_a`) and (`n_samples_b`, `n_samples_b`)) + is the (`n_samples_a`, `n_samples_b`) metric cost matrix + (resp (`n_samples_a, n_samples_a`) and (`n_samples_b`, `n_samples_b`)) - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) @@ -3198,7 +3215,10 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, .. 
math:: - (\mathbf{u}, \mathbf{v}) = \mathop{\arg \min}_{\mathbf{u}, \mathbf{v}} \ \mathbf{1}_{ns}^T \mathbf{B}(\mathbf{u}, \mathbf{v}) \mathbf{1}_{nt} - <\kappa \mathbf{u}, \mathbf{a}> - <\mathbf{v} / \kappa, \mathbf{b}> + (\mathbf{u}, \mathbf{v}) = \mathop{\arg \min}_{\mathbf{u}, \mathbf{v}} \quad + \mathbf{1}_{ns}^T \mathbf{B}(\mathbf{u}, \mathbf{v}) \mathbf{1}_{nt} - + \langle \kappa \mathbf{u}, \mathbf{a} \rangle - + \langle \frac{1}{\kappa} \mathbf{v}, \mathbf{b} \rangle where: @@ -3249,13 +3269,15 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, If `True`, display informations about the cardinals of the active sets and the parameters kappa and epsilon - Dependency - ---------- - To gain more efficiency, screenkhorn needs to call the "Bottleneck" - package (https://pypi.org/project/Bottleneck/) - in the screening pre-processing step. If Bottleneck isn't installed, - the following error message appears: - "Bottleneck module doesn't exist. Install it from https://pypi.org/project/Bottleneck/" + + .. admonition:: Dependency + + To gain more efficiency, :py:func:`ot.bregman.screenkhorn` needs to call the "Bottleneck" + package (https://pypi.org/project/Bottleneck/) in the screening pre-processing step. + + If Bottleneck isn't installed, the following error message appears: + + "Bottleneck module doesn't exist. Install it from https://pypi.org/project/Bottleneck/" Returns diff --git a/ot/da.py b/ot/da.py index cdc747c..4fd97df 100644 --- a/ot/da.py +++ b/ot/da.py @@ -33,27 +33,29 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma) - + \eta \Omega_g(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot \Omega_e(\gamma) + \eta \ \Omega_g(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} + + \gamma^T \mathbf{1} = \mathbf{b} + + \gamma \geq 0 - \gamma^T 1= b - \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e (\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\Omega_g` is the group lasso regularization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1` - where :math:`\mathcal{I}_c` are the index of samples from class c + where :math:`\mathcal{I}_c` are the index of samples from class `c` in the source domain. - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the generalized conditional - gradient as proposed in [5]_ [7]_ + gradient as proposed in :ref:`[5, 7] `. Parameters @@ -84,19 +86,20 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-sinkhorn-lpl1-mm: References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. 
arXiv preprint arXiv:1510.06567. @@ -144,27 +147,29 @@ def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10, The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ - \eta \Omega_g(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot \Omega_e(\gamma) + \eta \ \Omega_g(\gamma) + + s.t. \ \gamma \mathbf{1} = \mathbf{a} - s.t. \gamma 1 = a + \gamma^T \mathbf{1} = \mathbf{b} + + \gamma \geq 0 - \gamma^T 1= b - \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\Omega_g` is the group lasso regulaization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^2` where :math:`\mathcal{I}_c` are the index of samples from class - c in the source domain. - - a and b are source and target weights (sum to 1) + `c` in the source domain. + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the generalised conditional - gradient as proposed in [5]_ [7]_ + gradient as proposed in :ref:`[5, 7] `. Parameters @@ -195,18 +200,19 @@ def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-sinkhorn-l1l2-gl: References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. @@ -245,38 +251,40 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, verbose2=False, numItermax=100, numInnerItermax=10, stopInnerThr=1e-6, stopThr=1e-5, log=False, **kwargs): - r"""Joint OT and linear mapping estimation as proposed in [8] + r"""Joint OT and linear mapping estimation as proposed in + :ref:`[8] `. The function solves the following optimization problem: .. math:: - \min_{\gamma,L}\quad \|L(X_s) -n_s\gamma X_t\|^2_F + - \mu<\gamma,M>_F + \eta \|L -I\|^2_F + \min_{\gamma,L}\quad \|L(\mathbf{X_s}) - n_s\gamma \mathbf{X_t} \|^2_F + + \mu \langle \gamma, \mathbf{M} \rangle_F + \eta \|L - \mathbf{I}\|^2_F + + s.t. \ \gamma \mathbf{1} = \mathbf{a} - s.t. 
\gamma 1 = a + \gamma^T \mathbf{1} = \mathbf{b} - \gamma^T 1= b + \gamma \geq 0 - \gamma\geq 0 where : - - M is the (ns,nt) squared euclidean cost matrix between samples in - Xs and Xt (scaled by ns) - - :math:`L` is a dxd linear operator that approximates the barycentric + - :math:`\mathbf{M}` is the (`ns`, `nt`) squared euclidean cost matrix between samples in + :math:`\mathbf{X_s}` and :math:`\mathbf{X_t}` (scaled by :math:`n_s`) + - :math:`L` is a :math:`d\times d` linear operator that approximates the barycentric mapping - - :math:`I` is the identity matrix (neutral linear mapping) - - a and b are uniform source and target weights + - :math:`\mathbf{I}` is the identity matrix (neutral linear mapping) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are uniform source and target weights The problem consist in solving jointly an optimal transport matrix :math:`\gamma` and a linear mapping that fits the barycentric mapping - :math:`n_s\gamma X_t`. + :math:`n_s\gamma \mathbf{X_t}`. One can also estimate a mapping with constant bias (see supplementary - material of [8]) using the bias optional argument. + material of :ref:`[8] `) using the bias optional argument. The algorithm used for solving the problem is the block coordinate - descent that alternates between updates of G (using conditionnal gradient) - and the update of L using a classical least square solver. + descent that alternates between updates of :math:`\mathbf{G}` (using conditionnal gradient) + and the update of :math:`\mathbf{L}` using a classical least square solver. Parameters @@ -307,17 +315,17 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters - L : (d x d) ndarray - Linear mapping matrix (d+1 x d if bias) + L : (d, d) ndarray + Linear mapping matrix ((:math:`d+1`, `d`) if bias) log : dict log dictionary return only if log==True in parameters + .. _references-joint-OT-mapping-linear: References ---------- - .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016. @@ -434,37 +442,41 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', numItermax=100, numInnerItermax=10, stopInnerThr=1e-6, stopThr=1e-5, log=False, **kwargs): - r"""Joint OT and nonlinear mapping estimation with kernels as proposed in [8] + r"""Joint OT and nonlinear mapping estimation with kernels as proposed in + :ref:`[8] `. The function solves the following optimization problem: .. math:: - \min_{\gamma,L\in\mathcal{H}}\quad \|L(X_s) - - n_s\gamma X_t\|^2_F + \mu<\gamma,M>_F + \eta \|L\|^2_\mathcal{H} + \min_{\gamma, L\in\mathcal{H}}\quad \|L(\mathbf{X_s}) - + n_s\gamma \mathbf{X_t}\|^2_F + \mu \langle \gamma, \mathbf{M} \rangle_F + + \eta \|L\|^2_\mathcal{H} + + s.t. \ \gamma \mathbf{1} = \mathbf{a} + + \gamma^T \mathbf{1} = \mathbf{b} - s.t. 
\gamma 1 = a + \gamma \geq 0 - \gamma^T 1= b - \gamma\geq 0 where : - - M is the (ns,nt) squared euclidean cost matrix between samples in - Xs and Xt (scaled by ns) - - :math:`L` is a ns x d linear operator on a kernel matrix that + - :math:`\mathbf{M}` is the (`ns`, `nt`) squared euclidean cost matrix between samples in + :math:`\mathbf{X_s}` and :math:`\mathbf{X_t}` (scaled by :math:`n_s`) + - :math:`L` is a :math:`n_s \times d` linear operator on a kernel matrix that approximates the barycentric mapping - - a and b are uniform source and target weights + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are uniform source and target weights The problem consist in solving jointly an optimal transport matrix :math:`\gamma` and the nonlinear mapping that fits the barycentric mapping - :math:`n_s\gamma X_t`. + :math:`n_s\gamma \mathbf{X_t}`. One can also estimate a mapping with constant bias (see supplementary - material of [8]) using the bias optional argument. + material of :ref:`[8] `) using the bias optional argument. The algorithm used for solving the problem is the block coordinate - descent that alternates between updates of G (using conditionnal gradient) - and the update of L using a classical kernel least square solver. + descent that alternates between updates of :math:`\mathbf{G}` (using conditionnal gradient) + and the update of :math:`\mathbf{L}` using a classical kernel least square solver. Parameters @@ -478,7 +490,7 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', eta : float, optional Regularization term for the linear mapping L (>0) kerneltype : str,optional - kernel used by calling function ot.utils.kernel (gaussian by default) + kernel used by calling function :py:func:`ot.utils.kernel` (gaussian by default) sigma : float, optional Gaussian kernel bandwidth. bias : bool,optional @@ -501,17 +513,17 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters - L : (ns x d) ndarray - Nonlinear mapping matrix (ns+1 x d if bias) + L : (ns, d) ndarray + Nonlinear mapping matrix ((:math:`n_s+1`, `d`) if bias) log : dict log dictionary return only if log==True in parameters + .. _references-joint-OT-mapping-kernel: References ---------- - .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016. @@ -645,26 +657,27 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, wt=None, bias=True, log=False): - r""" return OT linear operator between samples + r"""Return OT linear operator between samples. The function estimates the optimal linear operator that aligns the two empirical distributions. This is equivalent to estimating the closed - form mapping between two Gaussian distributions :math:`N(\mu_s,\Sigma_s)` - and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in remark - 2.29 in [15]. + form mapping between two Gaussian distributions :math:`\mathcal{N}(\mu_s,\Sigma_s)` + and :math:`\mathcal{N}(\mu_t,\Sigma_t)` as proposed in + :ref:`[14] ` and discussed in remark 2.29 in + :ref:`[15] `. The linear operator from source to target :math:`M` .. math:: - M(x)=Ax+b + M(\mathbf{x})= \mathbf{A} \mathbf{x} + \mathbf{b} where : .. 
math:: - A=\Sigma_s^{-1/2}(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2})^{1/2} + \mathbf{A} &= \Sigma_s^{-1/2} \left(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2} \right)^{1/2} \Sigma_s^{-1/2} - .. math:: - b=\mu_t-A\mu_s + + \mathbf{b} &= \mu_t - \mathbf{A} \mu_s Parameters ---------- @@ -673,35 +686,35 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, xt : np.ndarray (nt,d) samples in the target domain reg : float,optional - regularization added to the diagonals of convariances (>0) + regularization added to the diagonals of covariances (>0) ws : np.ndarray (ns,1), optional weights for the source samples wt : np.ndarray (ns,1), optional weights for the target samples bias: boolean, optional - estimate bias b else b=0 (default:True) + estimate bias :math:`\mathbf{b}` else :math:`\mathbf{b} = 0` (default:True) log : bool, optional record log if True Returns ------- - A : (d x d) ndarray + A : (d, d) ndarray Linear operator - b : (1 x d) ndarray + b : (1, d) ndarray bias log : dict log dictionary return only if log==True in parameters + .. _references-OT-mapping-linear: References ---------- - .. [14] Knott, M. and Smith, C. S. "On the optimal mapping of distributions", Journal of Optimization Theory and Applications Vol 43, 1984 - .. [15] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + .. [15] Peyré, G., & Cuturi, M. (2017). "Computational Optimal Transport", 2018. @@ -754,24 +767,34 @@ def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, al r"""Solve the optimal transport problem (OT) with Laplacian regularization .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + eta\Omega_\alpha(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \eta \cdot \Omega_\alpha(\gamma) - s.t.\ \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} - \gamma\geq 0 + \gamma \geq 0 where: - - a and b are source and target weights (sum to 1) - - xs and xt are source and target samples - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) + - :math:`\mathbf{x_s}` and :math:`\mathbf{x_t}` are source and target samples + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega_\alpha` is the Laplacian regularization term - :math:`\Omega_\alpha = (1-\alpha)/n_s^2\sum_{i,j}S^s_{i,j}\|T(\mathbf{x}^s_i)-T(\mathbf{x}^s_j)\|^2+\alpha/n_t^2\sum_{i,j}S^t_{i,j}^'\|T(\mathbf{x}^t_i)-T(\mathbf{x}^t_j)\|^2` - with :math:`S^s_{i,j}, S^t_{i,j}` denoting source and target similarity matrices and :math:`T(\cdot)` being a barycentric mapping - The algorithm used for solving the problem is the conditional gradient algorithm as proposed in [5]. + .. math:: + \Omega_\alpha = \frac{1 - \alpha}{n_s^2} \sum_{i,j} + \mathbf{S^s}_{i,j} \|T(\mathbf{x}^s_i) - T(\mathbf{x}^s_j) \|^2 + + \frac{\alpha}{n_t^2} \sum_{i,j} + \mathbf{S^t}_{i,j} \|T(\mathbf{x}^t_i) - T(\mathbf{x}^t_j) \|^2 + + + with :math:`\mathbf{S^s}_{i,j}, \mathbf{S^t}_{i,j}` denoting source and target similarity + matrices and :math:`T(\cdot)` being a barycentric mapping. + + The algorithm used for solving the problem is the conditional gradient algorithm as proposed in + :ref:`[5] `. 
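[Editor's note — hedged usage sketch, not part of this patch: the hunk above documents the Laplacian-regularized solver `ot.da.emd_laplace` solved by conditional gradient. The snippet below is only an illustration of how such a call typically looks; the sample sizes, the `eta`/`alpha` values and the kNN `sim_param` are arbitrary choices, not values prescribed by the patch.]

# Minimal sketch (illustrative): Laplacian-regularized OT between two small
# point clouds with ot.da.emd_laplace, matching the signature shown above.
import numpy as np
import ot

rng = np.random.RandomState(0)
ns, nt = 20, 30
xs = rng.randn(ns, 2)             # source samples
xt = rng.randn(nt, 2) + 1.0       # shifted target samples

a, b = ot.unif(ns), ot.unif(nt)   # uniform source/target weights
M = ot.dist(xs, xt)               # squared Euclidean cost matrix, shape (ns, nt)

# 'knn' similarity graphs with 5 neighbours; eta/alpha are illustrative here
G = ot.da.emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=5,
                      reg='pos', eta=1.0, alpha=0.5)
# G is the (ns, nt) coupling used for the barycentric mapping T(.)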
Parameters ---------- @@ -811,22 +834,23 @@ def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, al Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-emd-laplace: References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE - Transactions on Pattern Analysis and Machine Intelligence , + Transactions on Pattern Analysis and Machine Intelligence, vol.PP, no.99, pp.1-1 + .. [30] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," - in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. See Also -------- @@ -882,7 +906,7 @@ def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, al def distribution_estimation_uniform(X): - """estimates a uniform distribution from an array of samples X + """estimates a uniform distribution from an array of samples :math:`\mathbf{X}` Parameters ---------- @@ -892,7 +916,7 @@ def distribution_estimation_uniform(X): Returns ------- mu : array-like, shape (n_samples,) - The uniform distribution estimated from X + The uniform distribution estimated from :math:`\mathbf{X}` """ return unif(X.shape[0]) @@ -902,32 +926,32 @@ class BaseTransport(BaseEstimator): """Base class for OTDA objects - Notes - ----- - All estimators should specify all the parameters that can be set - at the class level in their ``__init__`` as explicit keyword - arguments (no ``*args`` or ``**kwargs``). + .. note:: + All estimators should specify all the parameters that can be set + at the class level in their ``__init__`` as explicit keyword + arguments (no ``*args`` or ``**kwargs``). - the fit method should: + The fit method should: - estimate a cost matrix and store it in a `cost_` attribute - - estimate a coupling matrix and store it in a `coupling_` - attribute + - estimate a coupling matrix and store it in a `coupling_` attribute - estimate distributions from source and target data and store them in - mu_s and mu_t attributes - - store Xs and Xt in attributes to be used later on in transform and - inverse_transform methods + `mu_s` and `mu_t` attributes + - store `Xs` and `Xt` in attributes to be used later on in `transform` and + `inverse_transform` methods + + `transform` method should always get as input a `Xs` parameter - transform method should always get as input a Xs parameter - inverse_transform method should always get as input a Xt parameter + `inverse_transform` method should always get as input a `Xt` parameter - transform_labels method should always get as input a ys parameter - inverse_transform_labels method should always get as input a yt parameter + `transform_labels` method should always get as input a `ys` parameter + + `inverse_transform_labels` method should always get as input a `yt` parameter """ def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -938,8 +962,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The training input samples. 
yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -987,8 +1011,8 @@ class BaseTransport(BaseEstimator): def fit_transform(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) and transports source samples Xs onto target - ones Xt + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` + and transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -999,8 +1023,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1014,7 +1038,7 @@ class BaseTransport(BaseEstimator): return self.fit(Xs, ys, Xt, yt).transform(Xs, ys, Xt, yt) def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -1025,8 +1049,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The target input samples. yt : array-like, shape (n_target_samples,) - The class labels for target. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels for target. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1081,7 +1105,8 @@ class BaseTransport(BaseEstimator): return transp_Xs def transform_labels(self, ys=None): - """Propagate source labels ys to obtain estimated target labels as in [27] + """Propagate source labels :math:`\mathbf{y_s}` to obtain estimated target labels as in + :ref:`[27] `. Parameters ---------- @@ -1093,9 +1118,10 @@ class BaseTransport(BaseEstimator): transp_ys : array-like, shape (n_target_samples, nb_classes) Estimated soft target labels. + + .. _references-basetransport-transform-labels: References ---------- - .. [27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia "Optimal transport for multi-source domain adaptation under target shift", International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. @@ -1126,7 +1152,7 @@ class BaseTransport(BaseEstimator): def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports target samples Xt onto source samples Xs + """Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}` Parameters ---------- @@ -1137,8 +1163,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The target input samples. yt : array-like, shape (n_target_samples,) - The target class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The target class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. 
Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1192,7 +1218,8 @@ class BaseTransport(BaseEstimator): return transp_Xt def inverse_transform_labels(self, yt=None): - """Propagate target labels yt to obtain estimated source labels ys + """Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels + :math:`\mathbf{y_s}` Parameters ---------- @@ -1232,35 +1259,37 @@ class LinearTransport(BaseTransport): The function estimates the optimal linear operator that aligns the two empirical distributions. This is equivalent to estimating the closed - form mapping between two Gaussian distributions :math:`N(\mu_s,\Sigma_s)` - and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in - remark 2.29 in [15]. + form mapping between two Gaussian distributions :math:`\mathcal{N}(\mu_s,\Sigma_s)` + and :math:`\mathcal{N}(\mu_t,\Sigma_t)` as proposed in + :ref:`[14] ` and discussed in remark 2.29 in + :ref:`[15] `. The linear operator from source to target :math:`M` .. math:: - M(x)=Ax+b + M(\mathbf{x})= \mathbf{A} \mathbf{x} + \mathbf{b} where : .. math:: - A=\Sigma_s^{-1/2}(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2})^{1/2} + \mathbf{A} &= \Sigma_s^{-1/2} \left(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2} \right)^{1/2} \Sigma_s^{-1/2} - .. math:: - b=\mu_t-A\mu_s + + \mathbf{b} &= \mu_t - \mathbf{A} \mu_s Parameters ---------- reg : float,optional - regularization added to the daigonals of convariances (>0) + regularization added to the daigonals of covariances (>0) bias: boolean, optional - estimate bias b else b=0 (default:True) + estimate bias :math:`\mathbf{b}` else :math:`\mathbf{b} = 0` (default:True) log : bool, optional record log if True + + .. _references-lineartransport: References ---------- - .. [14] Knott, M. and Smith, C. S. "On the optimal mapping of distributions", Journal of Optimization Theory and Applications Vol 43, 1984 @@ -1279,7 +1308,7 @@ class LinearTransport(BaseTransport): def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1290,8 +1319,8 @@ class LinearTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1325,7 +1354,7 @@ class LinearTransport(BaseTransport): return self def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -1336,8 +1365,8 @@ class LinearTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. 
Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1358,7 +1387,7 @@ class LinearTransport(BaseTransport): def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports target samples Xt onto target samples Xs + """Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}` Parameters ---------- @@ -1369,8 +1398,8 @@ class LinearTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1392,7 +1421,7 @@ class LinearTransport(BaseTransport): class SinkhornTransport(BaseTransport): - """Domain Adapatation OT method based on Sinkhorn Algorithm + """Domain Adaptation OT method based on Sinkhorn Algorithm Parameters ---------- @@ -1400,7 +1429,7 @@ class SinkhornTransport(BaseTransport): Entropic regularization parameter max_iter : int, float, optional (default=1000) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged tol : float, optional (default=10e-9) The precision required to stop the optimization algorithm. verbose : bool, optional (default=False) @@ -1417,8 +1446,8 @@ class SinkhornTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. - limit_max: float, optional (defaul=np.infty) + "ferradans" which uses the method proposed in :ref:`[6] `. + limit_max: float, optional (default=np.infty) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an cost defined by this variable @@ -1428,16 +1457,20 @@ class SinkhornTransport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-sinkhorntransport: References ---------- .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -1461,7 +1494,7 @@ class SinkhornTransport(BaseTransport): def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1472,8 +1505,8 @@ class SinkhornTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. 
If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1504,7 +1537,7 @@ class SinkhornTransport(BaseTransport): class EMDTransport(BaseTransport): - """Domain Adapatation OT method based on Earth Mover's Distance + """Domain Adaptation OT method based on Earth Mover's Distance Parameters ---------- @@ -1520,7 +1553,7 @@ class EMDTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1534,14 +1567,16 @@ class EMDTransport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling + + .. _references-emdtransport: References ---------- .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, - "Optimal Transport for Domain Adaptation," in IEEE Transactions - on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + "Optimal Transport for Domain Adaptation," in IEEE Transactions + on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). - Regularized discrete optimal transport. SIAM Journal on Imaging - Sciences, 7(3), 1853-1882. + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, metric="sqeuclidean", norm=None, log=False, @@ -1558,7 +1593,7 @@ class EMDTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1569,8 +1604,8 @@ class EMDTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1597,8 +1632,7 @@ class EMDTransport(BaseTransport): class SinkhornLpl1Transport(BaseTransport): - - """Domain Adapatation OT method based on sinkhorn algorithm + + r"""Domain Adaptation OT method based on sinkhorn algorithm + LpL1 class regularization. 
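[Editor's note — hedged usage sketch, not part of this patch: the class whose docstring is edited just above, `ot.da.SinkhornLpl1Transport`, follows the fit/transform API of `BaseTransport` described earlier in this diff. The snippet below is an illustration only; the regularization values and data shapes are made up for the example.]

# Minimal sketch (illustrative): class-regularized Sinkhorn transport.
import numpy as np
import ot

rng = np.random.RandomState(0)
Xs = rng.randn(40, 2)                       # source samples
ys = rng.randint(0, 2, 40)                  # source class labels (needed for LpL1)
Xt = rng.randn(50, 2) + 1.0                 # target samples

otda = ot.da.SinkhornLpl1Transport(reg_e=1.0, reg_cl=0.1)  # entropic + LpL1 terms
otda.fit(Xs=Xs, ys=ys, Xt=Xt)               # estimates otda.coupling_
Xs_mapped = otda.transform(Xs=Xs)           # barycentric mapping onto the target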
Parameters @@ -1609,7 +1643,7 @@ class SinkhornLpl1Transport(BaseTransport): Class regularization parameter max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop log : bool, optional (default=False) @@ -1628,8 +1662,8 @@ class SinkhornLpl1Transport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. - limit_max: float, optional (defaul=np.infty) + "ferradans" which uses the method proposed in :ref:`[6] `. + limit_max: float, optional (default=np.infty) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit a cost defined by limit_max. @@ -1639,16 +1673,19 @@ class SinkhornLpl1Transport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling + + .. _references-sinkhornlpl1transport: References ---------- - .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -1675,7 +1712,7 @@ class SinkhornLpl1Transport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1686,8 +1723,8 @@ class SinkhornLpl1Transport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1719,13 +1756,14 @@ class SinkhornLpl1Transport(BaseTransport): class EMDLaplaceTransport(BaseTransport): - """Domain Adapatation OT method based on Earth Mover's Distance with Laplacian regularization + """Domain Adaptation OT method based on Earth Mover's Distance with Laplacian regularization Parameters ---------- reg_type : string optional (default='pos') Type of the regularization term: 'pos' and 'disp' for - regularization term defined in [2] and [6], respectively. + regularization term defined in :ref:`[2] ` and + :ref:`[6] `, respectively. reg_lap : float, optional (default=1) Laplacian regularization parameter reg_src : float, optional (default=0.5) @@ -1756,24 +1794,27 @@ class EMDLaplaceTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. 
+ "ferradans" which uses the method proposed in :ref:`[6] `. Attributes ---------- coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling + + .. _references-emdlaplacetransport: References ---------- .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," - in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). - Regularized discrete optimal transport. SIAM Journal on Imaging - Sciences, 7(3), 1853-1882. + Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. """ def __init__(self, reg_type='pos', reg_lap=1., reg_src=1., metric="sqeuclidean", @@ -1799,7 +1840,7 @@ class EMDLaplaceTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1810,8 +1851,8 @@ class EMDLaplaceTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1840,8 +1881,8 @@ class EMDLaplaceTransport(BaseTransport): class SinkhornL1l2Transport(BaseTransport): - """Domain Adapatation OT method based on sinkhorn algorithm + - l1l2 class regularization. + """Domain Adaptation OT method based on sinkhorn algorithm + + L1L2 class regularization. Parameters ---------- @@ -1851,7 +1892,7 @@ class SinkhornL1l2Transport(BaseTransport): Class regularization parameter max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop tol : float, optional (default=10e-9) @@ -1870,7 +1911,7 @@ class SinkhornL1l2Transport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1881,18 +1922,21 @@ class SinkhornL1l2Transport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-sinkhornl1l2transport: References ---------- - .. [1] N. Courty; R. Flamary; D. Tuia; A. 
Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -1919,7 +1963,7 @@ class SinkhornL1l2Transport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1930,8 +1974,8 @@ class SinkhornL1l2Transport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1973,7 +2017,7 @@ class MappingTransport(BaseEstimator): mu : float, optional (default=1) Weight for the linear OT loss (>0) eta : float, optional (default=0.001) - Regularization term for the linear mapping L (>0) + Regularization term for the linear mapping `L` (>0) bias : bool, optional (default=False) Estimate linear mapping with constant bias metric : string, optional (default="sqeuclidean") @@ -2004,17 +2048,20 @@ class MappingTransport(BaseEstimator): ---------- coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling - mapping_ : array-like, shape (n_features (+ 1), n_features) - (if bias) for kernel == linear + mapping_ : The associated mapping - array-like, shape (n_source_samples (+ 1), n_features) - (if bias) for kernel == gaussian + + - array-like, shape (`n_features` (+ 1), `n_features`), + (if bias) for kernel == linear + + - array-like, shape (`n_source_samples` (+ 1), `n_features`), + (if bias) for kernel == gaussian log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + References ---------- - .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016. @@ -2042,7 +2089,8 @@ class MappingTransport(BaseEstimator): def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Builds an optimal coupling and estimates the associated mapping - from source and target sets of samples (Xs, ys) and (Xt, yt) + from source and target sets of samples + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -2053,8 +2101,8 @@ class MappingTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. 
Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2098,7 +2146,7 @@ class MappingTransport(BaseEstimator): return self def transform(self, Xs): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -2138,7 +2186,7 @@ class MappingTransport(BaseEstimator): class UnbalancedSinkhornTransport(BaseTransport): - """Domain Adapatation unbalanced OT method based on sinkhorn algorithm + """Domain Adaptation unbalanced OT method based on sinkhorn algorithm Parameters ---------- @@ -2151,7 +2199,7 @@ class UnbalancedSinkhornTransport(BaseTransport): 'sinkhorn_epsilon_scaling', see those function for specific parameters max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged tol : float, optional (default=10e-9) Stop threshold on error (inner sinkhorn solver) (>0) verbose : bool, optional (default=False) @@ -2168,7 +2216,7 @@ class UnbalancedSinkhornTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -2179,14 +2227,16 @@ class UnbalancedSinkhornTransport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-unbalancedsinkhorntransport: References ---------- - .. [1] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). - Scaling algorithms for unbalanced transport problems. arXiv preprint - arXiv:1607.05816. + Scaling algorithms for unbalanced transport problems. arXiv preprint + arXiv:1607.05816. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -2212,7 +2262,7 @@ class UnbalancedSinkhornTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -2223,8 +2273,8 @@ class UnbalancedSinkhornTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2258,7 +2308,7 @@ class UnbalancedSinkhornTransport(BaseTransport): class JCPOTTransport(BaseTransport): - """Domain Adapatation OT method for multi-source target shift based on Wasserstein barycenter algorithm. + """Domain Adaptation OT method for multi-source target shift based on Wasserstein barycenter algorithm. 
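Unlike the single-source estimators above, this class takes a list of source datasets. A hedged usage sketch (illustrative only, not part of the patch; assumes two toy source domains with labels in {1, 2} and an unlabelled target):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    Xs1, ys1 = rng.randn(15, 2), rng.randint(1, 3, 15)          # first source domain
    Xs2, ys2 = rng.randn(25, 2) + 1.0, rng.randint(1, 3, 25)    # second source domain
    Xt = rng.randn(30, 2) + 0.5                                 # target domain

    jcpot = ot.da.JCPOTTransport(reg_e=1.0, max_iter=100)
    jcpot.fit(Xs=[Xs1, Xs2], ys=[ys1, ys2], Xt=Xt)
    print(jcpot.proportions_)    # estimated class proportions in the target domain
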
Parameters ---------- @@ -2266,7 +2316,7 @@ class JCPOTTransport(BaseTransport): Entropic regularization parameter max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged tol : float, optional (default=10e-9) Stop threshold on error (inner sinkhorn solver) (>0) verbose : bool, optional (default=False) @@ -2283,7 +2333,7 @@ class JCPOTTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. Attributes ---------- @@ -2292,11 +2342,12 @@ class JCPOTTransport(BaseTransport): proportions_ : array-like, shape (n_classes,) Estimated class proportions in the target domain log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-jcpottransport: References ---------- - .. [1] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia "Optimal transport for multi-source domain adaptation under target shift", International Conference on Artificial Intelligence and Statistics (AISTATS), @@ -2323,7 +2374,7 @@ class JCPOTTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Building coupling matrices from a list of source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -2334,8 +2385,8 @@ class JCPOTTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2368,7 +2419,7 @@ class JCPOTTransport(BaseTransport): return self def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -2379,8 +2430,8 @@ class JCPOTTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2440,7 +2491,8 @@ class JCPOTTransport(BaseTransport): return transp_Xs def transform_labels(self, ys=None): - """Propagate source labels ys to obtain target labels as in [27] + """Propagate source labels :math:`\mathbf{y_s}` to obtain target labels as in + :ref:`[27] ` Parameters ---------- @@ -2451,6 +2503,14 @@ class JCPOTTransport(BaseTransport): ------- yt : array-like, shape (n_target_samples, nb_classes) Estimated soft target labels. + + + .. _references-jcpottransport-transform-labels: + References + ---------- + .. 
[27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. """ # check the necessary inputs parameters are here @@ -2482,11 +2542,12 @@ class JCPOTTransport(BaseTransport): return yt.T def inverse_transform_labels(self, yt=None): - """Propagate source labels ys to obtain target labels + """Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels + :math:`\mathbf{y_s}` Parameters ---------- - yt : array-like, shape (n_source_samples,) + yt : array-like, shape (n_target_samples,) The target class labels Returns diff --git a/ot/datasets.py b/ot/datasets.py index b86ef3b..ad6390c 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -13,7 +13,7 @@ from .utils import check_random_state, deprecated def make_1D_gauss(n, m, s): - """return a 1D histogram for a gaussian distribution (n bins, mean m and std s) + """return a 1D histogram for a gaussian distribution (`n` bins, mean `m` and std `s`) Parameters ---------- @@ -26,7 +26,7 @@ def make_1D_gauss(n, m, s): Returns ------- - h : ndarray (n,) + h : ndarray (`n`,) 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) @@ -41,7 +41,7 @@ def get_1D_gauss(n, m, sigma): def make_2D_samples_gauss(n, m, sigma, random_state=None): - """Return n samples drawn from 2D gaussian N(m,sigma) + """Return `n` samples drawn from 2D gaussian :math:`\mathcal{N}(m, \sigma)` Parameters ---------- @@ -59,8 +59,8 @@ def make_2D_samples_gauss(n, m, sigma, random_state=None): Returns ------- - X : ndarray, shape (n, 2) - n samples drawn from N(m, sigma). + X : ndarray, shape (`n`, 2) + n samples drawn from :math:`\mathcal{N}(m, \sigma)`. """ generator = check_random_state(random_state) @@ -102,7 +102,7 @@ def make_data_classif(dataset, n, nz=.5, theta=0, p=.5, random_state=None, **kwa Returns ------- X : ndarray, shape (n, d) - n observation of size d + `n` observation of size `d` y : ndarray, shape (n,) labels of the samples. """ diff --git a/ot/dr.py b/ot/dr.py index 7469270..c2f51f8 100644 --- a/ot/dr.py +++ b/ot/dr.py @@ -22,7 +22,7 @@ from pymanopt.solvers import SteepestDescent, TrustRegions def dist(x1, x2): - """ Compute squared euclidean distance between samples (autograd) + r""" Compute squared euclidean distance between samples (autograd) """ x1p2 = np.sum(np.square(x1), 1) x2p2 = np.sum(np.square(x2), 1) @@ -30,7 +30,7 @@ def dist(x1, x2): def sinkhorn(w1, w2, M, reg, k): - """Sinkhorn algorithm with fixed number of iteration (autograd) + r"""Sinkhorn algorithm with fixed number of iteration (autograd) """ K = np.exp(-M / reg) ui = np.ones((M.shape[0],)) @@ -43,14 +43,14 @@ def sinkhorn(w1, w2, M, reg, k): def split_classes(X, y): - """split samples in X by classes in y + r"""split samples in :math:`\mathbf{X}` by classes in :math:`\mathbf{y}` """ lstsclass = np.unique(y) return [X[y == i, :].astype(np.float32) for i in lstsclass] def fda(X, y, p=2, reg=1e-16): - """Fisher Discriminant Analysis + r"""Fisher Discriminant Analysis Parameters ---------- @@ -111,18 +111,19 @@ def fda(X, y, p=2, reg=1e-16): def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, normalize=False): r""" - Wasserstein Discriminant Analysis [11]_ + Wasserstein Discriminant Analysis :ref:`[11] ` The function solves the following optimization problem: .. 
math:: - P = \\text{arg}\min_P \\frac{\\sum_i W(PX^i,PX^i)}{\\sum_{i,j\\neq i} W(PX^i,PX^j)} + \mathbf{P} = \mathop{\arg \min}_\mathbf{P} \quad + \frac{\sum\limits_i W(P \mathbf{X}^i, P \mathbf{X}^i)}{\sum\limits_{i, j \neq i} W(P \mathbf{X}^i, P \mathbf{X}^j)} where : - - :math:`P` is a linear projection operator in the Stiefel(p,d) manifold + - :math:`P` is a linear projection operator in the Stiefel(`p`, `d`) manifold - :math:`W` is entropic regularized Wasserstein distances - - :math:`X^i` are samples in the dataset corresponding to class i + - :math:`\mathbf{X}^i` are samples in the dataset corresponding to class i Parameters ---------- @@ -140,7 +141,7 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, no P0 : ndarray, shape (d, p) Initial starting point for projection. normalize : bool, optional - Normalise the Wasserstaiun distane by the average distance on P0 (default : False) + Normalise the Wasserstaiun distance by the average distance on P0 (default : False) verbose : int, optional Print information along iterations. @@ -151,6 +152,8 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, no proj : callable Projection function including mean centering. + + .. _references-wda: References ---------- .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). @@ -217,27 +220,28 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, no def projection_robust_wasserstein(X, Y, a, b, tau, U0=None, reg=0.1, k=2, stopThr=1e-3, maxiter=100, verbose=0): r""" - Projection Robust Wasserstein Distance [32] + Projection Robust Wasserstein Distance :ref:`[32] ` The function solves the following optimization problem: .. math:: - \max_{U \in St(d, k)} \min_{\pi \in \Pi(\mu,\nu)} \sum_{i,j} \pi_{i,j} \|U^T(x_i - y_j)\|^2 - reg * H(\pi) + \max_{U \in St(d, k)} \ \min_{\pi \in \Pi(\mu,\nu)} \quad \sum_{i,j} \pi_{i,j} + \|U^T(\mathbf{x}_i - \mathbf{y}_j)\|^2 - \mathrm{reg} \cdot H(\pi) - - :math:`U` is a linear projection operator in the Stiefel(d, k) manifold + - :math:`U` is a linear projection operator in the Stiefel(`d`, `k`) manifold - :math:`H(\pi)` is entropy regularizer - - :math:`x_i`, :math:`y_j` are samples of measures \mu and \nu respectively + - :math:`\mathbf{x}_i`, :math:`\mathbf{y}_j` are samples of measures :math:`\mu` and :math:`\nu` respectively Parameters ---------- X : ndarray, shape (n, d) - Samples from measure \mu + Samples from measure :math:`\mu` Y : ndarray, shape (n, d) - Samples from measure \nu + Samples from measure :math:`\nu` a : ndarray, shape (n, ) - weights for measure \mu + weights for measure :math:`\mu` b : ndarray, shape (n, ) - weights for measure \nu + weights for measure :math:`\nu` tau : float stepsize for Riemannian Gradient Descent U0 : ndarray, shape (d, p) @@ -258,6 +262,8 @@ def projection_robust_wasserstein(X, Y, a, b, tau, U0=None, reg=0.1, k=2, stopTh U : ndarray, shape (d, k) Projection operator. + + .. _references-projection-robust-wasserstein: References ---------- .. [32] Huang, M. , Ma S. & Lai L. (2021). diff --git a/ot/gromov.py b/ot/gromov.py index a0fbf48..465693d 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -327,7 +327,8 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs The function solves the following optimization problem: .. 
math:: - \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} Where : @@ -410,7 +411,8 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg The function solves the following optimization problem: .. math:: - GW = \min_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} Where : @@ -487,8 +489,8 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, Computes the FGW transport between two graphs (see :ref:`[24] `) .. math:: - \gamma = \mathop{\arg \min}_\gamma (1 - \alpha) <\gamma, \mathbf{M}>_F + \alpha \sum_{i,j,k,l} - L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \gamma = \mathop{\arg \min}_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} @@ -569,7 +571,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 Computes the FGW distance between two graphs see (see :ref:`[24] `) .. math:: - \min_\gamma (1 - \alpha) <\gamma, \mathbf{M}>_F + \alpha \sum_{i,j,k,l} + \min_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} @@ -591,9 +593,9 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 M : array-like, shape (ns, nt) Metric cost matrix between features across domains C1 : array-like, shape (ns, ns) - Metric cost matrix respresentative of the structure in the source space. + Metric cost matrix representative of the structure in the source space. C2 : array-like, shape (nt, nt) - Metric cost matrix espresentative of the structure in the target space. + Metric cost matrix representative of the structure in the target space. p : array-like, shape (ns,) Distribution in the source space. q : array-like, shape (nt,) @@ -612,8 +614,8 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 Returns ------- - gamma : array-like, shape (ns, nt) - Optimal transportation matrix for the given parameters. + fgw-distance : float + Fused gromov wasserstein distance for the given parameters. log : dict Log dictionary return only if log==True in parameters. @@ -780,7 +782,8 @@ def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun, The function solves the following optimization problem: .. math:: - \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} @@ -901,7 +904,8 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun, The function solves the following optimization problem: .. 
math:: - \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} @@ -1052,7 +1056,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, The function solves the following optimization problem: .. math:: - \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) + \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} @@ -1157,7 +1161,8 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, The function solves the following optimization problem: .. math:: - GW = \min_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) + GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) + \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) Where : @@ -1223,7 +1228,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, .. math:: - \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) + \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) Where : @@ -1336,7 +1341,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, .. math:: - \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) + \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) Where : diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 2c18a88..5da897d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -62,7 +62,7 @@ def center_ot_dual(alpha0, beta0, a=None, b=None): is the following: .. math:: - \alpha^T a= \beta^T b + \alpha^T \mathbf{a} = \beta^T \mathbf{b} in addition to the OT problem constraints. @@ -70,11 +70,11 @@ def center_ot_dual(alpha0, beta0, a=None, b=None): a constant from both :math:`\alpha_0` and :math:`\beta_0`. .. math:: - c=\frac{\beta0^T b-\alpha_0^T a}{1^Tb+1^Ta} + c &= \frac{\beta_0^T \mathbf{b} - \alpha_0^T \mathbf{a}}{\mathbf{1}^T \mathbf{b} + \mathbf{1}^T \mathbf{a}} - \alpha=\alpha_0+c + \alpha &= \alpha_0 + c - \beta=\beta0+c + \beta &= \beta_0 + c Parameters ---------- @@ -117,7 +117,7 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): The feasible values are computed efficiently but rather coarsely. .. warning:: - This function is necessary because the C++ solver in emd_c + This function is necessary because the C++ solver in `emd_c` discards all samples in the distributions with zeros weights. This means that while the primal variable (transport matrix) is exact, the solver only returns feasible dual potentials @@ -126,26 +126,26 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): First we compute the constraints violations: .. 
math:: - V=\alpha+\beta^T-M + \mathbf{V} = \alpha + \beta^T - \mathbf{M} - Next we compute the max amount of violation per row (alpha) and - columns (beta) + Next we compute the max amount of violation per row (:math:`\alpha`) and + columns (:math:`beta`) .. math:: - v^a_i=\max_j V_{i,j} + \mathbf{v^a}_i = \max_j \mathbf{V}_{i,j} - v^b_j=\max_i V_{i,j} + \mathbf{v^b}_j = \max_i \mathbf{V}_{i,j} Finally we update the dual potential with 0 weights if a constraint is violated .. math:: - \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0 + \alpha_i = \alpha_i - \mathbf{v^a}_i \quad \text{ if } \mathbf{a}_i=0 \text{ and } \mathbf{v^a}_i>0 - \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 + \beta_j = \beta_j - \mathbf{v^b}_j \quad \text{ if } \mathbf{b}_j=0 \text{ and } \mathbf{v^b}_j > 0 In the end the dual potentials are centered using function - :ref:`center_ot_dual`. + :py:func:`ot.lp.center_ot_dual`. Note that all those updates do not change the objective value of the solution but provide dual potentials that do not violate the constraints. @@ -201,26 +201,28 @@ def emd(a, b, M, numItermax=100000, log=False, center_dual=True, numThreads=1): r"""Solves the Earth Movers distance problem and returns the OT matrix - .. math:: \gamma = arg\min_\gamma <\gamma,M>_F + .. math:: + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + s.t. \ \gamma \mathbf{1} = \mathbf{a} - s.t. \gamma 1 = a + \gamma^T \mathbf{1} = \mathbf{b} - \gamma^T 1= b + \gamma \geq 0 - \gamma\geq 0 where : - - M is the metric cost matrix - - a and b are the sample weights + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights - .. warning:: Note that the M matrix in numpy needs to be a C-order + .. warning:: Note that the :math:`\mathbf{M}` matrix in numpy needs to be a C-order numpy.array in float64 format. It will be converted if not in this format .. note:: This function is backend-compatible and will work on arrays from all compatible backends. - Uses the algorithm proposed in [1]_ + Uses the algorithm proposed in :ref:`[1] `. Parameters ---------- @@ -267,17 +269,19 @@ def emd(a, b, M, numItermax=100000, log=False, center_dual=True, numThreads=1): array([[0.5, 0. ], [0. , 0.5]]) + + .. _references-emd: References ---------- - .. [1] Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W. (2011, December). Displacement interpolation using Lagrangian mass transport. In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p. 158). ACM. See Also -------- - ot.bregman.sinkhorn : Entropic regularized OT ot.optim.cg : General - regularized OT""" + ot.bregman.sinkhorn : Entropic regularized OT + ot.optim.cg : General regularized OT + """ # convert to numpy if list a, b, M = list_to_array(a, b, M) @@ -340,22 +344,23 @@ def emd2(a, b, M, processes=1, r"""Solves the Earth Movers distance problem and returns the loss .. math:: - \min_\gamma <\gamma,M>_F + \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} + + \gamma \geq 0 - \gamma\geq 0 where : - - M is the metric cost matrix - - a and b are the sample weights + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights .. note:: This function is backend-compatible and will work on arrays from all compatible backends. 
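As a quick orientation on the two entry points documented here, `ot.emd` returns the optimal plan while `ot.emd2` returns the corresponding loss; on the toy problem used in the docstring examples (uniform histograms, zero-diagonal cost) the plan is diagonal and the loss is zero. Sketch (illustrative only, not part of the patch):

    import numpy as np
    import ot

    a = np.array([0.5, 0.5])               # source histogram
    b = np.array([0.5, 0.5])               # target histogram
    M = np.array([[0.0, 1.0],
                  [1.0, 0.0]])             # ground cost matrix

    G = ot.emd(a, b, M)                    # optimal coupling: [[0.5, 0.], [0., 0.5]]
    loss = ot.emd2(a, b, M)                # transport cost <G, M>_F = 0.0
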
- Uses the algorithm proposed in [1]_ + Uses the algorithm proposed in :ref:`[1] `. Parameters ---------- @@ -405,9 +410,10 @@ def emd2(a, b, M, processes=1, >>> ot.emd2(a,b,M) 0.0 + + .. _references-emd2: References ---------- - .. [1] Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W. (2011, December). Displacement interpolation using Lagrangian mass transport. In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p. @@ -416,7 +422,8 @@ def emd2(a, b, M, processes=1, See Also -------- ot.bregman.sinkhorn : Entropic regularized OT - ot.optim.cg : General regularized OT""" + ot.optim.cg : General regularized OT + """ a, b, M = list_to_array(a, b, M) @@ -508,29 +515,35 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance), formally: .. math:: - \min_X \sum_{i=1}^N w_i W_2^2(b, X, a_i, X_i) + \min_\mathbf{X} \quad \sum_{i=1}^N w_i W_2^2(\mathbf{b}, \mathbf{X}, \mathbf{a}_i, \mathbf{X}_i) where : - :math:`w \in \mathbb{(0, 1)}^{N}`'s are the barycenter weights and sum to one - - the :math:`a_i \in \mathbb{R}^{k_i}` are the empirical measures weights and sum to one for each :math:`i` - - the :math:`X_i \in \mathbb{R}^{k_i, d}` are the empirical measures atoms locations - - :math:`b \in \mathbb{R}^{k}` is the desired weights vector of the barycenter + - the :math:`\mathbf{a}_i \in \mathbb{R}^{k_i}` are the empirical measures weights and sum to one for each :math:`i` + - the :math:`\mathbf{X}_i \in \mathbb{R}^{k_i, d}` are the empirical measures atoms locations + - :math:`\mathbf{b} \in \mathbb{R}^{k}` is the desired weights vector of the barycenter - This problem is considered in [1] (Algorithm 2). There are two differences with the following codes: + This problem is considered in :ref:`[1] ` (Algorithm 2). + There are two differences with the following codes: - we do not optimize over the weights - - we do not do line search for the locations updates, we use i.e. theta = 1 in [1] (Algorithm 2). This can be seen as a discrete implementation of the fixed-point algorithm of [2] proposed in the continuous setting. + - we do not do line search for the locations updates, we use i.e. :math:`\theta = 1` in + :ref:`[1] ` (Algorithm 2). This can be seen as a discrete + implementation of the fixed-point algorithm of + :ref:`[2] ` proposed in the continuous setting. 
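A hedged usage sketch of this free-support barycenter solver (illustrative only, not part of the patch; assumes two toy empirical measures in the plane and a barycenter supported on k = 5 free atoms):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    X1 = rng.randn(7, 2)                   # atoms of the first empirical measure
    X2 = rng.randn(9, 2) + 3.0             # atoms of the second empirical measure
    a1 = np.ones(7) / 7                    # uniform weights, sum to one
    a2 = np.ones(9) / 9

    X_init = rng.randn(5, 2)               # initial support of the barycenter (k = 5)
    X_bary = ot.lp.free_support_barycenter([X1, X2], [a1, a2], X_init)
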
Parameters ---------- measures_locations : list of N (k_i,d) numpy.ndarray - The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list) + The discrete support of a measure supported on :math:`k_i` locations of a `d`-dimensional space + (:math:`k_i` can be different for each element of the list) measures_weights : list of N (k_i,) numpy.ndarray - Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure + Numpy arrays where each numpy array has :math:`k_i` non-negatives values summing to one + representing the weights of each discrete input measure X_init : (k,d) np.ndarray - Initialization of the support locations (on k atoms) of the barycenter + Initialization of the support locations (on `k` atoms) of the barycenter b : (k,) np.ndarray Initialization of the weights of the barycenter (non-negatives, sum to 1) weights : (N,) np.ndarray @@ -554,9 +567,10 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None X : (k,d) np.ndarray Support locations (on k atoms) of the barycenter + + .. _references-free-support-barycenter: References ---------- - .. [1] Cuturi, Marco, and Arnaud Doucet. "Fast computation of Wasserstein barycenters." International Conference on Machine Learning. 2014. .. [2] Álvarez-Esteban, Pedro C., et al. "A fixed-point approach to barycenters in Wasserstein space." Journal of Mathematical Analysis and Applications 441.2 (2016): 744-762. diff --git a/ot/optim.py b/ot/optim.py index 6456c03..cc286b6 100644 --- a/ot/optim.py +++ b/ot/optim.py @@ -162,7 +162,8 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, numItermaxEmd=100000, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot f(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot f(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -309,7 +310,8 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg_1}\cdot\Omega(\gamma) + \mathrm{reg_2}\cdot f(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg_1}\cdot\Omega(\gamma) + \mathrm{reg_2}\cdot f(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -452,7 +454,7 @@ def solve_1d_linesearch_quad(a, b, c): .. math:: - \mathop{\arg \min}_{0 \leq x \leq 1} f(x) = ax^{2} + bx + c + \mathop{\arg \min}_{0 \leq x \leq 1} \quad f(x) = ax^{2} + bx + c Parameters ---------- diff --git a/ot/partial.py b/ot/partial.py index 814d779..b7093e4 100755 --- a/ot/partial.py +++ b/ot/partial.py @@ -20,13 +20,16 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, The function considers the following problem: .. math:: - \gamma = \arg\min_\gamma <\gamma,(M-\lambda)>_F + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, (\mathbf{M} - \lambda) \rangle_F - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + .. math:: + s.t. 
\ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} or equivalently (see Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. @@ -34,33 +37,32 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, metrics. Foundations of Computational Mathematics, 18(1), 1-44.) .. math:: - \gamma = \arg\min_\gamma <\gamma,M>_F + \sqrt(\lambda/2) - (\|\gamma 1 - a\|_1 + \|\gamma^T 1 - b\|_1) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \sqrt{\frac{\lambda}{2} (\|\gamma \mathbf{1} - \mathbf{a}\|_1 + \|\gamma^T \mathbf{1} - \mathbf{b}\|_1)} - s.t. - \gamma\geq 0 \\ + s.t. \ \gamma \geq 0 where : - - M is the metric cost matrix - - a and b are source and target unbalanced distributions - - :math:`\lambda` is the lagragian cost. Tuning its value allows attaining - a given mass to be transported m + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions + - :math:`\lambda` is the lagrangian cost. Tuning its value allows attaining + a given mass to be transported `m` - The formulation of the problem has been proposed in [28]_ + The formulation of the problem has been proposed in :ref:`[28] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix for the quadratic cost reg_m : float, optional - Lagragian cost + Lagrangian cost nb_dummies : int, optional, default:1 number of reservoir points to be added (to avoid numerical instabilities, increase its value if an error is raised) @@ -69,6 +71,7 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, **kwargs : dict parameters can be directly passed to the emd solver + .. warning:: When dealing with a large number of points, the EMD solver may face some instabilities, especially when the mass associated to the dummy @@ -77,7 +80,7 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, Returns ------- - gamma : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -97,9 +100,10 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, array([[0.1, 0. ], [0. , 0. ]]) + + .. _references-partial-wasserstein-lagrange: References ---------- - .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in optimal transport and Monge-Ampere obstacle problems. Annals of mathematics, 673-730. @@ -162,27 +166,30 @@ def partial_wasserstein(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): The function considers the following problem: .. math:: - \gamma = \arg\min_\gamma <\gamma,M>_F + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 - s.t. 
- \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - a and b are source and target unbalanced distributions - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions + - `m` is the amount of mass to be transported Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix for the quadratic cost m : float, optional @@ -205,7 +212,7 @@ def partial_wasserstein(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): Returns ------- - :math:`gamma` : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -278,27 +285,30 @@ def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): The function considers the following problem: .. math:: - \gamma = \arg\min_\gamma <\gamma,M>_F + \gamma = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - a and b are source and target unbalanced distributions - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions + - `m` is the amount of mass to be transported Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix for the quadratic cost m : float, optional @@ -321,8 +331,8 @@ def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): Returns ------- - :math:`gamma` : (dim_a x dim_b) ndarray - Optimal transportation matrix for the given parameters + GW: float + partial GW discrepancy log : dict log dictionary returned only if `log` is `True` @@ -360,8 +370,8 @@ def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): def gwgrad_partial(C1, C2, T): - """Compute the GW gradient. Note: we can not use the trick in [12]_ as - the marginals may not sum to 1. + """Compute the GW gradient. Note: we can not use the trick in :ref:`[12] ` + as the marginals may not sum to 1. Parameters ---------- @@ -379,6 +389,8 @@ def gwgrad_partial(C1, C2, T): numpy.array of shape (n_p+nb_dummies, n_u) gradient + + .. _references-gwgrad-partial: References ---------- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, @@ -425,22 +437,25 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, The function considers the following problem: .. 
math:: - \gamma = arg\min_\gamma <\gamma,M>_F + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} - s.t. \gamma 1 \leq a \\ - \gamma^T 1 \leq b \\ - \gamma\geq 0 \\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + \gamma &\geq 0 + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma) - =\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are the sample weights - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma) = \sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights + - `m` is the amount of mass to be transported - The formulation of the problem has been proposed in [29]_ + The formulation of the problem has been proposed in :ref:`[29] ` Parameters @@ -454,7 +469,7 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, q : ndarray, shape (nt,) Distribution in the target space m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) nb_dummies : int, optional Number of dummy points to add (avoid instabilities in the EMD solver) G0 : ndarray, shape (ns, nt), optional @@ -476,7 +491,7 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, Returns ------- - gamma : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -503,6 +518,8 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, [0. , 0. , 0.25, 0. ], [0. , 0. , 0. , 0. ]]) + + .. _references-partial-gromov-wasserstein: References ---------- .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal @@ -597,22 +614,25 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, The function considers the following problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + GW = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 - s.t. 
\gamma 1 \leq a \\ - \gamma^T 1 \leq b \\ - \gamma\geq 0 \\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are the sample weights - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma) = \sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights + - `m` is the amount of mass to be transported - The formulation of the problem has been proposed in [29]_ + The formulation of the problem has been proposed in :ref:`[29] ` Parameters @@ -626,7 +646,7 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, q : ndarray, shape (nt,) Distribution in the target space m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) nb_dummies : int, optional Number of dummy points to add (avoid instabilities in the EMD solver) G0 : ndarray, shape (ns, nt), optional @@ -655,7 +675,7 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, Returns ------- - partial_gw_dist : (dim_a x dim_b) ndarray + partial_gw_dist : float partial GW discrepancy log : dict log dictionary returned only if `log` is `True` @@ -676,6 +696,8 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, >>> np.round(partial_gromov_wasserstein2(C1, C2, a, b, m=0.25),2) 0.0 + + .. _references-partial-gromov-wasserstein2: References ---------- .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal @@ -706,30 +728,29 @@ def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, The function considers the following problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 \leq a \\ - \gamma^T 1 \leq b \\ - \gamma\geq 0 \\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + s.t. \gamma \mathbf{1} &\leq \mathbf{a} \\ + \gamma^T \mathbf{1} &\leq \mathbf{b} \\ + \gamma &\geq 0 \\ + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} \\ where : - - M is the metric cost matrix - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are the sample weights - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights + - `m` is the amount of mass to be transported - The formulation of the problem has been proposed in [3]_ (prop. 5) + The formulation of the problem has been proposed in :ref:`[3] ` (prop. 
5) Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix reg : float @@ -748,7 +769,7 @@ def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, Returns ------- - gamma : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -764,6 +785,8 @@ def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, array([[0.06, 0.02], [0.01, 0. ]]) + + .. _references-entropic-partial-wasserstein: References ---------- .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. @@ -838,32 +861,34 @@ def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, numItermax=1000, tol=1e-7, log=False, verbose=False): r""" - Returns the partial Gromov-Wasserstein transport between (C1,p) and (C2,q) + Returns the partial Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: - GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot - \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_{\gamma} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + \mathrm{reg} \cdot\Omega(\gamma) + + .. math:: + s.t. \ \gamma &\geq 0 + + \gamma \mathbf{1} &\leq \mathbf{a} - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - C1 is the metric cost matrix in the source space - - C2 is the metric cost matrix in the target space - - p and q are the sample weights - - L : quadratic loss function - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - m is the amount of mass to be transported + - :math:`\mathbf{C_1}` is the metric cost matrix in the source space + - :math:`\mathbf{C_2}` is the metric cost matrix in the target space + - :math:`\mathbf{p}` and :math:`\mathbf{q}` are the sample weights + - `L`: quadratic loss function + - :math:`\Omega` is the entropic regularization term, :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - `m` is the amount of mass to be transported - The formulation of the GW problem has been proposed in [12]_ and the - partial GW in [29]_. 
+ The formulation of the GW problem has been proposed in :ref:`[12] ` and the partial GW in :ref:`[29] ` Parameters ---------- @@ -878,7 +903,7 @@ def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, reg: float entropic regularization parameter m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) G0 : ndarray, shape (ns, nt), optional Initialisation of the transportation matrix numItermax : int, optional @@ -913,17 +938,20 @@ def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, Returns ------- - :math: `gamma` : (dim_a x dim_b) ndarray + :math: `gamma` : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` + + .. _references-entropic-partial-gromov-wassertein: References ---------- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. - .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal + + .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal Transport with Applications on Positive-Unlabeled Learning". NeurIPS. @@ -977,33 +1005,33 @@ def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, numItermax=1000, tol=1e-7, log=False, verbose=False): r""" - Returns the partial Gromov-Wasserstein discrepancy between (C1,p) and - (C2,q) + Returns the partial Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: - GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot - \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + GW = \min_{\gamma} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + \mathrm{reg} \cdot\Omega(\gamma) + + .. math:: + s.t. \ \gamma &\geq 0 + + \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - C1 is the metric cost matrix in the source space - - C2 is the metric cost matrix in the target space - - p and q are the sample weights - - L : quadratic loss function - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - m is the amount of mass to be transported + - :math:`\mathbf{C_1}` is the metric cost matrix in the source space + - :math:`\mathbf{C_2}` is the metric cost matrix in the target space + - :math:`\mathbf{p}` and :math:`\mathbf{q}` are the sample weights + - `L` : quadratic loss function + - :math:`\Omega` is the entropic regularization term, :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - `m` is the amount of mass to be transported - The formulation of the GW problem has been proposed in [12]_ and the - partial GW in [29]_. 
+ The formulation of the GW problem has been proposed in :ref:`[12] ` and the partial GW in :ref:`[29] ` Parameters @@ -1019,7 +1047,7 @@ def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, reg: float entropic regularization parameter m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) G0 : ndarray, shape (ns, nt), optional Initialisation of the transportation matrix numItermax : int, optional @@ -1052,11 +1080,14 @@ def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, >>> np.round(entropic_partial_gromov_wasserstein2(C1, C2, a, b,50), 2) 1.87 + + .. _references-entropic-partial-gromov-wassertein2: References ---------- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. + .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal Transport with Applications on Positive-Unlabeled Learning". NeurIPS. diff --git a/ot/plot.py b/ot/plot.py index ad436b4..3e3bed7 100644 --- a/ot/plot.py +++ b/ot/plot.py @@ -18,10 +18,10 @@ from matplotlib import gridspec def plot1D_mat(a, b, M, title=''): - """ Plot matrix M with the source and target 1D distribution + """ Plot matrix :math:`\mathbf{M}` with the source and target 1D distribution - Creates a subplot with the source distribution a on the left and - target distribution b on the tot. The matrix M is shown in between. + Creates a subplot with the source distribution :math:`\mathbf{a}` on the left and + target distribution :math:`\mathbf{b}` on the top. The matrix :math:`\mathbf{M}` is shown in between. Parameters @@ -61,10 +61,10 @@ def plot1D_mat(a, b, M, title=''): def plot2D_samples_mat(xs, xt, G, thr=1e-8, **kwargs): - """ Plot matrix M in 2D with lines using alpha values + """ Plot matrix :math:`\mathbf{G}` in 2D with lines using alpha values Plot lines between source and target 2D samples with a color - proportional to the value of the matrix G between samples. + proportional to the value of the matrix :math:`\mathbf{G}` between samples. Parameters diff --git a/ot/sliced.py b/ot/sliced.py index d3dc3f2..7c09111 100644 --- a/ot/sliced.py +++ b/ot/sliced.py @@ -17,7 +17,7 @@ from .utils import list_to_array def get_random_projections(d, n_projections, seed=None, backend=None, type_as=None): r""" - Generates n_projections samples from the uniform on the unit sphere of dimension d-1: :math:`\mathcal{U}(\mathcal{S}^{d-1})` + Generates n_projections samples from the uniform on the unit sphere of dimension :math:`d-1`: :math:`\mathcal{U}(\mathcal{S}^{d-1})` Parameters ---------- @@ -67,11 +67,12 @@ def sliced_wasserstein_distance(X_s, X_t, a=None, b=None, n_projections=50, p=2, Computes a Monte-Carlo approximation of the p-Sliced Wasserstein distance .. 
math:: - \mathcal{SWD}_p(\mu, \nu) = \underset{\theta \sim \mathcal{U}(\mathbb{S}^{d-1})}{\mathbb{E}}[\mathcal{W}_p^p(\theta_\# \mu, \theta_\# \nu)]^{\frac{1}{p}} + \mathcal{SWD}_p(\mu, \nu) = \underset{\theta \sim \mathcal{U}(\mathbb{S}^{d-1})}{\mathbb{E}}\left(\mathcal{W}_p^p(\theta_\# \mu, \theta_\# \nu)\right)^{\frac{1}{p}} + where : - - :math:`\theta_\# \mu` stands for the pushforwars of the projection :math:`\mathbb{R}^d \ni X \mapsto \langle \theta, X \rangle` + - :math:`\theta_\# \mu` stands for the pushforwards of the projection :math:`X \in \mathbb{R}^d \mapsto \langle \theta, X \rangle` Parameters diff --git a/ot/smooth.py b/ot/smooth.py index ea26bae..6855005 100644 --- a/ot/smooth.py +++ b/ot/smooth.py @@ -47,15 +47,24 @@ from scipy.optimize import minimize def projection_simplex(V, z=1, axis=None): - """ Projection of x onto the simplex, scaled by z + r""" Projection of :math:`\mathbf{V}` onto the simplex, scaled by `z` - P(x; z) = argmin_{y >= 0, sum(y) = z} ||y - x||^2 + .. math:: + P\left(\mathbf{V}, z\right) = \mathop{\arg \min}_{\substack{\mathbf{y} >= 0 \\ \sum_i \mathbf{y}_i = z}} \quad \|\mathbf{y} - \mathbf{V}\|^2 + + Parameters + ---------- + V: ndarray, rank 2 z: float or array - If array, len(z) must be compatible with V + If array, len(z) must be compatible with :math:`\mathbf{V}` axis: None or int - - axis=None: project V by P(V.ravel(); z) - - axis=1: project each V[i] by P(V[i]; z[i]) - - axis=0: project each V[:, j] by P(V[:, j]; z[j]) + - axis=None: project :math:`\mathbf{V}` by :math:`P(\mathbf{V}.\mathrm{ravel}(), z)` + - axis=1: project each :math:`\mathbf{V}_i` by :math:`P(\mathbf{V}_i, z_i)` + - axis=0: project each :math:`\mathbf{V}_{:, j}` by :math:`P(\mathbf{V}_{:, j}, z_j)` + + Returns + ------- + projection: ndarray, shape :math:`\mathbf{V}`.shape """ if axis == 1: n_features = V.shape[1] @@ -77,12 +86,12 @@ def projection_simplex(V, z=1, axis=None): class Regularization(object): - """Base class for Regularization objects + r"""Base class for Regularization objects Notes ----- - This class is not intended for direct use but as aparent for true - regularizatiojn implementation. + This class is not intended for direct use but as apparent for true + regularization implementation. """ def __init__(self, gamma=1.0): @@ -98,40 +107,48 @@ class Regularization(object): self.gamma = gamma def delta_Omega(X): - """ - Compute delta_Omega(X[:, j]) for each X[:, j]. - delta_Omega(x) = sup_{y >= 0} y^T x - Omega(y). + r""" + Compute :math:`\delta_\Omega(\mathbf{X}_{:, j})` for each :math:`\mathbf{X}_{:, j}`. + + .. math:: + \delta_\Omega(\mathbf{x}) = \sup_{\mathbf{y} >= 0} \ + \mathbf{y}^T \mathbf{x} - \Omega(\mathbf{y}) Parameters ---------- - X: array, shape = len(a) x len(b) + X: array, shape = (len(a), len(b)) Input array. Returns ------- - v: array, len(b) - Values: v[j] = delta_Omega(X[:, j]) - G: array, len(a) x len(b) - Gradients: G[:, j] = nabla delta_Omega(X[:, j]) + v: array, (len(b), ) + Values: :math:`\mathbf{v}_j = \delta_\Omega(\mathbf{X}_{:, j})` + G: array, (len(a), len(b)) + Gradients: :math:`\mathbf{G}_{:, j} = \nabla \delta_\Omega(\mathbf{X}_{:, j})` """ raise NotImplementedError def max_Omega(X, b): - """ - Compute max_Omega_j(X[:, j]) for each X[:, j]. - max_Omega_j(x) = sup_{y >= 0, sum(y) = 1} y^T x - Omega(b[j] y) / b[j]. + r""" + Compute :math:`\mathrm{max}_{\Omega, j}(\mathbf{X}_{:, j})` for each :math:`\mathbf{X}_{:, j}`. + + .. 
math:: + \mathrm{max}_{\Omega, j}(\mathbf{x}) = + \sup_{\substack{\mathbf{y} >= 0 \ \sum_i \mathbf{y}_i = 1}} + \mathbf{y}^T \mathbf{x} - \frac{1}{\mathbf{b}_j} \Omega(\mathbf{b}_j \mathbf{y}) Parameters ---------- - X: array, shape = len(a) x len(b) + X: array, shape = (len(a), len(b)) Input array. + b: array, shape = (len(b), ) Returns ------- - v: array, len(b) - Values: v[j] = max_Omega_j(X[:, j]) - G: array, len(a) x len(b) - Gradients: G[:, j] = nabla max_Omega_j(X[:, j]) + v: array, (len(b), ) + Values: :math:`\mathbf{v}_j = \mathrm{max}_{\Omega, j}(\mathbf{X}_{:, j})` + G: array, (len(a), len(b)) + Gradients: :math:`\mathbf{G}_{:, j} = \nabla \mathrm{max}_{\Omega, j}(\mathbf{X}_{:, j})` """ raise NotImplementedError @@ -192,7 +209,7 @@ class SquaredL2(Regularization): def dual_obj_grad(alpha, beta, a, b, C, regul): - """ + r""" Compute objective value and gradients of dual objective. Parameters @@ -203,19 +220,19 @@ def dual_obj_grad(alpha, beta, a, b, C, regul): a: array, shape = len(a) b: array, shape = len(b) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. Returns ------- obj: float Objective value (higher is better). grad_alpha: array, shape = len(a) - Gradient w.r.t. alpha. + Gradient w.r.t. `alpha`. grad_beta: array, shape = len(b) - Gradient w.r.t. beta. + Gradient w.r.t. `beta`. """ obj = np.dot(alpha, a) + np.dot(beta, b) grad_alpha = a.copy() @@ -242,13 +259,13 @@ def solve_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Parameters ---------- - a: array, shape = len(a) - b: array, shape = len(b) + a: array, shape = (len(a), ) + b: array, shape = (len(b), ) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. method: str Solver to be used (passed to `scipy.optimize.minimize`). tol: float @@ -258,8 +275,8 @@ def solve_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Returns ------- - alpha: array, shape = len(a) - beta: array, shape = len(b) + alpha: array, shape = (len(a), ) + beta: array, shape = (len(b), ) Dual potentials. """ @@ -302,10 +319,10 @@ def semi_dual_obj_grad(alpha, a, b, C, regul): a: array, shape = len(a) b: array, shape = len(b) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a max_Omega(X) method. + Should implement a `max_Omega(X)` method. Returns ------- @@ -337,13 +354,13 @@ def solve_semi_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Parameters ---------- - a: array, shape = len(a) - b: array, shape = len(b) + a: array, shape = (len(a), ) + b: array, shape = (len(b), ) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a max_Omega(X) method. + Should implement a `max_Omega(X)` method. method: str Solver to be used (passed to `scipy.optimize.minimize`). 
tol: float @@ -353,7 +370,7 @@ def solve_semi_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Returns ------- - alpha: array, shape = len(a) + alpha: array, shape = (len(a), ) Semi-dual potentials. """ @@ -371,7 +388,7 @@ def solve_semi_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, def get_plan_from_dual(alpha, beta, C, regul): - """ + r""" Retrieve optimal transportation plan from optimal dual potentials. Parameters @@ -379,14 +396,14 @@ def get_plan_from_dual(alpha, beta, C, regul): alpha: array, shape = len(a) beta: array, shape = len(b) Optimal dual potentials. - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. Returns ------- - T: array, shape = len(a) x len(b) + T: array, shape = (len(a), len(b)) Optimal transportation plan. """ X = alpha[:, np.newaxis] + beta - C @@ -394,7 +411,7 @@ def get_plan_from_dual(alpha, beta, C, regul): def get_plan_from_semi_dual(alpha, b, C, regul): - """ + r""" Retrieve optimal transportation plan from optimal semi-dual potentials. Parameters @@ -403,14 +420,14 @@ def get_plan_from_semi_dual(alpha, b, C, regul): Optimal semi-dual potentials. b: array, shape = len(b) Second input histogram (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. Returns ------- - T: array, shape = len(a) x len(b) + T: array, shape = (len(a), len(b)) Optimal transportation plan. """ X = alpha[:, np.newaxis] - C @@ -422,19 +439,21 @@ def smooth_ot_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9, r""" Solve the regularized OT problem in the dual and return the OT matrix - The function solves the smooth relaxed dual formulation (7) in [17]_ : + The function solves the smooth relaxed dual formulation (7) in + :ref:`[17] `: .. math:: - \max_{\alpha,\beta}\quad a^T\alpha+b^T\beta-\sum_j\delta_\Omega(\alpha+\beta_j-\mathbf{m}_j) + \max_{\alpha,\beta}\quad \mathbf{a}^T\alpha + \mathbf{b}^T\beta - + \sum_j \delta_\Omega \left(\alpha+\beta_j-\mathbf{m}_j \right) where : - - :math:`\mathbf{m}_j` is the jth column of the cost matrix + - :math:`\mathbf{m}_j` is the j-th column of the cost matrix - :math:`\delta_\Omega` is the convex conjugate of the regularization term :math:`\Omega` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The OT matrix can is reconstructed from the gradient of :math:`\delta_\Omega` - (See [17]_ Proposition 1). + (See :ref:`[17] ` Proposition 1). The optimization algorithm is using gradient decent (L-BFGS by default). 
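To make the dual formulation described above concrete, here is a minimal usage sketch (an editorial illustration, not part of the patch): it calls the `ot.smooth.smooth_ot_dual` solver documented in this diff on a toy 2x2 problem; the histograms, cost matrix and regularization value are arbitrary.

    import numpy as np
    import ot

    # toy problem: two uniform histograms and a symmetric cost matrix
    a = np.array([0.5, 0.5])
    b = np.array([0.5, 0.5])
    M = np.array([[0.0, 1.0],
                  [1.0, 0.0]])

    # squared-L2 smoothed OT, solved in the dual with L-BFGS-B (the default method)
    G = ot.smooth.smooth_ot_dual(a, b, M, reg=1.0, reg_type='l2')
    # G is the (ns, nt) plan recovered from the gradient of the conjugate regularizer;
    # its marginals G.sum(axis=1) and G.sum(axis=0) approximately match a and b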
@@ -444,15 +463,19 @@ def smooth_ot_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9, samples weights in the source domain b : np.ndarray (nt,) or np.ndarray (nt,nbb) samples in the target domain, compute sinkhorn with multiple targets - and fixed M if b is a matrix (return OT loss + dual variables in log) + and fixed :math:`\mathbf{M}` if :math:`\mathbf{b}` is a matrix + (return OT loss + dual variables in log) M : np.ndarray (ns,nt) loss matrix reg : float Regularization term >0 reg_type : str - Regularization type, can be the following (default ='l2'): - - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn [2]_) - - 'l2' : Squared Euclidean regularization + Regularization type, can be the following (default ='l2'): + + - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn + :ref:`[2] `) + + - 'l2' : Squared Euclidean regularization method : str Solver to use for scipy.optimize.minimize numItermax : int, optional @@ -467,15 +490,15 @@ def smooth_ot_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-smooth-ot-dual: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 .. [17] Blondel, M., Seguy, V., & Rolet, A. (2018). Smooth and Sparse Optimal Transport. Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AISTATS). @@ -514,21 +537,23 @@ def smooth_ot_semi_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr= r""" Solve the regularized OT problem in the semi-dual and return the OT matrix - The function solves the smooth relaxed dual formulation (10) in [17]_ : + The function solves the smooth relaxed dual formulation (10) in + :ref:`[17] `: .. math:: - \max_{\alpha}\quad a^T\alpha-OT_\Omega^*(\alpha,b) + \max_{\alpha}\quad \mathbf{a}^T\alpha- \mathrm{OT}_\Omega^*(\alpha, \mathbf{b}) where : .. math:: - OT_\Omega^*(\alpha,b)=\sum_j b_j + \mathrm{OT}_\Omega^*(\alpha,b)=\sum_j \mathbf{b}_j - - :math:`\mathbf{m}_j` is the jth column of the cost matrix - - :math:`OT_\Omega^*(\alpha,b)` is defined in Eq. (9) in [17] - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{m}_j` is the j-th column of the cost matrix + - :math:`\mathrm{OT}_\Omega^*(\alpha,b)` is defined in Eq. (9) in + :ref:`[17] ` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) - The OT matrix can is reconstructed using [17]_ Proposition 2. + The OT matrix can is reconstructed using :ref:`[17] ` Proposition 2. The optimization algorithm is using gradient decent (L-BFGS by default). 
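Likewise, a minimal sketch for the semi-dual solver documented just above (editorial illustration, not part of the patch); it reuses the same toy data and the 'kl' regularization type listed in the docstring.

    import numpy as np
    import ot

    a = np.array([0.5, 0.5])
    b = np.array([0.5, 0.5])
    M = np.array([[0.0, 1.0],
                  [1.0, 0.0]])

    # negentropy-regularized OT solved in the semi-dual: only alpha is optimized,
    # and the plan is rebuilt following Proposition 2 of [17]
    G = ot.smooth.smooth_ot_semi_dual(a, b, M, reg=1.0, reg_type='kl')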
@@ -538,15 +563,19 @@ def smooth_ot_semi_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr= samples weights in the source domain b : np.ndarray (nt,) or np.ndarray (nt,nbb) samples in the target domain, compute sinkhorn with multiple targets - and fixed M if b is a matrix (return OT loss + dual variables in log) + and fixed:math:`\mathbf{M}` if :math:`\mathbf{b}` is a matrix + (return OT loss + dual variables in log) M : np.ndarray (ns,nt) loss matrix reg : float Regularization term >0 reg_type : str - Regularization type, can be the following (default ='l2'): - - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn [2]_) - - 'l2' : Squared Euclidean regularization + Regularization type, can be the following (default ='l2'): + + - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn + :ref:`[2] `) + + - 'l2' : Squared Euclidean regularization method : str Solver to use for scipy.optimize.minimize numItermax : int, optional @@ -561,15 +590,15 @@ def smooth_ot_semi_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr= Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-smooth-ot-semi-dual: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 .. [17] Blondel, M., Seguy, V., & Rolet, A. (2018). Smooth and Sparse Optimal Transport. Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AISTATS). diff --git a/ot/stochastic.py b/ot/stochastic.py index 13ed9cc..693675f 100644 --- a/ot/stochastic.py +++ b/ot/stochastic.py @@ -18,22 +18,25 @@ import numpy as np def coordinate_grad_semi_dual(b, M, reg, beta, i): r''' - Compute the coordinate gradient update for regularized discrete distributions for (i, :) + Compute the coordinate gradient update for regularized discrete distributions for :math:`(i, :)` The function computes the gradient of the semi dual problem: .. math:: - \max_v \sum_i (\sum_j v_j * b_j - reg * log(\sum_j exp((v_j - M_{i,j})/reg) * b_j)) * a_i + \max_\mathbf{v} \ \sum_i \mathbf{a}_i \left[ \sum_j \mathbf{v}_j \mathbf{b}_j - \mathrm{reg} + \cdot \log \left( \sum_j \mathbf{b}_j + \exp \left( \frac{\mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} + \right) \right) \right] Where : - - M is the (ns,nt) metric cost matrix - - v is a dual variable in R^J + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\mathbf{v}` is a dual variable in :math:`\mathbb{R}^{nt}` - reg is the regularization term - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the ASGD & SAG algorithms - as proposed in [18]_ [alg.1 & alg.2] + as proposed in :ref:`[18] ` [alg.1 & alg.2] Parameters @@ -47,7 +50,7 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): v : ndarray, shape (nt,) Dual variable. i : int - Picked number i. + Picked number `i`. Returns ------- @@ -74,12 +77,10 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + .. 
_references-coordinate-grad-semi-dual: References ---------- - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). ''' r = M[i, :] - beta exp_beta = np.exp(-r / reg) * b @@ -88,29 +89,29 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): - r''' - Compute the SAG algorithm to solve the regularized discrete measures - optimal transport max problem + r""" + Compute the SAG algorithm to solve the regularized discrete measures optimal transport max problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1 = b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the SAG algorithm - as proposed in [18]_ [alg.1] + as proposed in :ref:`[18] ` [alg.1] Parameters @@ -131,7 +132,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): Returns ------- - v : ndarray, shape (nt,) + v : ndarray, shape (`nt`,) Dual variable. Examples @@ -154,14 +155,12 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-sag-entropic-transport: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. - ''' + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). + """ if lr is None: lr = 1. / max(a / reg) @@ -187,22 +186,23 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. 
\gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the ASGD algorithm - as proposed in [18]_ [alg.2] + as proposed in :ref:`[18] ` [alg.2] Parameters @@ -220,7 +220,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): Returns ------- - ave_v : ndarray, shape (nt,) + ave_v : ndarray, shape (`nt`,) dual variable Examples @@ -243,13 +243,11 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-averaged-sgd-entropic-transport: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). ''' if lr is None: @@ -271,20 +269,21 @@ def c_transform_entropic(b, M, reg, beta): r''' The goal is to recover u from the c-transform. - The function computes the c_transform of a dual variable from the other + The function computes the c-transform of a dual variable from the other dual variable: .. math:: - u = v^{c,reg} = -reg \sum_j exp((v - M)/reg) b_j + \mathbf{u} = \mathbf{v}^{c,reg} = - \mathrm{reg} \sum_j \mathbf{b}_j + \exp\left( \frac{\mathbf{v} - \mathbf{M}}{\mathrm{reg}} \right) Where : - - M is the (ns,nt) metric cost matrix - - u, v are dual variables in R^IxR^J + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\mathbf{u}`, :math:`\mathbf{v}` are dual variables in :math:`\mathbb{R}^{ns} \times \mathbb{R}^{nt}` - reg is the regularization term It is used to recover an optimal u from optimal v solving the semi dual - problem, see Proposition 2.1 of [18]_ + problem, see Proposition 2.1 of :ref:`[18] ` Parameters @@ -300,7 +299,7 @@ def c_transform_entropic(b, M, reg, beta): Returns ------- - u : ndarray, shape (ns,) + u : ndarray, shape (`ns`,) Dual variable. Examples @@ -323,13 +322,11 @@ def c_transform_entropic(b, M, reg, beta): [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-c-transform-entropic: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). 
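As a short editorial illustration (not part of the patch) of how the ASGD dual variable and the c-transform above fit together: the sketch below uses the `averaged_sgd_entropic_transport` and `c_transform_entropic` signatures documented in this diff, together with the standard entropic primal-dual relation to rebuild the plan; the toy data, regularization and iteration count are arbitrary.

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    n_s, n_t = 7, 4
    a, b = ot.unif(n_s), ot.unif(n_t)
    M = ot.dist(rng.randn(n_s, 2), rng.randn(n_t, 2))
    reg = 1.0

    # semi-dual variable v obtained by ASGD, then u recovered via the c-transform
    v = ot.stochastic.averaged_sgd_entropic_transport(a, b, M, reg, numItermax=30000)
    u = ot.stochastic.c_transform_entropic(b, M, reg, v)
    # entropic primal-dual relation: pi_ij = a_i * b_j * exp((u_i + v_j - M_ij) / reg)
    pi = a[:, None] * b[None, :] * np.exp((u[:, None] + v[None, :] - M) / reg)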
''' n_source = np.shape(M)[0] @@ -345,27 +342,28 @@ def c_transform_entropic(b, M, reg, beta): def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, log=False): r''' - Compute the transportation matrix to solve the regularized discrete - measures optimal transport max problem + Compute the transportation matrix to solve the regularized discrete measures optimal transport max problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) + The algorithm used for solving the problem is the SAG or ASGD algorithms - as proposed in [18]_ + as proposed in :ref:`[18] ` Parameters @@ -419,13 +417,11 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-solve-semi-dual-entropic: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). ''' if method.lower() == "sag": @@ -459,26 +455,30 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, r''' Computes the partial gradient of the dual optimal transport problem. - For each (i,j) in a batch of coordinates, the partial gradients are : + For each :math:`(i,j)` in a batch of coordinates, the partial gradients are : .. 
math:: - \partial_{u_i} F = u_i * b_s/l_{v} - \sum_{j \in B_v} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j + \partial_{\mathbf{u}_i} F = \frac{b_s}{l_v} \mathbf{u}_i - + \sum_{j \in B_v} \mathbf{a}_i \mathbf{b}_j + \exp\left( \frac{\mathbf{u}_i + \mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} \right) - \partial_{v_j} F = v_j * b_s/l_{u} - \sum_{i \in B_u} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j + \partial_{\mathbf{v}_j} F = \frac{b_s}{l_u} \mathbf{v}_j - + \sum_{i \in B_u} \mathbf{a}_i \mathbf{b}_j + \exp\left( \frac{\mathbf{u}_i + \mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} \right) Where : - - M is the (ns,nt) metric cost matrix - - u, v are dual variables in R^ixR^J + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\mathbf{u}`, :math:`\mathbf{v}` are dual variables in :math:`\mathbb{R}^{ns} \times \mathbb{R}^{nt}` - reg is the regularization term - :math:`B_u` and :math:`B_v` are lists of index - - :math:`b_s` is the size of the batchs :math:`B_u` and :math:`B_v` - - :math:`l_u` and :math:`l_v` are the lenghts of :math:`B_u` and :math:`B_v` - - a and b are source and target weights (sum to 1) + - :math:`b_s` is the size of the batches :math:`B_u` and :math:`B_v` + - :math:`l_u` and :math:`l_v` are the lengths of :math:`B_u` and :math:`B_v` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the dual problem is the SGD algorithm - as proposed in [19]_ [alg.1] + as proposed in :ref:`[19] ` [alg.1] Parameters @@ -504,7 +504,7 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, Returns ------- - grad : ndarray, shape (ns,) + grad : ndarray, shape (`ns`,) partial grad F Examples @@ -533,12 +533,11 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, [5.06266486e-02, 2.16230494e-03, 2.26215141e-03, 6.81514609e-04], [6.06713990e-02, 3.98139808e-02, 5.46829338e-02, 8.62371424e-06]]) + + .. _references-batch-grad-dual: References ---------- - - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018) ''' G = - (np.exp((alpha[batch_alpha, None] + beta[None, batch_beta] - M[batch_alpha, :][:, batch_beta]) / reg) * @@ -555,25 +554,25 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): r''' - Compute the sgd algorithm to solve the regularized discrete measures - optimal transport dual problem + Compute the sgd algorithm to solve the regularized discrete measures optimal transport dual problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. 
\ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) Parameters ---------- @@ -632,9 +631,7 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): References ---------- - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018) ''' n_source = np.shape(M)[0] @@ -657,25 +654,25 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, log=False): r''' - Compute the transportation matrix to solve the regularized discrete measures - optimal transport dual problem + Compute the transportation matrix to solve the regularized discrete measures optimal transport dual problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) Parameters ---------- @@ -736,10 +733,7 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, References ---------- - - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018) ''' opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg, batch_size, diff --git a/ot/unbalanced.py b/ot/unbalanced.py index 6a61aa1..15e180b 100644 --- a/ot/unbalanced.py +++ b/ot/unbalanced.py @@ -23,29 +23,31 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + W = \min_\gamma \ \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
- \gamma\geq 0 + \gamma \geq 0 + where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization - term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b - If many, compute all the OT distances (a, b_i) + One or multiple unnormalized histograms of dimension `dim_b`. + If many, compute all the OT distances :math:`(\mathbf{a}, \mathbf{b}_i)_i` M : np.ndarray (dim_a, dim_b) loss matrix reg : float @@ -68,14 +70,14 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, Returns ------- if n_hists == 1: - gamma : (dim_a x dim_b) ndarray + - gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters - log : dict + - log : dict log dictionary returned only if `log` is `True` else: - ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` - log : dict + - ot_distance : (n_hists,) ndarray + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` + - log : dict log dictionary returned only if `log` is `True` Examples @@ -90,9 +92,9 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, [0.18807035, 0.51122823]]) + .. _references-sinkhorn-unbalanced: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 @@ -111,11 +113,11 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, See Also -------- - ot.unbalanced.sinkhorn_knopp_unbalanced : Unbalanced Classic Sinkhorn [10] + ot.unbalanced.sinkhorn_knopp_unbalanced : Unbalanced Classic Sinkhorn :ref:`[10] ` ot.unbalanced.sinkhorn_stabilized_unbalanced: - Unbalanced Stabilized sinkhorn [9][10] + Unbalanced Stabilized sinkhorn :ref:`[9, 10] ` ot.unbalanced.sinkhorn_reg_scaling_unbalanced: - Unbalanced Sinkhorn with epslilon scaling [9][10] + Unbalanced Sinkhorn with epslilon scaling :ref:`[9, 10] ` """ @@ -151,29 +153,30 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
\gamma\geq 0 where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b - If many, compute all the OT distances (a, b_i) + One or multiple unnormalized histograms of dimension `dim_b`. + If many, compute all the OT distances :math:`(\mathbf{a}, \mathbf{b}_i)_i` M : np.ndarray (dim_a, dim_b) loss matrix reg : float @@ -196,7 +199,7 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', Returns ------- ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` log : dict log dictionary returned only if `log` is `True` @@ -211,10 +214,9 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', array([0.31912866]) - + .. _references-sinkhorn-unbalanced2: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 @@ -232,9 +234,9 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', See Also -------- - ot.unbalanced.sinkhorn_knopp : Unbalanced Classic Sinkhorn [10] - ot.unbalanced.sinkhorn_stabilized: Unbalanced Stabilized sinkhorn [9][10] - ot.unbalanced.sinkhorn_reg_scaling: Unbalanced Sinkhorn with epslilon scaling [9][10] + ot.unbalanced.sinkhorn_knopp : Unbalanced Classic Sinkhorn :ref:`[10] ` + ot.unbalanced.sinkhorn_stabilized: Unbalanced Stabilized sinkhorn :ref:`[9, 10] ` + ot.unbalanced.sinkhorn_reg_scaling: Unbalanced Sinkhorn with epslilon scaling :ref:`[9, 10] ` """ b = np.asarray(b, dtype=np.float64) @@ -270,26 +272,29 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000, The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \reg_m KL(\gamma 1, a) + \reg_m KL(\gamma^T 1, b) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
- \gamma\geq 0 + \gamma \geq 0 + where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence - The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b + One or multiple unnormalized histograms of dimension `dim_b` If many, compute all the OT distances (a, b_i) M : np.ndarray (dim_a, dim_b) loss matrix @@ -310,15 +315,16 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000, Returns ------- if n_hists == 1: - gamma : (dim_a x dim_b) ndarray + - gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters - log : dict + - log : dict log dictionary returned only if `log` is `True` else: - ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` - log : dict + - ot_distance : (n_hists,) ndarray + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` + - log : dict log dictionary returned only if `log` is `True` + Examples -------- @@ -330,9 +336,10 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000, array([[0.51122823, 0.18807035], [0.18807035, 0.51122823]]) + + .. _references-sinkhorn-knopp-unbalanced: References ---------- - .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. @@ -445,32 +452,34 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 problem and return the loss The function solves the following optimization problem using log-domain - stabilization as proposed in [10]: + stabilization as proposed in :ref:`[10] `: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
- \gamma\geq 0 + \gamma \geq 0 + where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization - term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b - If many, compute all the OT distances (a, b_i) + One or multiple unnormalized histograms of dimension `dim_b`. + If many, compute all the OT distances :math:`(\mathbf{a}, \mathbf{b}_i)_i` M : np.ndarray (dim_a, dim_b) loss matrix reg : float @@ -492,14 +501,14 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 Returns ------- if n_hists == 1: - gamma : (dim_a x dim_b) ndarray + - gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters - log : dict + - log : dict log dictionary returned only if `log` is `True` else: - ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` - log : dict + - ot_distance : (n_hists,) ndarray + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` + - log : dict log dictionary returned only if `log` is `True` Examples -------- @@ -512,9 +521,10 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 array([[0.51122823, 0.18807035], [0.18807035, 0.51122823]]) + + .. _references-sinkhorn-stabilized-unbalanced: References ---------- - .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. @@ -654,29 +664,27 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3, numItermax=1000, stopThr=1e-6, verbose=False, log=False): - r"""Compute the entropic unbalanced wasserstein barycenter of A with stabilization. + r"""Compute the entropic unbalanced wasserstein barycenter of :math:`\mathbf{A}` with stabilization. The function solves the following optimization problem: .. 
math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{u_{reg}}(\mathbf{a},\mathbf{a}_i) where : - - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized - Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) - - :math:`\mathbf{a}_i` are training distributions in the columns of - matrix :math:`\mathbf{A}` - - reg and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + - :math:`W_{u_{reg}}(\cdot,\cdot)` is the unbalanced entropic regularized Wasserstein distance (see :py:func:`ot.unbalanced.sinkhorn_unbalanced`) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT - reg_mis the marginal relaxation hyperparameter - The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10] ` Parameters ---------- A : np.ndarray (dim, n_hists) - `n_hists` training distributions a_i of dimension dim + `n_hists` training distributions :math:`\mathbf{a}_i` of dimension `dim` M : np.ndarray (dim, dim) ground metric matrix for OT. reg : float @@ -706,9 +714,9 @@ def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3, log dictionary return only if log==True in parameters + .. _references-barycenter-unbalanced-stabilized: References ---------- - .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. @@ -806,29 +814,27 @@ def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3, def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, numItermax=1000, stopThr=1e-6, verbose=False, log=False): - r"""Compute the entropic unbalanced wasserstein barycenter of A. + r"""Compute the entropic unbalanced wasserstein barycenter of :math:`\mathbf{A}`. - The function solves the following optimization problem with a + The function solves the following optimization problem with :math:`\mathbf{a}` .. 
math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{u_{reg}}(\mathbf{a},\mathbf{a}_i) where : - - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized - Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) - - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` - - reg and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + - :math:`W_{u_{reg}}(\cdot,\cdot)` is the unbalanced entropic regularized Wasserstein distance (see :py:func:`ot.unbalanced.sinkhorn_unbalanced`) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT - reg_mis the marginal relaxation hyperparameter + The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10] ` Parameters ---------- A : np.ndarray (dim, n_hists) - `n_hists` training distributions a_i of dimension dim + `n_hists` training distributions :math:`\mathbf{a}_i` of dimension `dim` M : np.ndarray (dim, dim) ground metric matrix for OT. reg : float @@ -856,9 +862,9 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, log dictionary return only if log==True in parameters + .. _references-barycenter-unbalanced-sinkhorn: References ---------- - .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. @@ -936,29 +942,27 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, numItermax=1000, stopThr=1e-6, verbose=False, log=False, **kwargs): - r"""Compute the entropic unbalanced wasserstein barycenter of A. + r"""Compute the entropic unbalanced wasserstein barycenter of :math:`\mathbf{A}`. - The function solves the following optimization problem with a + The function solves the following optimization problem with :math:`\mathbf{a}` .. 
math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{u_{reg}}(\mathbf{a},\mathbf{a}_i) where : - - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized - Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) - - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` - - reg and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + - :math:`W_{u_{reg}}(\cdot,\cdot)` is the unbalanced entropic regularized Wasserstein distance (see :py:func:`ot.unbalanced.sinkhorn_unbalanced`) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT - reg_mis the marginal relaxation hyperparameter + The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10] ` Parameters ---------- A : np.ndarray (dim, n_hists) - `n_hists` training distributions a_i of dimension dim + `n_hists` training distributions :math:`\mathbf{a}_i` of dimension `dim` M : np.ndarray (dim, dim) ground metric matrix for OT. reg : float @@ -986,9 +990,9 @@ def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, log dictionary return only if log==True in parameters + .. _references-barycenter-unbalanced: References ---------- - .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. diff --git a/ot/utils.py b/ot/utils.py index 0608aee..c878563 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -21,26 +21,26 @@ __time_tic_toc = time.time() def tic(): - """ Python implementation of Matlab tic() function """ + r""" Python implementation of Matlab tic() function """ global __time_tic_toc __time_tic_toc = time.time() def toc(message='Elapsed time : {} s'): - """ Python implementation of Matlab toc() function """ + r""" Python implementation of Matlab toc() function """ t = time.time() print(message.format(t - __time_tic_toc)) return t - __time_tic_toc def toq(): - """ Python implementation of Julia toc() function """ + r""" Python implementation of Julia toc() function """ t = time.time() return t - __time_tic_toc def kernel(x1, x2, method='gaussian', sigma=1, **kwargs): - """Compute kernel matrix""" + r"""Compute kernel matrix""" nx = get_backend(x1, x2) @@ -50,13 +50,13 @@ def kernel(x1, x2, method='gaussian', sigma=1, **kwargs): def laplacian(x): - """Compute Laplacian matrix""" + r"""Compute Laplacian matrix""" L = np.diag(np.sum(x, axis=0)) - x return L def list_to_array(*lst): - """ Convert a list if in numpy format """ + r""" Convert a list if in numpy format """ if len(lst) > 1: return [np.array(a) if isinstance(a, list) else a for a in lst] else: @@ -64,17 +64,18 @@ def list_to_array(*lst): def proj_simplex(v, z=1): - r""" compute the closest point (orthogonal projection) on the - generalized (n-1)-simplex of a vector v wrt. to the Euclidean + r"""Compute the closest point (orthogonal projection) on the + generalized `(n-1)`-simplex of a vector :math:`\mathbf{v}` wrt. to the Euclidean distance, thus solving: + .. 
math:: - \mathcal{P}(w) \in arg\min_\gamma || \gamma - v ||_2 + \mathcal{P}(w) \in \mathop{\arg \min}_\gamma \| \gamma - \mathbf{v} \|_2 - s.t. \gamma^T 1= z + s.t. \ \gamma^T \mathbf{1} = z - \gamma\geq 0 + \gamma \geq 0 - If v is a 2d array, compute all the projections wrt. axis 0 + If :math:`\mathbf{v}` is a 2d array, compute all the projections wrt. axis 0 .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -87,7 +88,7 @@ def proj_simplex(v, z=1): Returns ------- - h : ndarray, shape (n,d) + h : ndarray, shape (`n`, `d`) Array of projections on the simplex """ nx = get_backend(v) @@ -116,26 +117,24 @@ def proj_simplex(v, z=1): def unif(n): - """ return a uniform histogram of length n (simplex) + r""" + Return a uniform histogram of length `n` (simplex). Parameters ---------- - n : int number of bins in the histogram Returns ------- - h : np.array (n,) - histogram of length n such that h_i=1/n for all i - - + h : np.array (`n`,) + histogram of length `n` such that :math:`\forall i, \mathbf{h}_i = \frac{1}{n}` """ return np.ones((n,)) / n def clean_zeros(a, b, M): - """ Remove all components with zeros weights in a and b + r""" Remove all components with zeros weights in :math:`\mathbf{a}` and :math:`\mathbf{b}` """ M2 = M[a > 0, :][:, b > 0].copy() # copy force c style matrix (froemd) a2 = a[a > 0] @@ -144,8 +143,8 @@ def clean_zeros(a, b, M): def euclidean_distances(X, Y, squared=False): - """ - Considering the rows of X (and Y=X) as vectors, compute the + r""" + Considering the rows of :math:`\mathbf{X}` (and :math:`\mathbf{Y} = \mathbf{X}`) as vectors, compute the distance matrix between each pair of vectors. .. note:: This function is backend-compatible and will work on arrays @@ -153,14 +152,14 @@ def euclidean_distances(X, Y, squared=False): Parameters ---------- - X : {array-like}, shape (n_samples_1, n_features) - Y : {array-like}, shape (n_samples_2, n_features) + X : array-like, shape (n_samples_1, n_features) + Y : array-like, shape (n_samples_2, n_features) squared : boolean, optional Return squared Euclidean distances. Returns ------- - distances : {array}, shape (n_samples_1, n_samples_2) + distances : array-like, shape (`n_samples_1`, `n_samples_2`) """ nx = get_backend(X, Y) @@ -184,7 +183,7 @@ def euclidean_distances(X, Y, squared=False): def dist(x1, x2=None, metric='sqeuclidean', p=2): - """Compute distance between samples in x1 and x2 + r"""Compute distance between samples in :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}` .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -193,9 +192,9 @@ def dist(x1, x2=None, metric='sqeuclidean', p=2): ---------- x1 : array-like, shape (n1,d) - matrix with n1 samples of size d + matrix with `n1` samples of size `d` x2 : array-like, shape (n2,d), optional - matrix with n2 samples of size d (if None then x2=x1) + matrix with `n2` samples of size `d` (if None then :math:`\mathbf{x_2} = \mathbf{x_1}`) metric : str | callable, optional 'sqeuclidean' or 'euclidean' on all backends. 
        On numpy the function also accepts from the scipy.spatial.distance.cdist function : 'braycurtis',
@@ -208,7 +207,7 @@
 
     Returns
     -------
-    M : array-like, shape (n1, n2)
+    M : array-like, shape (`n1`, `n2`)
         distance matrix computed with given metric
     """
@@ -226,7 +225,7 @@
 
 
 def dist0(n, method='lin_square'):
-    """Compute standard cost matrices of size (n, n) for OT problems
+    r"""Compute standard cost matrices of size (`n`, `n`) for OT problems
 
     Parameters
     ----------
@@ -235,11 +234,11 @@
     method : str, optional
         Type of loss matrix chosen from:
 
-        * 'lin_square' : linear sampling between 0 and n-1, quadratic loss
+        * 'lin_square' : linear sampling between 0 and `n-1`, quadratic loss
 
     Returns
     -------
-    M : ndarray, shape (n1,n2)
+    M : ndarray, shape (`n`, `n`)
         Distance matrix computed with given metric.
     """
     res = 0
@@ -250,7 +249,7 @@
 
 
 def cost_normalization(C, norm=None):
-    """ Apply normalization to the loss matrix
+    r""" Apply normalization to the loss matrix
 
     Parameters
     ----------
@@ -262,7 +261,7 @@
 
     Returns
     -------
-    C : ndarray, shape (n1, n2)
+    C : ndarray, shape (`n1`, `n2`)
         The input cost matrix normalized according to given norm.
     """
@@ -284,23 +283,23 @@
 
 
 def dots(*args):
-    """ dots function for multiple matrix multiply """
+    r""" dots function for multiple matrix multiply """
     return reduce(np.dot, args)
 
 
 def label_normalization(y, start=0):
-    """ Transform labels to start at a given value
+    r""" Transform labels to start at a given value
 
     Parameters
     ----------
     y : array-like, shape (n, )
         The vector of labels to be normalized.
     start : int
-        Desired value for the smallest label in y (default=0)
+        Desired value for the smallest label in :math:`\mathbf{y}` (default=0)
 
     Returns
     -------
-    y : array-like, shape (n1, )
+    y : array-like, shape (`n`, )
         The input vector of labels normalized according to given start value.
     """
@@ -311,14 +310,14 @@
 
 
 def parmap(f, X, nprocs="default"):
-    """ paralell map for multiprocessing.
+    r""" parallel map for multiprocessing.
     The function has been deprecated and only performs a regular map.
     """
     return list(map(f, X))
 
 
 def check_params(**kwargs):
-    """check_params: check whether some parameters are missing
+    r"""check_params: check whether some parameters are missing
     """
 
     missing_params = []
@@ -339,14 +338,14 @@
 
 
 def check_random_state(seed):
-    """Turn seed into a np.random.RandomState instance
+    r"""Turn `seed` into a np.random.RandomState instance
 
     Parameters
     ----------
     seed : None | int | instance of RandomState
-        If seed is None, return the RandomState singleton used by np.random.
-        If seed is an int, return a new RandomState instance seeded with seed.
-        If seed is already a RandomState instance, return it.
+        If `seed` is None, return the RandomState singleton used by np.random.
+        If `seed` is an int, return a new RandomState instance seeded with `seed`.
+        If `seed` is already a RandomState instance, return it.
         Otherwise raise ValueError.
     """
     if seed is None or seed is np.random:
@@ -360,18 +359,21 @@
 
 
 class deprecated(object):
-    """Decorator to mark a function or class as deprecated.
+    r"""Decorator to mark a function or class as deprecated.
     deprecated class from scikit-learn package
     https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/deprecation.py
 
     Issue a warning when the function is called/the class is instantiated and
     adds a warning to the docstring.
 
     The optional extra argument will be appended to the deprecation message
-    and the docstring. Note: to use this with the default value for extra, put
-    in an empty of parentheses:
-    >>> from ot.deprecation import deprecated  # doctest: +SKIP
-    >>> @deprecated()  # doctest: +SKIP
-    ... def some_function(): pass  # doctest: +SKIP
+    and the docstring.
+
+    .. note::
+        To use this with the default value for extra, use empty parentheses:
+
+        >>> from ot.deprecation import deprecated  # doctest: +SKIP
+        >>> @deprecated()  # doctest: +SKIP
+        ... def some_function(): pass  # doctest: +SKIP
 
     Parameters
     ----------
@@ -386,7 +388,7 @@
         self.extra = extra
 
     def __call__(self, obj):
-        """Call method
+        r"""Call method
 
         Parameters
         ----------
         obj : object
@@ -417,7 +419,7 @@
         return cls
 
     def _decorate_fun(self, fun):
-        """Decorate function fun"""
+        r"""Decorate function fun"""
 
         msg = "Function %s is deprecated" % fun.__name__
         if self.extra:
@@ -443,7 +445,7 @@
 
 
 def _is_deprecated(func):
-    """Helper to check if func is wraped by our deprecated decorator"""
+    r"""Helper to check if func is wrapped by our deprecated decorator"""
     if sys.version_info < (3, 5):
         raise NotImplementedError("This is only available for python3.5 "
                                   "or above")
@@ -457,7 +459,7 @@
 
 
 class BaseEstimator(object):
-    """Base class for most objects in POT
+    r"""Base class for most objects in POT
 
     Code adapted from sklearn BaseEstimator class
 
@@ -470,7 +472,7 @@
 
     @classmethod
     def _get_param_names(cls):
-        """Get parameter names for the estimator"""
+        r"""Get parameter names for the estimator"""
 
         # fetch the constructor or the original constructor before
         # deprecation wrapping if any
@@ -497,7 +499,7 @@
         return sorted([p.name for p in parameters])
 
     def get_params(self, deep=True):
-        """Get parameters for this estimator.
+        r"""Get parameters for this estimator.
 
         Parameters
         ----------
@@ -534,7 +536,7 @@
         return out
 
     def set_params(self, **params):
-        """Set the parameters of this estimator.
+        r"""Set the parameters of this estimator.
 
         The method works on simple estimators as well as on nested
         objects (such as pipelines). The latter have parameters of the form
@@ -574,7 +576,7 @@
 
 
 class UndefinedParameter(Exception):
-    """
+    r"""
     Aim at raising an Exception when a undefined parameter is called
     """
-- 
cgit v1.2.3
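
The barycenter_unbalanced docstring touched above states the optimization problem but no call pattern; the following is a minimal usage sketch, assuming POT is installed, with illustrative toy histograms and illustrative values for the reg and reg_m hyperparameters documented in that docstring::

    import numpy as np
    import ot

    n = 50                                     # number of bins
    x = np.arange(n, dtype=np.float64)

    # two normalized histograms stacked as columns of A (dim, n_hists)
    a1 = np.exp(-0.5 * ((x - 15.) / 3.) ** 2)
    a2 = np.exp(-0.5 * ((x - 35.) / 3.) ** 2)
    A = np.vstack((a1, a2)).T
    A /= A.sum(axis=0)

    # quadratic ground cost on the line, normalized
    M = ot.utils.dist0(n)
    M /= M.max()

    # unbalanced Wasserstein barycenter via generalized Sinkhorn iterations
    bary = ot.unbalanced.barycenter_unbalanced(A, M, reg=1e-2, reg_m=1.)
    print(bary.shape)                          # (50,)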
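A short sketch of how ot.utils.proj_simplex behaves, matching the projection problem stated in its docstring above; the input vectors are arbitrary illustrative data::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)

    v = rng.randn(5)                  # arbitrary vector
    p = ot.utils.proj_simplex(v)      # closest point with p >= 0 and p.sum() == 1
    print(p.min() >= 0, np.isclose(p.sum(), 1))

    # with a 2d input, every column is projected independently (axis 0)
    V = rng.randn(5, 3)
    P = ot.utils.proj_simplex(V, z=1)
    print(P.sum(axis=0))              # approximately [1. 1. 1.]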
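The dist and unif helpers documented above are typically combined to set up a small OT problem; a minimal sketch with random samples (the sample sizes and the final call to ot.emd are only an illustration, not part of this patch)::

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    x1 = rng.randn(4, 2)                        # 4 source samples in R^2
    x2 = rng.randn(5, 2)                        # 5 target samples in R^2

    M = ot.dist(x1, x2, metric='sqeuclidean')   # (4, 5) cost matrix
    a, b = ot.unif(4), ot.unif(5)               # uniform histograms on the samples

    G = ot.emd(a, b, M)                         # exact OT plan between the uniform marginals
    print(G.shape, G.sum())                     # (4, 5) and total mass 1.0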
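A sketch of the check_random_state and deprecated utilities whose docstrings are touched above; old_function and the replacement named in the extra message are hypothetical and only serve to trigger the warning::

    import warnings

    from ot.utils import check_random_state, deprecated

    rng = check_random_state(0)             # int -> new RandomState seeded with 0
    assert check_random_state(rng) is rng   # an existing RandomState is returned as-is

    @deprecated("use some_new_function instead")   # hypothetical replacement name
    def old_function():
        return 42

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        old_function()                      # warns, appending the extra message
        print(caught[-1].category.__name__)  # DeprecationWarning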
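The get_params/set_params methods documented on BaseEstimator above are inherited by the POT transport estimators; a sketch on one of them, where SinkhornTransport and the reg_e values are only an illustration::

    from ot.da import SinkhornTransport

    st = SinkhornTransport(reg_e=1e-1)
    print(st.get_params())             # dict of constructor parameters, including reg_e
    st.set_params(reg_e=1e-2)          # update a parameter in place
    print(st.get_params()['reg_e'])    # 0.01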