1 files changed, 93 insertions, 99 deletions
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 13ed9cc..693675f 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -18,22 +18,25 @@ import numpy as np
 
 def coordinate_grad_semi_dual(b, M, reg, beta, i):
     r'''
-    Compute the coordinate gradient update for regularized discrete distributions for (i, :)
+    Compute the coordinate gradient update for regularized discrete distributions for :math:`(i, :)`
 
     The function computes the gradient of the semi dual problem:
 
     .. math::
-        \max_v \sum_i (\sum_j v_j * b_j - reg * log(\sum_j exp((v_j - M_{i,j})/reg) * b_j)) * a_i
+        \max_\mathbf{v} \ \sum_i \mathbf{a}_i \left[ \sum_j \mathbf{v}_j \mathbf{b}_j - \mathrm{reg}
+        \cdot \log \left( \sum_j \mathbf{b}_j
+        \exp \left( \frac{\mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}}
+        \right) \right) \right]
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
-    - v is a dual variable in R^J
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+    - :math:`\mathbf{v}` is a dual variable in :math:`\mathbb{R}^{nt}`
     - reg is the regularization term
-    - a and b are source and target weights (sum to 1)
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1)
 
     The algorithm used for solving the problem is the ASGD & SAG algorithms
-    as proposed in [18]_ [alg.1 & alg.2]
+    as proposed in :ref:`[18] <references-coordinate-grad-semi-dual>` [alg.1 & alg.2]
 
 
     Parameters
@@ -47,7 +50,7 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
     v : ndarray, shape (nt,)
         Dual variable.
     i : int
-        Picked number i.
+        Picked number `i`.
 
     Returns
     -------
@@ -74,12 +77,10 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
            [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]])
 
 
+    .. _references-coordinate-grad-semi-dual:
     References
     ----------
-    [Genevay et al., 2016] :
-        Stochastic Optimization for Large-scale Optimal Transport,
-         Advances in Neural Information Processing Systems (2016),
-          arXiv preprint arxiv:1605.08527.
+    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016).
     '''
     r = M[i, :] - beta
     exp_beta = np.exp(-r / reg) * b
@@ -88,29 +89,29 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
 
 
 def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
-    r'''
-    Compute the SAG algorithm to solve the regularized discrete measures
-        optimal transport max problem
+    r"""
+    Compute the SAG algorithm to solve the regularized discrete measures optimal transport max problem
 
     The function solves the following optimization problem:
 
     .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F +
+        \mathrm{reg} \cdot\Omega(\gamma)
 
-        s.t. \gamma 1 = a
+        s.t. \ \gamma \mathbf{1} = \mathbf{a}
 
-             \gamma^T 1 = b
+             \gamma^T \mathbf{1} = \mathbf{b}
 
              \gamma \geq 0
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
     - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-    - a and b are source and target weights (sum to 1)
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1)
 
     The algorithm used for solving the problem is the SAG algorithm
-    as proposed in [18]_ [alg.1]
+    as proposed in :ref:`[18] <references-sag-entropic-transport>` [alg.1]
 
 
     Parameters
@@ -131,7 +132,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
 
     Returns
     -------
-    v : ndarray, shape (nt,)
+    v : ndarray, shape (`nt`,)
         Dual variable.
 
     Examples
@@ -154,14 +155,12 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
            [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01],
            [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]])
 
+
+    .. _references-sag-entropic-transport:
     References
     ----------
-
-    [Genevay et al., 2016] :
-                    Stochastic Optimization for Large-scale Optimal Transport,
-                     Advances in Neural Information Processing Systems (2016),
-                      arXiv preprint arxiv:1605.08527.
-    '''
+    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016).
+    """
 
     if lr is None:
         lr = 1. / max(a / reg)
@@ -187,22 +186,23 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
     The function solves the following optimization problem:
 
     .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F +
+        \mathrm{reg}\cdot\Omega(\gamma)
 
-        s.t. \gamma 1 = a
+        s.t. \gamma \mathbf{1} = \mathbf{a}
 
-             \gamma^T 1= b
+             \gamma^T \mathbf{1} = \mathbf{b}
 
              \gamma \geq 0
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
     - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-    - a and b are source and target weights (sum to 1)
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1)
 
     The algorithm used for solving the problem is the ASGD algorithm
-    as proposed in [18]_ [alg.2]
+    as proposed in :ref:`[18] <references-averaged-sgd-entropic-transport>` [alg.2]
 
 
     Parameters
@@ -220,7 +220,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
 
     Returns
     -------
-    ave_v : ndarray, shape (nt,)
+    ave_v : ndarray, shape (`nt`,)
         dual variable
 
     Examples
@@ -243,13 +243,11 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
            [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01],
            [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]])
 
+
+    .. _references-averaged-sgd-entropic-transport:
     References
     ----------
-
-    [Genevay et al., 2016] :
-        Stochastic Optimization for Large-scale Optimal Transport,
-         Advances in Neural Information Processing Systems (2016),
-          arXiv preprint arxiv:1605.08527.
+    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016).
     '''
 
     if lr is None:
@@ -271,20 +269,21 @@ def c_transform_entropic(b, M, reg, beta):
     r'''
     The goal is to recover u from the c-transform.
 
-    The function computes the c_transform of a dual variable from the other
+    The function computes the c-transform of a dual variable from the other
     dual variable:
 
     .. math::
-        u = v^{c,reg} = -reg \sum_j exp((v - M)/reg) b_j
+        \mathbf{u} = \mathbf{v}^{c,reg} = - \mathrm{reg} \sum_j \mathbf{b}_j
+        \exp\left( \frac{\mathbf{v} - \mathbf{M}}{\mathrm{reg}} \right)
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
-    - u, v are dual variables in R^IxR^J
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+    - :math:`\mathbf{u}`, :math:`\mathbf{v}` are dual variables in :math:`\mathbb{R}^{ns} \times \mathbb{R}^{nt}`
     - reg is the regularization term
 
     It is used to recover an optimal u from optimal v solving the semi dual
-    problem, see Proposition 2.1 of [18]_
+    problem, see Proposition 2.1 of :ref:`[18] <references-c-transform-entropic>`
 
 
     Parameters
@@ -300,7 +299,7 @@ def c_transform_entropic(b, M, reg, beta):
 
     Returns
     -------
-    u : ndarray, shape (ns,)
+    u : ndarray, shape (`ns`,)
         Dual variable.
 
     Examples
@@ -323,13 +322,11 @@ def c_transform_entropic(b, M, reg, beta):
            [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01],
            [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]])
 
+
+    .. _references-c-transform-entropic:
     References
     ----------
-
-    [Genevay et al., 2016] :
-        Stochastic Optimization for Large-scale Optimal Transport,
-         Advances in Neural Information Processing Systems (2016),
-          arXiv preprint arxiv:1605.08527.
+    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016).
     '''
 
     n_source = np.shape(M)[0]
@@ -345,27 +342,28 @@ def c_transform_entropic(b, M, reg, beta):
 def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None,
                                 log=False):
     r'''
-    Compute the transportation matrix to solve the regularized discrete
-        measures optimal transport max problem
+    Compute the transportation matrix to solve the regularized discrete measures optimal transport max problem
 
     The function solves the following optimization problem:
 
     .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F +
+        \mathrm{reg} \cdot\Omega(\gamma)
 
-        s.t. \gamma 1 = a
+        s.t. \ \gamma \mathbf{1} = \mathbf{a}
 
-             \gamma^T 1= b
+             \gamma^T \mathbf{1} = \mathbf{b}
 
              \gamma \geq 0
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
     - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-    - a and b are source and target weights (sum to 1)
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1)
+
     The algorithm used for solving the problem is the SAG or ASGD algorithms
-    as proposed in [18]_
+    as proposed in :ref:`[18] <references-solve-semi-dual-entropic>`
 
 
     Parameters
@@ -419,13 +417,11 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None,
            [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01],
            [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]])
 
+
+    .. _references-solve-semi-dual-entropic:
     References
     ----------
-
-    [Genevay et al., 2016] :
-                    Stochastic Optimization for Large-scale Optimal Transport,
-                     Advances in Neural Information Processing Systems (2016),
-                      arXiv preprint arxiv:1605.08527.
+    .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016).
     '''
 
     if method.lower() == "sag":
@@ -459,26 +455,30 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha,
     r'''
     Computes the partial gradient of the dual optimal transport problem.
 
-    For each (i,j) in a batch of coordinates, the partial gradients are :
+    For each :math:`(i,j)` in a batch of coordinates, the partial gradients are :
 
     .. math::
-        \partial_{u_i} F = u_i * b_s/l_{v} - \sum_{j \in B_v} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j
+        \partial_{\mathbf{u}_i} F = \frac{b_s}{l_v} \mathbf{u}_i -
+        \sum_{j \in B_v} \mathbf{a}_i \mathbf{b}_j
+        \exp\left( \frac{\mathbf{u}_i + \mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} \right)
 
-        \partial_{v_j} F = v_j * b_s/l_{u} - \sum_{i \in B_u} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j
+        \partial_{\mathbf{v}_j} F = \frac{b_s}{l_u} \mathbf{v}_j -
+        \sum_{i \in B_u} \mathbf{a}_i \mathbf{b}_j
+        \exp\left( \frac{\mathbf{u}_i + \mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} \right)
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
-    - u, v are dual variables in R^ixR^J
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+    - :math:`\mathbf{u}`, :math:`\mathbf{v}` are dual variables in :math:`\mathbb{R}^{ns} \times \mathbb{R}^{nt}`
     - reg is the regularization term
     - :math:`B_u` and :math:`B_v` are lists of index
-    - :math:`b_s` is the size of the batchs :math:`B_u` and :math:`B_v`
-    - :math:`l_u` and :math:`l_v` are the lenghts of :math:`B_u` and :math:`B_v`
-    - a and b are source and target weights (sum to 1)
+    - :math:`b_s` is the size of the batches :math:`B_u` and :math:`B_v`
+    - :math:`l_u` and :math:`l_v` are the lengths of :math:`B_u` and :math:`B_v`
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1)
 
 
     The algorithm used for solving the dual problem is the SGD algorithm
-    as proposed in [19]_ [alg.1]
+    as proposed in :ref:`[19] <references-batch-grad-dual>` [alg.1]
 
 
     Parameters
@@ -504,7 +504,7 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha,
 
     Returns
     -------
-    grad : ndarray, shape (ns,)
+    grad : ndarray, shape (`ns`,)
         partial grad F
 
     Examples
@@ -533,12 +533,11 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha,
            [5.06266486e-02, 2.16230494e-03, 2.26215141e-03, 6.81514609e-04],
            [6.06713990e-02, 3.98139808e-02, 5.46829338e-02, 8.62371424e-06]])
 
+
+    .. _references-batch-grad-dual:
     References
     ----------
-
-    [Seguy et al., 2018] :
-                    International Conference on Learning Representation (2018),
-                      arXiv preprint arxiv:1711.02283.
+    .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018)
     '''
     G = - (np.exp((alpha[batch_alpha, None] + beta[None, batch_beta] -
                    M[batch_alpha, :][:, batch_beta]) / reg) *
@@ -555,25 +554,25 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha,
 
 def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr):
     r'''
-    Compute the sgd algorithm to solve the regularized discrete measures
-        optimal transport dual problem
+    Compute the sgd algorithm to solve the regularized discrete measures optimal transport dual problem
 
     The function solves the following optimization problem:
 
     .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F +
+        \mathrm{reg} \cdot\Omega(\gamma)
 
-        s.t. \gamma 1 = a
+        s.t. \ \gamma \mathbf{1} = \mathbf{a}
 
-             \gamma^T 1= b
+             \gamma^T \mathbf{1} = \mathbf{b}
 
              \gamma \geq 0
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
     - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-    - a and b are source and target weights (sum to 1)
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1)
 
     Parameters
     ----------
@@ -632,9 +631,7 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr):
 
     References
     ----------
-    [Seguy et al., 2018] :
-        International Conference on Learning Representation (2018),
-          arXiv preprint arxiv:1711.02283.
+    .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018)
     '''
 
     n_source = np.shape(M)[0]
@@ -657,25 +654,25 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr):
 def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
                         log=False):
     r'''
-    Compute the transportation matrix to solve the regularized discrete measures
-        optimal transport dual problem
+    Compute the transportation matrix to solve the regularized discrete measures optimal transport dual problem
 
     The function solves the following optimization problem:
 
     .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F +
+        \mathrm{reg} \cdot\Omega(\gamma)
 
-        s.t. \gamma 1 = a
+        s.t. \ \gamma \mathbf{1} = \mathbf{a}
 
-             \gamma^T 1= b
+             \gamma^T \mathbf{1} = \mathbf{b}
 
              \gamma \geq 0
 
     Where :
 
-    - M is the (ns,nt) metric cost matrix
-    - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-    - a and b are source and target weights (sum to 1)
+    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1)
 
     Parameters
     ----------
@@ -736,10 +733,7 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
 
     References
     ----------
-
-    [Seguy et al., 2018] :
-        International Conference on Learning Representation (2018),
-          arXiv preprint arxiv:1711.02283.
+    .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018)
     '''
 
     opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg, batch_size,