From 9c6ac880d426b7577918b0c77bd74b3b01930ef6 Mon Sep 17 00:00:00 2001 From: ncassereau-idris <84033440+ncassereau-idris@users.noreply.github.com> Date: Wed, 3 Nov 2021 17:29:16 +0100 Subject: [MRG] Docs updates (#298) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bregman docs * sliced docs * docs partial * unbalanced docs * stochastic docs * plot docs * datasets docs * utils docs * dr docs * dr docs corrected * smooth docs * docs da * pep8 * docs gromov * more space after min and argmin * docs lp * bregman docs * bregman docs mistake corrected * pep8 Co-authored-by: Rémi Flamary --- ot/bregman.py | 236 ++++++++++++++------------ ot/da.py | 499 ++++++++++++++++++++++++++++++------------------------ ot/datasets.py | 12 +- ot/dr.py | 40 +++-- ot/gromov.py | 35 ++-- ot/lp/__init__.py | 100 ++++++----- ot/optim.py | 8 +- ot/partial.py | 283 +++++++++++++++++-------------- ot/plot.py | 10 +- ot/sliced.py | 7 +- ot/smooth.py | 179 ++++++++++++-------- ot/stochastic.py | 192 ++++++++++----------- ot/unbalanced.py | 206 +++++++++++----------- ot/utils.py | 118 ++++++------- 14 files changed, 1048 insertions(+), 877 deletions(-) diff --git a/ot/bregman.py b/ot/bregman.py index 786f151..cce52e2 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -33,7 +33,8 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -45,9 +46,9 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -70,7 +71,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, version of the sinkhorn :py:func:`ot.bregman.screenkhorn` aim at providing a fast approximation of the Sinkhorn problem. For use of GPU and gradient computation with small number of iterations we strongly recommend the - :any:`ot.bregman.sinkhorn_log` solver that will no need to check for + :py:func:`ot.bregman.sinkhorn_log` solver that will no need to check for numerical problems. @@ -189,7 +190,8 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. 
\ \gamma \mathbf{1} &= \mathbf{a} @@ -201,9 +203,9 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -217,17 +219,17 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, By default and when using a regularization parameter that is not too small the default sinkhorn solver should be enough. If you need to use a small regularization to get sharper OT matrices, you should use the - :any:`ot.bregman.sinkhorn_log` solver that will avoid numerical + :py:func:`ot.bregman.sinkhorn_log` solver that will avoid numerical errors. This last solver can be very slow in practice and might not even converge to a reasonable OT matrix in a finite time. This is why - :any:`ot.bregman.sinkhorn_epsilon_scaling` that relies on iterating the value + :py:func:`ot.bregman.sinkhorn_epsilon_scaling` that relies on iterating the value of the regularization (and using warm start) sometimes leads to better solutions. Note that the greedy version of the sinkhorn - :any:`ot.bregman.greenkhorn` can also lead to a speedup and the screening - version of the sinkhorn :any:`ot.bregman.screenkhorn` aim a providing a + :py:func:`ot.bregman.greenkhorn` can also lead to a speedup and the screening + version of the sinkhorn :py:func:`ot.bregman.screenkhorn` aim a providing a fast approximation of the Sinkhorn problem. For use of GPU and gradient computation with small number of iterations we strongly recommend the - :any:`ot.bregman.sinkhorn_log` solver that will no need to check for + :py:func:`ot.bregman.sinkhorn_log` solver that will no need to check for numerical problems. Parameters @@ -301,15 +303,15 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, divergences. In The 22nd International Conference on Artificial Intelligence and Statistics (pp. 2681-2690). PMLR. + See Also -------- ot.lp.emd : Unregularized OT ot.optim.cg : General regularized OT ot.bregman.sinkhorn_knopp : Classic Sinkhorn :ref:`[2] ` ot.bregman.greenkhorn : Greenkhorn :ref:`[21] ` - ot.bregman.sinkhorn_stabilized: Stabilized sinkhorn :ref:`[9] ` - :ref:`[10] ` - + ot.bregman.sinkhorn_stabilized: Stabilized sinkhorn + :ref:`[9] ` :ref:`[10] ` """ M, a, b = list_to_array(M, a, b) @@ -362,7 +364,8 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. 
\ \gamma \mathbf{1} &= \mathbf{a} @@ -373,9 +376,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[2] ` @@ -543,7 +546,8 @@ def sinkhorn_log(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -553,12 +557,13 @@ def sinkhorn_log(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, where : - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\Omega` is the entropic regularization term + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix - scaling algorithm :ref:`[2] ` with the - implementation from :ref:`[34] ` + scaling algorithm :ref:`[2] ` with the + implementation from :ref:`[34] ` Parameters @@ -744,7 +749,8 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -755,9 +761,9 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) Parameters @@ -903,7 +909,8 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. 
\ \gamma \mathbf{1} &= \mathbf{a} @@ -914,9 +921,9 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target - weights (histograms, both sum to 1) + weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix @@ -1145,20 +1152,24 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, Solve the entropic regularization optimal transport problem with log stabilization and epsilon scaling. The function solves the following optimization problem: + .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 + where : + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights - (histograms, both sum to 1) + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (histograms, both sum to 1) + The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[2] ` but with the log stabilization @@ -1340,17 +1351,17 @@ def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000, The function solves the following optimization problem: .. math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - - :math:`OT_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn`) - if `method` is `sinkhorn` or `sinkhorn_stabilized` or `sinkhorn_log`. + - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein + distance (see :py:func:`ot.bregman.sinkhorn`) + if `method` is `sinkhorn` or `sinkhorn_stabilized` or `sinkhorn_log`. - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[3] ` @@ -1424,16 +1435,16 @@ def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000, The function solves the following optimization problem: .. 
math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance - (see :py:func:`ot.bregman.sinkhorn`) + (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[3]`. @@ -1598,16 +1609,16 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000, The function solves the following optimization problem: .. math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn`) + distance (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[3] ` @@ -1736,24 +1747,24 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 The function solves the following optimization problem: .. math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`S_{reg}(\cdot,\cdot)` is the debiased Sinkhorn divergence - (see :py:func:`ot.bregman.emirical_sinkhorn_divergence`) + (see :py:func:`ot.bregman.empirical_sinkhorn_divergence`) - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` + :math:`\mathbf{A}` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + the cost matrix for OT The algorithm used for solving the problem is the debiased Sinkhorn - algorithm as proposed in :ref:`[37] ` + algorithm as proposed in :ref:`[37] ` Parameters ---------- A : array-like, shape (dim, n_hists) - `n_hists` training distributions :math:`a_i` of size `dim` + `n_hists` training distributions :math:`\mathbf{a}_i` of size `dim` M : array-like, shape (dim, dim) loss matrix for OT reg : float @@ -1761,7 +1772,7 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 method : str (optional) method used for the solver either 'sinkhorn' or 'sinkhorn_log' weights : array-like, shape (n_hists,) - Weights of each histogram :math:`a_i` on the simplex (barycentric coodinates) + Weights of each histogram :math:`\mathbf{a}_i` on the simplex (barycentric coodinates) numItermax : int, optional Max number of iterations stopThr : float, optional @@ -1774,7 +1785,6 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 if True, raises a warning if the algorithm doesn't convergence. 
- Returns ------- a : (dim,) array-like @@ -1782,12 +1792,12 @@ def barycenter_debiased(A, M, reg, weights=None, method="sinkhorn", numItermax=1 log : dict log dictionary return only if log==True in parameters - .. _references-sinkhorn-debiased: - References - ---------- - .. [37] Janati, H., Cuturi, M., Gramfort, A. Proceedings of the 37th International - Conference on Machine Learning, PMLR 119:4692-4701, 2020 + .. _references-barycenter-debiased: + References + ---------- + .. [37] Janati, H., Cuturi, M., Gramfort, A. Proceedings of the 37th International + Conference on Machine Learning, PMLR 119:4692-4701, 2020 """ if method.lower() == 'sinkhorn': @@ -1934,20 +1944,20 @@ def _barycenter_debiased_log(A, M, reg, weights=None, numItermax=1000, def convolutional_barycenter2d(A, reg, weights=None, method="sinkhorn", numItermax=10000, stopThr=1e-4, verbose=False, log=False, warn=True, **kwargs): - r"""Compute the entropic regularized wasserstein barycenter of distributions A - where A is a collection of 2D images. + r"""Compute the entropic regularized wasserstein barycenter of distributions :math:`\mathbf{A}` + where :math:`\mathbf{A}` is a collection of 2D images. The function solves the following optimization problem: .. math:: - \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn`) + distance (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{a}_i` are training distributions (2D images) in the mast two dimensions - of matrix :math:`\mathbf{A}` + of matrix :math:`\mathbf{A}` - `reg` is the regularization strength scalar value The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm @@ -2166,24 +2176,24 @@ def convolutional_barycenter2d_debiased(A, reg, weights=None, method="sinkhorn", numItermax=10000, stopThr=1e-3, verbose=False, log=False, warn=True, **kwargs): - r"""Compute the debiased sinkhorn barycenter of distributions A - where A is a collection of 2D images. + r"""Compute the debiased sinkhorn barycenter of distributions :math:`\mathbf{A}` + where :math:`\mathbf{A}` is a collection of 2D images. The function solves the following optimization problem: .. math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i S_{reg}(\mathbf{a},\mathbf{a}_i) where : - :math:`S_{reg}(\cdot,\cdot)` is the debiased entropic regularized Wasserstein - distance (see :py:func:`ot.bregman.sinkhorn_debiased`) + distance (see :py:func:`ot.bregman.barycenter_debiased`) - :math:`\mathbf{a}_i` are training distributions (2D images) in the mast two - dimensions of matrix :math:`\mathbf{A}` + dimensions of matrix :math:`\mathbf{A}` - `reg` is the regularization strength scalar value The algorithm used for solving the problem is the debiased Sinkhorn scaling - algorithm as proposed in :ref:`[37] ` + algorithm as proposed in :ref:`[37] ` Parameters ---------- @@ -2217,7 +2227,7 @@ def convolutional_barycenter2d_debiased(A, reg, weights=None, method="sinkhorn", log dictionary return only if log==True in parameters - .. _references-sinkhorn-debiased: + .. _references-convolutional-barycenter2d-debiased: References ---------- @@ -2406,23 +2416,25 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, .. 
math:: - \mathbf{h} = \mathop{\arg \min}_\mathbf{h} (1- \alpha) W_{\mathbf{M}, \mathrm{reg}}(\mathbf{a},\mathbf{Dh})+\alpha W_{\mathbf{M_0},\mathrm{reg}_0}(\mathbf{h}_0,\mathbf{h}) + \mathbf{h} = \mathop{\arg \min}_\mathbf{h} \quad + (1 - \alpha) W_{\mathbf{M}, \mathrm{reg}}(\mathbf{a}, \mathbf{Dh}) + + \alpha W_{\mathbf{M_0}, \mathrm{reg}_0}(\mathbf{h}_0, \mathbf{h}) where : - :math:`W_{M,reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance - with M loss matrix (see :py:func:`ot.bregman.sinkhorn`) + with :math:`\mathbf{M}` loss matrix (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{D}` is a dictionary of `n_atoms` atoms of dimension `dim_a`, - its expected shape is `(dim_a, n_atoms)` + its expected shape is `(dim_a, n_atoms)` - :math:`\mathbf{h}` is the estimated unmixing of dimension `n_atoms` - :math:`\mathbf{a}` is an observed distribution of dimension `dim_a` - :math:`\mathbf{h}_0` is a prior on :math:`\mathbf{h}` of dimension `dim_prior` - `reg` and :math:`\mathbf{M}` are respectively the regularization term and the - cost matrix (`dim_a`, `dim_a`) for OT data fitting + cost matrix (`dim_a`, `dim_a`) for OT data fitting - `reg`:math:`_0` and :math:`\mathbf{M_0}` are respectively the regularization - term and the cost matrix (`dim_prior`, `n_atoms`) regularization - - :math:`\\alpha` weight data fitting and regularization + term and the cost matrix (`dim_prior`, `n_atoms`) regularization + - :math:`\alpha` weight data fitting and regularization The optimization problem is solved following the algorithm described in :ref:`[4] ` @@ -2535,7 +2547,7 @@ def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, .. math:: - \mathbf{h} = \mathop{\arg \min}_{\mathbf{h}} \sum_{k=1}^{K} \lambda_k + \mathbf{h} = \mathop{\arg \min}_{\mathbf{h}} \quad \sum_{k=1}^{K} \lambda_k W_{reg}((\mathbf{D}_2^{(k)} \mathbf{h})^T, \mathbf{a}) s.t. \ \forall k, \mathbf{D}_1^{(k)} \gamma_k \mathbf{1}_n= \mathbf{h} @@ -2544,15 +2556,15 @@ def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, - :math:`\lambda_k` is the weight of `k`-th source domain - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance - (see :py:func:`ot.bregman.sinkhorn`) + (see :py:func:`ot.bregman.sinkhorn`) - :math:`\mathbf{D}_2^{(k)}` is a matrix of weights related to `k`-th source domain - defined as in [p. 5, :ref:`27 `], its expected shape - is :math:`(n_k, C)` where :math:`n_k` is the number of elements in the `k`-th source - domain and `C` is the number of classes + defined as in [p. 5, :ref:`27 `], its expected shape + is :math:`(n_k, C)` where :math:`n_k` is the number of elements in the `k`-th source + domain and `C` is the number of classes - :math:`\mathbf{h}` is a vector of estimated proportions in the target domain of size `C` - :math:`\mathbf{a}` is a uniform vector of weights in the target domain of size `n` - :math:`\mathbf{D}_1^{(k)}` is a matrix of class assignments defined as in - [p. 5, :ref:`27 `], its expected shape is :math:`(n_k, C)` + [p. 5, :ref:`27 `], its expected shape is :math:`(n_k, C)` The problem consist in solving a Wasserstein barycenter problem to estimate the proportions :math:`\mathbf{h}` in the target domain. @@ -2714,18 +2726,19 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', The function solves the following optimization problem: .. 
math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \ \gamma \mathbf{1} &= a + s.t. \ \gamma \mathbf{1} &= \mathbf{a} - \gamma^T \mathbf{1} &= b + \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 where : - :math:`\mathbf{M}` is the (`n_samples_a`, `n_samples_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) @@ -2900,18 +2913,19 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \ \gamma \mathbf{1} &= a + s.t. \ \gamma \mathbf{1} &= \mathbf{a} - \gamma^T \mathbf{1} &= b + \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 where : - :math:`\mathbf{M}` is the (`n_samples_a`, `n_samples_b`) metric cost matrix - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) @@ -3055,18 +3069,21 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli .. math:: - W &= \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma) + W &= \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - W_a &= \min_{\gamma_a} <\gamma_a, \mathbf{M_a}>_F + \mathrm{reg} \cdot\Omega(\gamma_a) + W_a &= \min_{\gamma_a} \quad \langle \gamma_a, \mathbf{M_a} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma_a) - W_b &= \min_{\gamma_b} <\gamma_b, \mathbf{M_b}>_F + \mathrm{reg} \cdot\Omega(\gamma_b) + W_b &= \min_{\gamma_b} \quad \langle \gamma_b, \mathbf{M_b} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma_b) S &= W - \frac{W_a + W_b}{2} .. math:: - s.t. \ \gamma \mathbf{1} &= a + s.t. \ \gamma \mathbf{1} &= \mathbf{a} - \gamma^T \mathbf{1} &= b + \gamma^T \mathbf{1} &= \mathbf{b} \gamma &\geq 0 @@ -3084,10 +3101,10 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli where : - :math:`\mathbf{M}` (resp. :math:`\mathbf{M_a}`, :math:`\mathbf{M_b}`) - is the (`n_samples_a`, `n_samples_b`) metric cost matrix - (resp (`n_samples_a, n_samples_a`) and (`n_samples_b`, `n_samples_b`)) + is the (`n_samples_a`, `n_samples_b`) metric cost matrix + (resp (`n_samples_a, n_samples_a`) and (`n_samples_b`, `n_samples_b`)) - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) @@ -3198,7 +3215,10 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, .. 
math:: - (\mathbf{u}, \mathbf{v}) = \mathop{\arg \min}_{\mathbf{u}, \mathbf{v}} \ \mathbf{1}_{ns}^T \mathbf{B}(\mathbf{u}, \mathbf{v}) \mathbf{1}_{nt} - <\kappa \mathbf{u}, \mathbf{a}> - <\mathbf{v} / \kappa, \mathbf{b}> + (\mathbf{u}, \mathbf{v}) = \mathop{\arg \min}_{\mathbf{u}, \mathbf{v}} \quad + \mathbf{1}_{ns}^T \mathbf{B}(\mathbf{u}, \mathbf{v}) \mathbf{1}_{nt} - + \langle \kappa \mathbf{u}, \mathbf{a} \rangle - + \langle \frac{1}{\kappa} \mathbf{v}, \mathbf{b} \rangle where: @@ -3249,13 +3269,15 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, If `True`, display informations about the cardinals of the active sets and the parameters kappa and epsilon - Dependency - ---------- - To gain more efficiency, screenkhorn needs to call the "Bottleneck" - package (https://pypi.org/project/Bottleneck/) - in the screening pre-processing step. If Bottleneck isn't installed, - the following error message appears: - "Bottleneck module doesn't exist. Install it from https://pypi.org/project/Bottleneck/" + + .. admonition:: Dependency + + To gain more efficiency, :py:func:`ot.bregman.screenkhorn` needs to call the "Bottleneck" + package (https://pypi.org/project/Bottleneck/) in the screening pre-processing step. + + If Bottleneck isn't installed, the following error message appears: + + "Bottleneck module doesn't exist. Install it from https://pypi.org/project/Bottleneck/" Returns diff --git a/ot/da.py b/ot/da.py index cdc747c..4fd97df 100644 --- a/ot/da.py +++ b/ot/da.py @@ -33,27 +33,29 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma) - + \eta \Omega_g(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot \Omega_e(\gamma) + \eta \ \Omega_g(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} + + \gamma^T \mathbf{1} = \mathbf{b} + + \gamma \geq 0 - \gamma^T 1= b - \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e (\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\Omega_g` is the group lasso regularization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1` - where :math:`\mathcal{I}_c` are the index of samples from class c + where :math:`\mathcal{I}_c` are the index of samples from class `c` in the source domain. - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the generalized conditional - gradient as proposed in [5]_ [7]_ + gradient as proposed in :ref:`[5, 7] `. Parameters @@ -84,19 +86,20 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-sinkhorn-lpl1-mm: References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. 
arXiv preprint arXiv:1510.06567. @@ -144,27 +147,29 @@ def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10, The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ - \eta \Omega_g(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot \Omega_e(\gamma) + \eta \ \Omega_g(\gamma) + + s.t. \ \gamma \mathbf{1} = \mathbf{a} - s.t. \gamma 1 = a + \gamma^T \mathbf{1} = \mathbf{b} + + \gamma \geq 0 - \gamma^T 1= b - \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - :math:`\Omega_g` is the group lasso regulaization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^2` where :math:`\mathcal{I}_c` are the index of samples from class - c in the source domain. - - a and b are source and target weights (sum to 1) + `c` in the source domain. + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the generalised conditional - gradient as proposed in [5]_ [7]_ + gradient as proposed in :ref:`[5, 7] `. Parameters @@ -195,18 +200,19 @@ def sinkhorn_l1l2_gl(a, labels_a, b, M, reg, eta=0.1, numItermax=10, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-sinkhorn-l1l2-gl: References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. @@ -245,38 +251,40 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, verbose2=False, numItermax=100, numInnerItermax=10, stopInnerThr=1e-6, stopThr=1e-5, log=False, **kwargs): - r"""Joint OT and linear mapping estimation as proposed in [8] + r"""Joint OT and linear mapping estimation as proposed in + :ref:`[8] `. The function solves the following optimization problem: .. math:: - \min_{\gamma,L}\quad \|L(X_s) -n_s\gamma X_t\|^2_F + - \mu<\gamma,M>_F + \eta \|L -I\|^2_F + \min_{\gamma,L}\quad \|L(\mathbf{X_s}) - n_s\gamma \mathbf{X_t} \|^2_F + + \mu \langle \gamma, \mathbf{M} \rangle_F + \eta \|L - \mathbf{I}\|^2_F + + s.t. \ \gamma \mathbf{1} = \mathbf{a} - s.t. 
\gamma 1 = a + \gamma^T \mathbf{1} = \mathbf{b} - \gamma^T 1= b + \gamma \geq 0 - \gamma\geq 0 where : - - M is the (ns,nt) squared euclidean cost matrix between samples in - Xs and Xt (scaled by ns) - - :math:`L` is a dxd linear operator that approximates the barycentric + - :math:`\mathbf{M}` is the (`ns`, `nt`) squared euclidean cost matrix between samples in + :math:`\mathbf{X_s}` and :math:`\mathbf{X_t}` (scaled by :math:`n_s`) + - :math:`L` is a :math:`d\times d` linear operator that approximates the barycentric mapping - - :math:`I` is the identity matrix (neutral linear mapping) - - a and b are uniform source and target weights + - :math:`\mathbf{I}` is the identity matrix (neutral linear mapping) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are uniform source and target weights The problem consist in solving jointly an optimal transport matrix :math:`\gamma` and a linear mapping that fits the barycentric mapping - :math:`n_s\gamma X_t`. + :math:`n_s\gamma \mathbf{X_t}`. One can also estimate a mapping with constant bias (see supplementary - material of [8]) using the bias optional argument. + material of :ref:`[8] `) using the bias optional argument. The algorithm used for solving the problem is the block coordinate - descent that alternates between updates of G (using conditionnal gradient) - and the update of L using a classical least square solver. + descent that alternates between updates of :math:`\mathbf{G}` (using conditionnal gradient) + and the update of :math:`\mathbf{L}` using a classical least square solver. Parameters @@ -307,17 +315,17 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters - L : (d x d) ndarray - Linear mapping matrix (d+1 x d if bias) + L : (d, d) ndarray + Linear mapping matrix ((:math:`d+1`, `d`) if bias) log : dict log dictionary return only if log==True in parameters + .. _references-joint-OT-mapping-linear: References ---------- - .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016. @@ -434,37 +442,41 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', numItermax=100, numInnerItermax=10, stopInnerThr=1e-6, stopThr=1e-5, log=False, **kwargs): - r"""Joint OT and nonlinear mapping estimation with kernels as proposed in [8] + r"""Joint OT and nonlinear mapping estimation with kernels as proposed in + :ref:`[8] `. The function solves the following optimization problem: .. math:: - \min_{\gamma,L\in\mathcal{H}}\quad \|L(X_s) - - n_s\gamma X_t\|^2_F + \mu<\gamma,M>_F + \eta \|L\|^2_\mathcal{H} + \min_{\gamma, L\in\mathcal{H}}\quad \|L(\mathbf{X_s}) - + n_s\gamma \mathbf{X_t}\|^2_F + \mu \langle \gamma, \mathbf{M} \rangle_F + + \eta \|L\|^2_\mathcal{H} + + s.t. \ \gamma \mathbf{1} = \mathbf{a} + + \gamma^T \mathbf{1} = \mathbf{b} - s.t. 
\gamma 1 = a + \gamma \geq 0 - \gamma^T 1= b - \gamma\geq 0 where : - - M is the (ns,nt) squared euclidean cost matrix between samples in - Xs and Xt (scaled by ns) - - :math:`L` is a ns x d linear operator on a kernel matrix that + - :math:`\mathbf{M}` is the (`ns`, `nt`) squared euclidean cost matrix between samples in + :math:`\mathbf{X_s}` and :math:`\mathbf{X_t}` (scaled by :math:`n_s`) + - :math:`L` is a :math:`n_s \times d` linear operator on a kernel matrix that approximates the barycentric mapping - - a and b are uniform source and target weights + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are uniform source and target weights The problem consist in solving jointly an optimal transport matrix :math:`\gamma` and the nonlinear mapping that fits the barycentric mapping - :math:`n_s\gamma X_t`. + :math:`n_s\gamma \mathbf{X_t}`. One can also estimate a mapping with constant bias (see supplementary - material of [8]) using the bias optional argument. + material of :ref:`[8] `) using the bias optional argument. The algorithm used for solving the problem is the block coordinate - descent that alternates between updates of G (using conditionnal gradient) - and the update of L using a classical kernel least square solver. + descent that alternates between updates of :math:`\mathbf{G}` (using conditionnal gradient) + and the update of :math:`\mathbf{L}` using a classical kernel least square solver. Parameters @@ -478,7 +490,7 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', eta : float, optional Regularization term for the linear mapping L (>0) kerneltype : str,optional - kernel used by calling function ot.utils.kernel (gaussian by default) + kernel used by calling function :py:func:`ot.utils.kernel` (gaussian by default) sigma : float, optional Gaussian kernel bandwidth. bias : bool,optional @@ -501,17 +513,17 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters - L : (ns x d) ndarray - Nonlinear mapping matrix (ns+1 x d if bias) + L : (ns, d) ndarray + Nonlinear mapping matrix ((:math:`n_s+1`, `d`) if bias) log : dict log dictionary return only if log==True in parameters + .. _references-joint-OT-mapping-kernel: References ---------- - .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016. @@ -645,26 +657,27 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, wt=None, bias=True, log=False): - r""" return OT linear operator between samples + r"""Return OT linear operator between samples. The function estimates the optimal linear operator that aligns the two empirical distributions. This is equivalent to estimating the closed - form mapping between two Gaussian distributions :math:`N(\mu_s,\Sigma_s)` - and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in remark - 2.29 in [15]. + form mapping between two Gaussian distributions :math:`\mathcal{N}(\mu_s,\Sigma_s)` + and :math:`\mathcal{N}(\mu_t,\Sigma_t)` as proposed in + :ref:`[14] ` and discussed in remark 2.29 in + :ref:`[15] `. The linear operator from source to target :math:`M` .. math:: - M(x)=Ax+b + M(\mathbf{x})= \mathbf{A} \mathbf{x} + \mathbf{b} where : .. 
math:: - A=\Sigma_s^{-1/2}(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2})^{1/2} + \mathbf{A} &= \Sigma_s^{-1/2} \left(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2} \right)^{1/2} \Sigma_s^{-1/2} - .. math:: - b=\mu_t-A\mu_s + + \mathbf{b} &= \mu_t - \mathbf{A} \mu_s Parameters ---------- @@ -673,35 +686,35 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, xt : np.ndarray (nt,d) samples in the target domain reg : float,optional - regularization added to the diagonals of convariances (>0) + regularization added to the diagonals of covariances (>0) ws : np.ndarray (ns,1), optional weights for the source samples wt : np.ndarray (ns,1), optional weights for the target samples bias: boolean, optional - estimate bias b else b=0 (default:True) + estimate bias :math:`\mathbf{b}` else :math:`\mathbf{b} = 0` (default:True) log : bool, optional record log if True Returns ------- - A : (d x d) ndarray + A : (d, d) ndarray Linear operator - b : (1 x d) ndarray + b : (1, d) ndarray bias log : dict log dictionary return only if log==True in parameters + .. _references-OT-mapping-linear: References ---------- - .. [14] Knott, M. and Smith, C. S. "On the optimal mapping of distributions", Journal of Optimization Theory and Applications Vol 43, 1984 - .. [15] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + .. [15] Peyré, G., & Cuturi, M. (2017). "Computational Optimal Transport", 2018. @@ -754,24 +767,34 @@ def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, al r"""Solve the optimal transport problem (OT) with Laplacian regularization .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + eta\Omega_\alpha(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \eta \cdot \Omega_\alpha(\gamma) - s.t.\ \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} - \gamma\geq 0 + \gamma \geq 0 where: - - a and b are source and target weights (sum to 1) - - xs and xt are source and target samples - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) + - :math:`\mathbf{x_s}` and :math:`\mathbf{x_t}` are source and target samples + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega_\alpha` is the Laplacian regularization term - :math:`\Omega_\alpha = (1-\alpha)/n_s^2\sum_{i,j}S^s_{i,j}\|T(\mathbf{x}^s_i)-T(\mathbf{x}^s_j)\|^2+\alpha/n_t^2\sum_{i,j}S^t_{i,j}^'\|T(\mathbf{x}^t_i)-T(\mathbf{x}^t_j)\|^2` - with :math:`S^s_{i,j}, S^t_{i,j}` denoting source and target similarity matrices and :math:`T(\cdot)` being a barycentric mapping - The algorithm used for solving the problem is the conditional gradient algorithm as proposed in [5]. + .. math:: + \Omega_\alpha = \frac{1 - \alpha}{n_s^2} \sum_{i,j} + \mathbf{S^s}_{i,j} \|T(\mathbf{x}^s_i) - T(\mathbf{x}^s_j) \|^2 + + \frac{\alpha}{n_t^2} \sum_{i,j} + \mathbf{S^t}_{i,j} \|T(\mathbf{x}^t_i) - T(\mathbf{x}^t_j) \|^2 + + + with :math:`\mathbf{S^s}_{i,j}, \mathbf{S^t}_{i,j}` denoting source and target similarity + matrices and :math:`T(\cdot)` being a barycentric mapping. + + The algorithm used for solving the problem is the conditional gradient algorithm as proposed in + :ref:`[5] `. 
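[Editor's note — hedged usage sketch, not part of this patch: the hunk above documents the Laplacian-regularized solver `ot.da.emd_laplace` solved by conditional gradient. The snippet below is only an illustration of how such a call typically looks; the sample sizes, the `eta`/`alpha` values and the kNN `sim_param` are arbitrary choices, not values prescribed by the patch.]

# Minimal sketch (illustrative): Laplacian-regularized OT between two small
# point clouds with ot.da.emd_laplace, matching the signature shown above.
import numpy as np
import ot

rng = np.random.RandomState(0)
ns, nt = 20, 30
xs = rng.randn(ns, 2)             # source samples
xt = rng.randn(nt, 2) + 1.0       # shifted target samples

a, b = ot.unif(ns), ot.unif(nt)   # uniform source/target weights
M = ot.dist(xs, xt)               # squared Euclidean cost matrix, shape (ns, nt)

# 'knn' similarity graphs with 5 neighbours; eta/alpha are illustrative here
G = ot.da.emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=5,
                      reg='pos', eta=1.0, alpha=0.5)
# G is the (ns, nt) coupling used for the barycentric mapping T(.)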
Parameters ---------- @@ -811,22 +834,23 @@ def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, al Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-emd-laplace: References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE - Transactions on Pattern Analysis and Machine Intelligence , + Transactions on Pattern Analysis and Machine Intelligence, vol.PP, no.99, pp.1-1 + .. [30] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," - in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. See Also -------- @@ -882,7 +906,7 @@ def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, al def distribution_estimation_uniform(X): - """estimates a uniform distribution from an array of samples X + """estimates a uniform distribution from an array of samples :math:`\mathbf{X}` Parameters ---------- @@ -892,7 +916,7 @@ def distribution_estimation_uniform(X): Returns ------- mu : array-like, shape (n_samples,) - The uniform distribution estimated from X + The uniform distribution estimated from :math:`\mathbf{X}` """ return unif(X.shape[0]) @@ -902,32 +926,32 @@ class BaseTransport(BaseEstimator): """Base class for OTDA objects - Notes - ----- - All estimators should specify all the parameters that can be set - at the class level in their ``__init__`` as explicit keyword - arguments (no ``*args`` or ``**kwargs``). + .. note:: + All estimators should specify all the parameters that can be set + at the class level in their ``__init__`` as explicit keyword + arguments (no ``*args`` or ``**kwargs``). - the fit method should: + The fit method should: - estimate a cost matrix and store it in a `cost_` attribute - - estimate a coupling matrix and store it in a `coupling_` - attribute + - estimate a coupling matrix and store it in a `coupling_` attribute - estimate distributions from source and target data and store them in - mu_s and mu_t attributes - - store Xs and Xt in attributes to be used later on in transform and - inverse_transform methods + `mu_s` and `mu_t` attributes + - store `Xs` and `Xt` in attributes to be used later on in `transform` and + `inverse_transform` methods + + `transform` method should always get as input a `Xs` parameter - transform method should always get as input a Xs parameter - inverse_transform method should always get as input a Xt parameter + `inverse_transform` method should always get as input a `Xt` parameter - transform_labels method should always get as input a ys parameter - inverse_transform_labels method should always get as input a yt parameter + `transform_labels` method should always get as input a `ys` parameter + + `inverse_transform_labels` method should always get as input a `yt` parameter """ def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -938,8 +962,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The training input samples. 
yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -987,8 +1011,8 @@ class BaseTransport(BaseEstimator): def fit_transform(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) and transports source samples Xs onto target - ones Xt + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` + and transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -999,8 +1023,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1014,7 +1038,7 @@ class BaseTransport(BaseEstimator): return self.fit(Xs, ys, Xt, yt).transform(Xs, ys, Xt, yt) def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -1025,8 +1049,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The target input samples. yt : array-like, shape (n_target_samples,) - The class labels for target. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels for target. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1081,7 +1105,8 @@ class BaseTransport(BaseEstimator): return transp_Xs def transform_labels(self, ys=None): - """Propagate source labels ys to obtain estimated target labels as in [27] + """Propagate source labels :math:`\mathbf{y_s}` to obtain estimated target labels as in + :ref:`[27] `. Parameters ---------- @@ -1093,9 +1118,10 @@ class BaseTransport(BaseEstimator): transp_ys : array-like, shape (n_target_samples, nb_classes) Estimated soft target labels. + + .. _references-basetransport-transform-labels: References ---------- - .. [27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia "Optimal transport for multi-source domain adaptation under target shift", International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. @@ -1126,7 +1152,7 @@ class BaseTransport(BaseEstimator): def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports target samples Xt onto source samples Xs + """Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}` Parameters ---------- @@ -1137,8 +1163,8 @@ class BaseTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The target input samples. yt : array-like, shape (n_target_samples,) - The target class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The target class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. 
Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1192,7 +1218,8 @@ class BaseTransport(BaseEstimator): return transp_Xt def inverse_transform_labels(self, yt=None): - """Propagate target labels yt to obtain estimated source labels ys + """Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels + :math:`\mathbf{y_s}` Parameters ---------- @@ -1232,35 +1259,37 @@ class LinearTransport(BaseTransport): The function estimates the optimal linear operator that aligns the two empirical distributions. This is equivalent to estimating the closed - form mapping between two Gaussian distributions :math:`N(\mu_s,\Sigma_s)` - and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in - remark 2.29 in [15]. + form mapping between two Gaussian distributions :math:`\mathcal{N}(\mu_s,\Sigma_s)` + and :math:`\mathcal{N}(\mu_t,\Sigma_t)` as proposed in + :ref:`[14] ` and discussed in remark 2.29 in + :ref:`[15] `. The linear operator from source to target :math:`M` .. math:: - M(x)=Ax+b + M(\mathbf{x})= \mathbf{A} \mathbf{x} + \mathbf{b} where : .. math:: - A=\Sigma_s^{-1/2}(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2})^{1/2} + \mathbf{A} &= \Sigma_s^{-1/2} \left(\Sigma_s^{1/2}\Sigma_t\Sigma_s^{1/2} \right)^{1/2} \Sigma_s^{-1/2} - .. math:: - b=\mu_t-A\mu_s + + \mathbf{b} &= \mu_t - \mathbf{A} \mu_s Parameters ---------- reg : float,optional - regularization added to the daigonals of convariances (>0) + regularization added to the daigonals of covariances (>0) bias: boolean, optional - estimate bias b else b=0 (default:True) + estimate bias :math:`\mathbf{b}` else :math:`\mathbf{b} = 0` (default:True) log : bool, optional record log if True + + .. _references-lineartransport: References ---------- - .. [14] Knott, M. and Smith, C. S. "On the optimal mapping of distributions", Journal of Optimization Theory and Applications Vol 43, 1984 @@ -1279,7 +1308,7 @@ class LinearTransport(BaseTransport): def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1290,8 +1319,8 @@ class LinearTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1325,7 +1354,7 @@ class LinearTransport(BaseTransport): return self def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -1336,8 +1365,8 @@ class LinearTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. 
Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1358,7 +1387,7 @@ class LinearTransport(BaseTransport): def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports target samples Xt onto target samples Xs + """Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}` Parameters ---------- @@ -1369,8 +1398,8 @@ class LinearTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1392,7 +1421,7 @@ class LinearTransport(BaseTransport): class SinkhornTransport(BaseTransport): - """Domain Adapatation OT method based on Sinkhorn Algorithm + """Domain Adaptation OT method based on Sinkhorn Algorithm Parameters ---------- @@ -1400,7 +1429,7 @@ class SinkhornTransport(BaseTransport): Entropic regularization parameter max_iter : int, float, optional (default=1000) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged tol : float, optional (default=10e-9) The precision required to stop the optimization algorithm. verbose : bool, optional (default=False) @@ -1417,8 +1446,8 @@ class SinkhornTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. - limit_max: float, optional (defaul=np.infty) + "ferradans" which uses the method proposed in :ref:`[6] `. + limit_max: float, optional (default=np.infty) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an cost defined by this variable @@ -1428,16 +1457,20 @@ class SinkhornTransport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-sinkhorntransport: References ---------- .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -1461,7 +1494,7 @@ class SinkhornTransport(BaseTransport): def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1472,8 +1505,8 @@ class SinkhornTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. 
If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1504,7 +1537,7 @@ class SinkhornTransport(BaseTransport): class EMDTransport(BaseTransport): - """Domain Adapatation OT method based on Earth Mover's Distance + """Domain Adaptation OT method based on Earth Mover's Distance Parameters ---------- @@ -1520,7 +1553,7 @@ class EMDTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1534,14 +1567,16 @@ class EMDTransport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling + + .. _references-emdtransport: References ---------- .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, - "Optimal Transport for Domain Adaptation," in IEEE Transactions - on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + "Optimal Transport for Domain Adaptation," in IEEE Transactions + on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). - Regularized discrete optimal transport. SIAM Journal on Imaging - Sciences, 7(3), 1853-1882. + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, metric="sqeuclidean", norm=None, log=False, @@ -1558,7 +1593,7 @@ class EMDTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1569,8 +1604,8 @@ class EMDTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1597,8 +1632,7 @@ class EMDTransport(BaseTransport): class SinkhornLpl1Transport(BaseTransport): - - """Domain Adapatation OT method based on sinkhorn algorithm + + r"""Domain Adaptation OT method based on sinkhorn algorithm + LpL1 class regularization. 
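[Editor's note — hedged usage sketch, not part of this patch: the class whose docstring is edited just above, `ot.da.SinkhornLpl1Transport`, follows the fit/transform API of `BaseTransport` described earlier in this diff. The snippet below is an illustration only; the regularization values and data shapes are made up for the example.]

# Minimal sketch (illustrative): class-regularized Sinkhorn transport.
import numpy as np
import ot

rng = np.random.RandomState(0)
Xs = rng.randn(40, 2)                       # source samples
ys = rng.randint(0, 2, 40)                  # source class labels (needed for LpL1)
Xt = rng.randn(50, 2) + 1.0                 # target samples

otda = ot.da.SinkhornLpl1Transport(reg_e=1.0, reg_cl=0.1)  # entropic + LpL1 terms
otda.fit(Xs=Xs, ys=ys, Xt=Xt)               # estimates otda.coupling_
Xs_mapped = otda.transform(Xs=Xs)           # barycentric mapping onto the target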
Parameters @@ -1609,7 +1643,7 @@ class SinkhornLpl1Transport(BaseTransport): Class regularization parameter max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop log : bool, optional (default=False) @@ -1628,8 +1662,8 @@ class SinkhornLpl1Transport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. - limit_max: float, optional (defaul=np.infty) + "ferradans" which uses the method proposed in :ref:`[6] `. + limit_max: float, optional (default=np.infty) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit a cost defined by limit_max. @@ -1639,16 +1673,19 @@ class SinkhornLpl1Transport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling + + .. _references-sinkhornlpl1transport: References ---------- - .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -1675,7 +1712,7 @@ class SinkhornLpl1Transport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1686,8 +1723,8 @@ class SinkhornLpl1Transport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1719,13 +1756,14 @@ class SinkhornLpl1Transport(BaseTransport): class EMDLaplaceTransport(BaseTransport): - """Domain Adapatation OT method based on Earth Mover's Distance with Laplacian regularization + """Domain Adaptation OT method based on Earth Mover's Distance with Laplacian regularization Parameters ---------- reg_type : string optional (default='pos') Type of the regularization term: 'pos' and 'disp' for - regularization term defined in [2] and [6], respectively. + regularization term defined in :ref:`[2] ` and + :ref:`[6] `, respectively. reg_lap : float, optional (default=1) Laplacian regularization parameter reg_src : float, optional (default=0.5) @@ -1756,24 +1794,27 @@ class EMDLaplaceTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. 
+ "ferradans" which uses the method proposed in :ref:`[6] `. Attributes ---------- coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling + + .. _references-emdlaplacetransport: References ---------- .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," - in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). - Regularized discrete optimal transport. SIAM Journal on Imaging - Sciences, 7(3), 1853-1882. + Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. """ def __init__(self, reg_type='pos', reg_lap=1., reg_src=1., metric="sqeuclidean", @@ -1799,7 +1840,7 @@ class EMDLaplaceTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1810,8 +1851,8 @@ class EMDLaplaceTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1840,8 +1881,8 @@ class EMDLaplaceTransport(BaseTransport): class SinkhornL1l2Transport(BaseTransport): - """Domain Adapatation OT method based on sinkhorn algorithm + - l1l2 class regularization. + """Domain Adaptation OT method based on sinkhorn algorithm + + L1L2 class regularization. Parameters ---------- @@ -1851,7 +1892,7 @@ class SinkhornL1l2Transport(BaseTransport): Class regularization parameter max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop tol : float, optional (default=10e-9) @@ -1870,7 +1911,7 @@ class SinkhornL1l2Transport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1881,18 +1922,21 @@ class SinkhornL1l2Transport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-sinkhornl1l2transport: References ---------- - .. [1] N. Courty; R. Flamary; D. Tuia; A. 
Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -1919,7 +1963,7 @@ class SinkhornL1l2Transport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -1930,8 +1974,8 @@ class SinkhornL1l2Transport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -1973,7 +2017,7 @@ class MappingTransport(BaseEstimator): mu : float, optional (default=1) Weight for the linear OT loss (>0) eta : float, optional (default=0.001) - Regularization term for the linear mapping L (>0) + Regularization term for the linear mapping `L` (>0) bias : bool, optional (default=False) Estimate linear mapping with constant bias metric : string, optional (default="sqeuclidean") @@ -2004,17 +2048,20 @@ class MappingTransport(BaseEstimator): ---------- coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling - mapping_ : array-like, shape (n_features (+ 1), n_features) - (if bias) for kernel == linear + mapping_ : The associated mapping - array-like, shape (n_source_samples (+ 1), n_features) - (if bias) for kernel == gaussian + + - array-like, shape (`n_features` (+ 1), `n_features`), + (if bias) for kernel == linear + + - array-like, shape (`n_source_samples` (+ 1), `n_features`), + (if bias) for kernel == gaussian log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + References ---------- - .. [8] M. Perrot, N. Courty, R. Flamary, A. Habrard, "Mapping estimation for discrete optimal transport", Neural Information Processing Systems (NIPS), 2016. @@ -2042,7 +2089,8 @@ class MappingTransport(BaseEstimator): def fit(self, Xs=None, ys=None, Xt=None, yt=None): """Builds an optimal coupling and estimates the associated mapping - from source and target sets of samples (Xs, ys) and (Xt, yt) + from source and target sets of samples + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -2053,8 +2101,8 @@ class MappingTransport(BaseEstimator): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. 
Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2098,7 +2146,7 @@ class MappingTransport(BaseEstimator): return self def transform(self, Xs): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -2138,7 +2186,7 @@ class MappingTransport(BaseEstimator): class UnbalancedSinkhornTransport(BaseTransport): - """Domain Adapatation unbalanced OT method based on sinkhorn algorithm + """Domain Adaptation unbalanced OT method based on sinkhorn algorithm Parameters ---------- @@ -2151,7 +2199,7 @@ class UnbalancedSinkhornTransport(BaseTransport): 'sinkhorn_epsilon_scaling', see those function for specific parameters max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged tol : float, optional (default=10e-9) Stop threshold on error (inner sinkhorn solver) (>0) verbose : bool, optional (default=False) @@ -2168,7 +2216,7 @@ class UnbalancedSinkhornTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -2179,14 +2227,16 @@ class UnbalancedSinkhornTransport(BaseTransport): coupling_ : array-like, shape (n_source_samples, n_target_samples) The optimal coupling log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-unbalancedsinkhorntransport: References ---------- - .. [1] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). - Scaling algorithms for unbalanced transport problems. arXiv preprint - arXiv:1607.05816. + Scaling algorithms for unbalanced transport problems. arXiv preprint + arXiv:1607.05816. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). Regularized discrete optimal transport. SIAM Journal on Imaging Sciences, 7(3), 1853-1882. @@ -2212,7 +2262,7 @@ class UnbalancedSinkhornTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Build a coupling matrix from source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -2223,8 +2273,8 @@ class UnbalancedSinkhornTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2258,7 +2308,7 @@ class UnbalancedSinkhornTransport(BaseTransport): class JCPOTTransport(BaseTransport): - """Domain Adapatation OT method for multi-source target shift based on Wasserstein barycenter algorithm. + """Domain Adaptation OT method for multi-source target shift based on Wasserstein barycenter algorithm. 
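Unlike the single-source estimators above, this class takes a list of source datasets. A hedged usage sketch (illustrative only, not part of the patch; assumes two toy source domains with labels in {1, 2} and an unlabelled target):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    Xs1, ys1 = rng.randn(15, 2), rng.randint(1, 3, 15)          # first source domain
    Xs2, ys2 = rng.randn(25, 2) + 1.0, rng.randint(1, 3, 25)    # second source domain
    Xt = rng.randn(30, 2) + 0.5                                 # target domain

    jcpot = ot.da.JCPOTTransport(reg_e=1.0, max_iter=100)
    jcpot.fit(Xs=[Xs1, Xs2], ys=[ys1, ys2], Xt=Xt)
    print(jcpot.proportions_)    # estimated class proportions in the target domain
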
Parameters ---------- @@ -2266,7 +2316,7 @@ class JCPOTTransport(BaseTransport): Entropic regularization parameter max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization - algorithm if no it has not converged + algorithm if it has not converged tol : float, optional (default=10e-9) Stop threshold on error (inner sinkhorn solver) (>0) verbose : bool, optional (default=False) @@ -2283,7 +2333,7 @@ class JCPOTTransport(BaseTransport): out_of_sample_map : string, optional (default="ferradans") The kind of out of sample mapping to apply to transport samples from a domain into another one. Currently the only possible option is - "ferradans" which uses the method proposed in [6]. + "ferradans" which uses the method proposed in :ref:`[6] `. Attributes ---------- @@ -2292,11 +2342,12 @@ class JCPOTTransport(BaseTransport): proportions_ : array-like, shape (n_classes,) Estimated class proportions in the target domain log_ : dictionary - The dictionary of log, empty dic if parameter log is not True + The dictionary of log, empty dict if parameter log is not True + + .. _references-jcpottransport: References ---------- - .. [1] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia "Optimal transport for multi-source domain adaptation under target shift", International Conference on Artificial Intelligence and Statistics (AISTATS), @@ -2323,7 +2374,7 @@ class JCPOTTransport(BaseTransport): def fit(self, Xs, ys=None, Xt=None, yt=None): """Building coupling matrices from a list of source and target sets of samples - (Xs, ys) and (Xt, yt) + :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})` Parameters ---------- @@ -2334,8 +2385,8 @@ class JCPOTTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2368,7 +2419,7 @@ class JCPOTTransport(BaseTransport): return self def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports source samples Xs onto target ones Xt + """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}` Parameters ---------- @@ -2379,8 +2430,8 @@ class JCPOTTransport(BaseTransport): Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the - yt's elements with -1. + The class labels. If some target samples are unlabelled, fill the + :math:`\mathbf{y_t}`'s elements with -1. Warning: Note that, due to this convention -1 cannot be used as a class label @@ -2440,7 +2491,8 @@ class JCPOTTransport(BaseTransport): return transp_Xs def transform_labels(self, ys=None): - """Propagate source labels ys to obtain target labels as in [27] + """Propagate source labels :math:`\mathbf{y_s}` to obtain target labels as in + :ref:`[27] ` Parameters ---------- @@ -2451,6 +2503,14 @@ class JCPOTTransport(BaseTransport): ------- yt : array-like, shape (n_target_samples, nb_classes) Estimated soft target labels. + + + .. _references-jcpottransport-transform-labels: + References + ---------- + .. 
[27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. """ # check the necessary inputs parameters are here @@ -2482,11 +2542,12 @@ class JCPOTTransport(BaseTransport): return yt.T def inverse_transform_labels(self, yt=None): - """Propagate source labels ys to obtain target labels + """Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels + :math:`\mathbf{y_s}` Parameters ---------- - yt : array-like, shape (n_source_samples,) + yt : array-like, shape (n_target_samples,) The target class labels Returns diff --git a/ot/datasets.py b/ot/datasets.py index b86ef3b..ad6390c 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -13,7 +13,7 @@ from .utils import check_random_state, deprecated def make_1D_gauss(n, m, s): - """return a 1D histogram for a gaussian distribution (n bins, mean m and std s) + """return a 1D histogram for a gaussian distribution (`n` bins, mean `m` and std `s`) Parameters ---------- @@ -26,7 +26,7 @@ def make_1D_gauss(n, m, s): Returns ------- - h : ndarray (n,) + h : ndarray (`n`,) 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) @@ -41,7 +41,7 @@ def get_1D_gauss(n, m, sigma): def make_2D_samples_gauss(n, m, sigma, random_state=None): - """Return n samples drawn from 2D gaussian N(m,sigma) + """Return `n` samples drawn from 2D gaussian :math:`\mathcal{N}(m, \sigma)` Parameters ---------- @@ -59,8 +59,8 @@ def make_2D_samples_gauss(n, m, sigma, random_state=None): Returns ------- - X : ndarray, shape (n, 2) - n samples drawn from N(m, sigma). + X : ndarray, shape (`n`, 2) + n samples drawn from :math:`\mathcal{N}(m, \sigma)`. """ generator = check_random_state(random_state) @@ -102,7 +102,7 @@ def make_data_classif(dataset, n, nz=.5, theta=0, p=.5, random_state=None, **kwa Returns ------- X : ndarray, shape (n, d) - n observation of size d + `n` observation of size `d` y : ndarray, shape (n,) labels of the samples. """ diff --git a/ot/dr.py b/ot/dr.py index 7469270..c2f51f8 100644 --- a/ot/dr.py +++ b/ot/dr.py @@ -22,7 +22,7 @@ from pymanopt.solvers import SteepestDescent, TrustRegions def dist(x1, x2): - """ Compute squared euclidean distance between samples (autograd) + r""" Compute squared euclidean distance between samples (autograd) """ x1p2 = np.sum(np.square(x1), 1) x2p2 = np.sum(np.square(x2), 1) @@ -30,7 +30,7 @@ def dist(x1, x2): def sinkhorn(w1, w2, M, reg, k): - """Sinkhorn algorithm with fixed number of iteration (autograd) + r"""Sinkhorn algorithm with fixed number of iteration (autograd) """ K = np.exp(-M / reg) ui = np.ones((M.shape[0],)) @@ -43,14 +43,14 @@ def sinkhorn(w1, w2, M, reg, k): def split_classes(X, y): - """split samples in X by classes in y + r"""split samples in :math:`\mathbf{X}` by classes in :math:`\mathbf{y}` """ lstsclass = np.unique(y) return [X[y == i, :].astype(np.float32) for i in lstsclass] def fda(X, y, p=2, reg=1e-16): - """Fisher Discriminant Analysis + r"""Fisher Discriminant Analysis Parameters ---------- @@ -111,18 +111,19 @@ def fda(X, y, p=2, reg=1e-16): def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, normalize=False): r""" - Wasserstein Discriminant Analysis [11]_ + Wasserstein Discriminant Analysis :ref:`[11] ` The function solves the following optimization problem: .. 
math:: - P = \\text{arg}\min_P \\frac{\\sum_i W(PX^i,PX^i)}{\\sum_{i,j\\neq i} W(PX^i,PX^j)} + \mathbf{P} = \mathop{\arg \min}_\mathbf{P} \quad + \frac{\sum\limits_i W(P \mathbf{X}^i, P \mathbf{X}^i)}{\sum\limits_{i, j \neq i} W(P \mathbf{X}^i, P \mathbf{X}^j)} where : - - :math:`P` is a linear projection operator in the Stiefel(p,d) manifold + - :math:`P` is a linear projection operator in the Stiefel(`p`, `d`) manifold - :math:`W` is entropic regularized Wasserstein distances - - :math:`X^i` are samples in the dataset corresponding to class i + - :math:`\mathbf{X}^i` are samples in the dataset corresponding to class i Parameters ---------- @@ -140,7 +141,7 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, no P0 : ndarray, shape (d, p) Initial starting point for projection. normalize : bool, optional - Normalise the Wasserstaiun distane by the average distance on P0 (default : False) + Normalise the Wasserstaiun distance by the average distance on P0 (default : False) verbose : int, optional Print information along iterations. @@ -151,6 +152,8 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, no proj : callable Projection function including mean centering. + + .. _references-wda: References ---------- .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). @@ -217,27 +220,28 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, no def projection_robust_wasserstein(X, Y, a, b, tau, U0=None, reg=0.1, k=2, stopThr=1e-3, maxiter=100, verbose=0): r""" - Projection Robust Wasserstein Distance [32] + Projection Robust Wasserstein Distance :ref:`[32] ` The function solves the following optimization problem: .. math:: - \max_{U \in St(d, k)} \min_{\pi \in \Pi(\mu,\nu)} \sum_{i,j} \pi_{i,j} \|U^T(x_i - y_j)\|^2 - reg * H(\pi) + \max_{U \in St(d, k)} \ \min_{\pi \in \Pi(\mu,\nu)} \quad \sum_{i,j} \pi_{i,j} + \|U^T(\mathbf{x}_i - \mathbf{y}_j)\|^2 - \mathrm{reg} \cdot H(\pi) - - :math:`U` is a linear projection operator in the Stiefel(d, k) manifold + - :math:`U` is a linear projection operator in the Stiefel(`d`, `k`) manifold - :math:`H(\pi)` is entropy regularizer - - :math:`x_i`, :math:`y_j` are samples of measures \mu and \nu respectively + - :math:`\mathbf{x}_i`, :math:`\mathbf{y}_j` are samples of measures :math:`\mu` and :math:`\nu` respectively Parameters ---------- X : ndarray, shape (n, d) - Samples from measure \mu + Samples from measure :math:`\mu` Y : ndarray, shape (n, d) - Samples from measure \nu + Samples from measure :math:`\nu` a : ndarray, shape (n, ) - weights for measure \mu + weights for measure :math:`\mu` b : ndarray, shape (n, ) - weights for measure \nu + weights for measure :math:`\nu` tau : float stepsize for Riemannian Gradient Descent U0 : ndarray, shape (d, p) @@ -258,6 +262,8 @@ def projection_robust_wasserstein(X, Y, a, b, tau, U0=None, reg=0.1, k=2, stopTh U : ndarray, shape (d, k) Projection operator. + + .. _references-projection-robust-wasserstein: References ---------- .. [32] Huang, M. , Ma S. & Lai L. (2021). diff --git a/ot/gromov.py b/ot/gromov.py index a0fbf48..465693d 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -327,7 +327,8 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs The function solves the following optimization problem: .. 
math:: - \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} Where : @@ -410,7 +411,8 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg The function solves the following optimization problem: .. math:: - GW = \min_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} Where : @@ -487,8 +489,8 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, Computes the FGW transport between two graphs (see :ref:`[24] `) .. math:: - \gamma = \mathop{\arg \min}_\gamma (1 - \alpha) <\gamma, \mathbf{M}>_F + \alpha \sum_{i,j,k,l} - L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \gamma = \mathop{\arg \min}_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} @@ -569,7 +571,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 Computes the FGW distance between two graphs see (see :ref:`[24] `) .. math:: - \min_\gamma (1 - \alpha) <\gamma, \mathbf{M}>_F + \alpha \sum_{i,j,k,l} + \min_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p} @@ -591,9 +593,9 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 M : array-like, shape (ns, nt) Metric cost matrix between features across domains C1 : array-like, shape (ns, ns) - Metric cost matrix respresentative of the structure in the source space. + Metric cost matrix representative of the structure in the source space. C2 : array-like, shape (nt, nt) - Metric cost matrix espresentative of the structure in the target space. + Metric cost matrix representative of the structure in the target space. p : array-like, shape (ns,) Distribution in the source space. q : array-like, shape (nt,) @@ -612,8 +614,8 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 Returns ------- - gamma : array-like, shape (ns, nt) - Optimal transportation matrix for the given parameters. + fgw-distance : float + Fused gromov wasserstein distance for the given parameters. log : dict Log dictionary return only if log==True in parameters. @@ -780,7 +782,8 @@ def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun, The function solves the following optimization problem: .. math:: - \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} @@ -901,7 +904,8 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun, The function solves the following optimization problem: .. 
math:: - \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} + \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} @@ -1052,7 +1056,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, The function solves the following optimization problem: .. math:: - \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) + \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p} @@ -1157,7 +1161,8 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, The function solves the following optimization problem: .. math:: - GW = \min_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) + GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) + \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T})) Where : @@ -1223,7 +1228,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, .. math:: - \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) + \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) Where : @@ -1336,7 +1341,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, .. math:: - \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) + \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s) Where : diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 2c18a88..5da897d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -62,7 +62,7 @@ def center_ot_dual(alpha0, beta0, a=None, b=None): is the following: .. math:: - \alpha^T a= \beta^T b + \alpha^T \mathbf{a} = \beta^T \mathbf{b} in addition to the OT problem constraints. @@ -70,11 +70,11 @@ def center_ot_dual(alpha0, beta0, a=None, b=None): a constant from both :math:`\alpha_0` and :math:`\beta_0`. .. math:: - c=\frac{\beta0^T b-\alpha_0^T a}{1^Tb+1^Ta} + c &= \frac{\beta_0^T \mathbf{b} - \alpha_0^T \mathbf{a}}{\mathbf{1}^T \mathbf{b} + \mathbf{1}^T \mathbf{a}} - \alpha=\alpha_0+c + \alpha &= \alpha_0 + c - \beta=\beta0+c + \beta &= \beta_0 + c Parameters ---------- @@ -117,7 +117,7 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): The feasible values are computed efficiently but rather coarsely. .. warning:: - This function is necessary because the C++ solver in emd_c + This function is necessary because the C++ solver in `emd_c` discards all samples in the distributions with zeros weights. This means that while the primal variable (transport matrix) is exact, the solver only returns feasible dual potentials @@ -126,26 +126,26 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): First we compute the constraints violations: .. 
math:: - V=\alpha+\beta^T-M + \mathbf{V} = \alpha + \beta^T - \mathbf{M} - Next we compute the max amount of violation per row (alpha) and - columns (beta) + Next we compute the max amount of violation per row (:math:`\alpha`) and + columns (:math:`beta`) .. math:: - v^a_i=\max_j V_{i,j} + \mathbf{v^a}_i = \max_j \mathbf{V}_{i,j} - v^b_j=\max_i V_{i,j} + \mathbf{v^b}_j = \max_i \mathbf{V}_{i,j} Finally we update the dual potential with 0 weights if a constraint is violated .. math:: - \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0 + \alpha_i = \alpha_i - \mathbf{v^a}_i \quad \text{ if } \mathbf{a}_i=0 \text{ and } \mathbf{v^a}_i>0 - \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 + \beta_j = \beta_j - \mathbf{v^b}_j \quad \text{ if } \mathbf{b}_j=0 \text{ and } \mathbf{v^b}_j > 0 In the end the dual potentials are centered using function - :ref:`center_ot_dual`. + :py:func:`ot.lp.center_ot_dual`. Note that all those updates do not change the objective value of the solution but provide dual potentials that do not violate the constraints. @@ -201,26 +201,28 @@ def emd(a, b, M, numItermax=100000, log=False, center_dual=True, numThreads=1): r"""Solves the Earth Movers distance problem and returns the OT matrix - .. math:: \gamma = arg\min_\gamma <\gamma,M>_F + .. math:: + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + s.t. \ \gamma \mathbf{1} = \mathbf{a} - s.t. \gamma 1 = a + \gamma^T \mathbf{1} = \mathbf{b} - \gamma^T 1= b + \gamma \geq 0 - \gamma\geq 0 where : - - M is the metric cost matrix - - a and b are the sample weights + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights - .. warning:: Note that the M matrix in numpy needs to be a C-order + .. warning:: Note that the :math:`\mathbf{M}` matrix in numpy needs to be a C-order numpy.array in float64 format. It will be converted if not in this format .. note:: This function is backend-compatible and will work on arrays from all compatible backends. - Uses the algorithm proposed in [1]_ + Uses the algorithm proposed in :ref:`[1] `. Parameters ---------- @@ -267,17 +269,19 @@ def emd(a, b, M, numItermax=100000, log=False, center_dual=True, numThreads=1): array([[0.5, 0. ], [0. , 0.5]]) + + .. _references-emd: References ---------- - .. [1] Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W. (2011, December). Displacement interpolation using Lagrangian mass transport. In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p. 158). ACM. See Also -------- - ot.bregman.sinkhorn : Entropic regularized OT ot.optim.cg : General - regularized OT""" + ot.bregman.sinkhorn : Entropic regularized OT + ot.optim.cg : General regularized OT + """ # convert to numpy if list a, b, M = list_to_array(a, b, M) @@ -340,22 +344,23 @@ def emd2(a, b, M, processes=1, r"""Solves the Earth Movers distance problem and returns the loss .. math:: - \min_\gamma <\gamma,M>_F + \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} + + \gamma \geq 0 - \gamma\geq 0 where : - - M is the metric cost matrix - - a and b are the sample weights + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights .. note:: This function is backend-compatible and will work on arrays from all compatible backends. 
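As a quick orientation on the two entry points documented here, `ot.emd` returns the optimal plan while `ot.emd2` returns the corresponding loss; on the toy problem used in the docstring examples (uniform histograms, zero-diagonal cost) the plan is diagonal and the loss is zero. Sketch (illustrative only, not part of the patch):

    import numpy as np
    import ot

    a = np.array([0.5, 0.5])               # source histogram
    b = np.array([0.5, 0.5])               # target histogram
    M = np.array([[0.0, 1.0],
                  [1.0, 0.0]])             # ground cost matrix

    G = ot.emd(a, b, M)                    # optimal coupling: [[0.5, 0.], [0., 0.5]]
    loss = ot.emd2(a, b, M)                # transport cost <G, M>_F = 0.0
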
- Uses the algorithm proposed in [1]_ + Uses the algorithm proposed in :ref:`[1] `. Parameters ---------- @@ -405,9 +410,10 @@ def emd2(a, b, M, processes=1, >>> ot.emd2(a,b,M) 0.0 + + .. _references-emd2: References ---------- - .. [1] Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W. (2011, December). Displacement interpolation using Lagrangian mass transport. In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p. @@ -416,7 +422,8 @@ def emd2(a, b, M, processes=1, See Also -------- ot.bregman.sinkhorn : Entropic regularized OT - ot.optim.cg : General regularized OT""" + ot.optim.cg : General regularized OT + """ a, b, M = list_to_array(a, b, M) @@ -508,29 +515,35 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance), formally: .. math:: - \min_X \sum_{i=1}^N w_i W_2^2(b, X, a_i, X_i) + \min_\mathbf{X} \quad \sum_{i=1}^N w_i W_2^2(\mathbf{b}, \mathbf{X}, \mathbf{a}_i, \mathbf{X}_i) where : - :math:`w \in \mathbb{(0, 1)}^{N}`'s are the barycenter weights and sum to one - - the :math:`a_i \in \mathbb{R}^{k_i}` are the empirical measures weights and sum to one for each :math:`i` - - the :math:`X_i \in \mathbb{R}^{k_i, d}` are the empirical measures atoms locations - - :math:`b \in \mathbb{R}^{k}` is the desired weights vector of the barycenter + - the :math:`\mathbf{a}_i \in \mathbb{R}^{k_i}` are the empirical measures weights and sum to one for each :math:`i` + - the :math:`\mathbf{X}_i \in \mathbb{R}^{k_i, d}` are the empirical measures atoms locations + - :math:`\mathbf{b} \in \mathbb{R}^{k}` is the desired weights vector of the barycenter - This problem is considered in [1] (Algorithm 2). There are two differences with the following codes: + This problem is considered in :ref:`[1] ` (Algorithm 2). + There are two differences with the following codes: - we do not optimize over the weights - - we do not do line search for the locations updates, we use i.e. theta = 1 in [1] (Algorithm 2). This can be seen as a discrete implementation of the fixed-point algorithm of [2] proposed in the continuous setting. + - we do not do line search for the locations updates, we use i.e. :math:`\theta = 1` in + :ref:`[1] ` (Algorithm 2). This can be seen as a discrete + implementation of the fixed-point algorithm of + :ref:`[2] ` proposed in the continuous setting. 
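A hedged usage sketch of this free-support barycenter solver (illustrative only, not part of the patch; assumes two toy empirical measures in the plane and a barycenter supported on k = 5 free atoms):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    X1 = rng.randn(7, 2)                   # atoms of the first empirical measure
    X2 = rng.randn(9, 2) + 3.0             # atoms of the second empirical measure
    a1 = np.ones(7) / 7                    # uniform weights, sum to one
    a2 = np.ones(9) / 9

    X_init = rng.randn(5, 2)               # initial support of the barycenter (k = 5)
    X_bary = ot.lp.free_support_barycenter([X1, X2], [a1, a2], X_init)
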
Parameters ---------- measures_locations : list of N (k_i,d) numpy.ndarray - The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list) + The discrete support of a measure supported on :math:`k_i` locations of a `d`-dimensional space + (:math:`k_i` can be different for each element of the list) measures_weights : list of N (k_i,) numpy.ndarray - Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure + Numpy arrays where each numpy array has :math:`k_i` non-negatives values summing to one + representing the weights of each discrete input measure X_init : (k,d) np.ndarray - Initialization of the support locations (on k atoms) of the barycenter + Initialization of the support locations (on `k` atoms) of the barycenter b : (k,) np.ndarray Initialization of the weights of the barycenter (non-negatives, sum to 1) weights : (N,) np.ndarray @@ -554,9 +567,10 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None X : (k,d) np.ndarray Support locations (on k atoms) of the barycenter + + .. _references-free-support-barycenter: References ---------- - .. [1] Cuturi, Marco, and Arnaud Doucet. "Fast computation of Wasserstein barycenters." International Conference on Machine Learning. 2014. .. [2] Álvarez-Esteban, Pedro C., et al. "A fixed-point approach to barycenters in Wasserstein space." Journal of Mathematical Analysis and Applications 441.2 (2016): 744-762. diff --git a/ot/optim.py b/ot/optim.py index 6456c03..cc286b6 100644 --- a/ot/optim.py +++ b/ot/optim.py @@ -162,7 +162,8 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, numItermaxEmd=100000, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot f(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot f(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -309,7 +310,8 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, The function solves the following optimization problem: .. math:: - \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg_1}\cdot\Omega(\gamma) + \mathrm{reg_2}\cdot f(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg_1}\cdot\Omega(\gamma) + \mathrm{reg_2}\cdot f(\gamma) s.t. \ \gamma \mathbf{1} &= \mathbf{a} @@ -452,7 +454,7 @@ def solve_1d_linesearch_quad(a, b, c): .. math:: - \mathop{\arg \min}_{0 \leq x \leq 1} f(x) = ax^{2} + bx + c + \mathop{\arg \min}_{0 \leq x \leq 1} \quad f(x) = ax^{2} + bx + c Parameters ---------- diff --git a/ot/partial.py b/ot/partial.py index 814d779..b7093e4 100755 --- a/ot/partial.py +++ b/ot/partial.py @@ -20,13 +20,16 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, The function considers the following problem: .. math:: - \gamma = \arg\min_\gamma <\gamma,(M-\lambda)>_F + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, (\mathbf{M} - \lambda) \rangle_F - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + .. math:: + s.t. 
\ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} or equivalently (see Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. @@ -34,33 +37,32 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, metrics. Foundations of Computational Mathematics, 18(1), 1-44.) .. math:: - \gamma = \arg\min_\gamma <\gamma,M>_F + \sqrt(\lambda/2) - (\|\gamma 1 - a\|_1 + \|\gamma^T 1 - b\|_1) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \sqrt{\frac{\lambda}{2} (\|\gamma \mathbf{1} - \mathbf{a}\|_1 + \|\gamma^T \mathbf{1} - \mathbf{b}\|_1)} - s.t. - \gamma\geq 0 \\ + s.t. \ \gamma \geq 0 where : - - M is the metric cost matrix - - a and b are source and target unbalanced distributions - - :math:`\lambda` is the lagragian cost. Tuning its value allows attaining - a given mass to be transported m + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions + - :math:`\lambda` is the lagrangian cost. Tuning its value allows attaining + a given mass to be transported `m` - The formulation of the problem has been proposed in [28]_ + The formulation of the problem has been proposed in :ref:`[28] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix for the quadratic cost reg_m : float, optional - Lagragian cost + Lagrangian cost nb_dummies : int, optional, default:1 number of reservoir points to be added (to avoid numerical instabilities, increase its value if an error is raised) @@ -69,6 +71,7 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, **kwargs : dict parameters can be directly passed to the emd solver + .. warning:: When dealing with a large number of points, the EMD solver may face some instabilities, especially when the mass associated to the dummy @@ -77,7 +80,7 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, Returns ------- - gamma : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -97,9 +100,10 @@ def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, array([[0.1, 0. ], [0. , 0. ]]) + + .. _references-partial-wasserstein-lagrange: References ---------- - .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in optimal transport and Monge-Ampere obstacle problems. Annals of mathematics, 673-730. @@ -162,27 +166,30 @@ def partial_wasserstein(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): The function considers the following problem: .. math:: - \gamma = \arg\min_\gamma <\gamma,M>_F + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 - s.t. 
- \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - a and b are source and target unbalanced distributions - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions + - `m` is the amount of mass to be transported Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix for the quadratic cost m : float, optional @@ -205,7 +212,7 @@ def partial_wasserstein(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): Returns ------- - :math:`gamma` : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -278,27 +285,30 @@ def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): The function considers the following problem: .. math:: - \gamma = \arg\min_\gamma <\gamma,M>_F + \gamma = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - a and b are source and target unbalanced distributions - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions + - `m` is the amount of mass to be transported Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix for the quadratic cost m : float, optional @@ -321,8 +331,8 @@ def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): Returns ------- - :math:`gamma` : (dim_a x dim_b) ndarray - Optimal transportation matrix for the given parameters + GW: float + partial GW discrepancy log : dict log dictionary returned only if `log` is `True` @@ -360,8 +370,8 @@ def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): def gwgrad_partial(C1, C2, T): - """Compute the GW gradient. Note: we can not use the trick in [12]_ as - the marginals may not sum to 1. + """Compute the GW gradient. Note: we can not use the trick in :ref:`[12] ` + as the marginals may not sum to 1. Parameters ---------- @@ -379,6 +389,8 @@ def gwgrad_partial(C1, C2, T): numpy.array of shape (n_p+nb_dummies, n_u) gradient + + .. _references-gwgrad-partial: References ---------- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, @@ -425,22 +437,25 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, The function considers the following problem: .. 
math:: - \gamma = arg\min_\gamma <\gamma,M>_F + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} - s.t. \gamma 1 \leq a \\ - \gamma^T 1 \leq b \\ - \gamma\geq 0 \\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + \gamma &\geq 0 + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma) - =\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are the sample weights - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma) = \sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights + - `m` is the amount of mass to be transported - The formulation of the problem has been proposed in [29]_ + The formulation of the problem has been proposed in :ref:`[29] ` Parameters @@ -454,7 +469,7 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, q : ndarray, shape (nt,) Distribution in the target space m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) nb_dummies : int, optional Number of dummy points to add (avoid instabilities in the EMD solver) G0 : ndarray, shape (ns, nt), optional @@ -476,7 +491,7 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, Returns ------- - gamma : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -503,6 +518,8 @@ def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, [0. , 0. , 0.25, 0. ], [0. , 0. , 0. , 0. ]]) + + .. _references-partial-gromov-wasserstein: References ---------- .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal @@ -597,22 +614,25 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, The function considers the following problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + GW = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + .. math:: + s.t. \ \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \gamma &\geq 0 - s.t. 
\gamma 1 \leq a \\ - \gamma^T 1 \leq b \\ - \gamma\geq 0 \\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - M is the metric cost matrix - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are the sample weights - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma) = \sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights + - `m` is the amount of mass to be transported - The formulation of the problem has been proposed in [29]_ + The formulation of the problem has been proposed in :ref:`[29] ` Parameters @@ -626,7 +646,7 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, q : ndarray, shape (nt,) Distribution in the target space m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) nb_dummies : int, optional Number of dummy points to add (avoid instabilities in the EMD solver) G0 : ndarray, shape (ns, nt), optional @@ -655,7 +675,7 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, Returns ------- - partial_gw_dist : (dim_a x dim_b) ndarray + partial_gw_dist : float partial GW discrepancy log : dict log dictionary returned only if `log` is `True` @@ -676,6 +696,8 @@ def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, >>> np.round(partial_gromov_wasserstein2(C1, C2, a, b, m=0.25),2) 0.0 + + .. _references-partial-gromov-wasserstein2: References ---------- .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal @@ -706,30 +728,29 @@ def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, The function considers the following problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 \leq a \\ - \gamma^T 1 \leq b \\ - \gamma\geq 0 \\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + s.t. \gamma \mathbf{1} &\leq \mathbf{a} \\ + \gamma^T \mathbf{1} &\leq \mathbf{b} \\ + \gamma &\geq 0 \\ + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} \\ where : - - M is the metric cost matrix - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are the sample weights - - m is the amount of mass to be transported + - :math:`\mathbf{M}` is the metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights + - `m` is the amount of mass to be transported - The formulation of the problem has been proposed in [3]_ (prop. 5) + The formulation of the problem has been proposed in :ref:`[3] ` (prop. 
5) Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) - Unnormalized histograms of dimension dim_b + Unnormalized histograms of dimension `dim_b` M : np.ndarray (dim_a, dim_b) cost matrix reg : float @@ -748,7 +769,7 @@ def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, Returns ------- - gamma : (dim_a x dim_b) ndarray + gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` @@ -764,6 +785,8 @@ def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, array([[0.06, 0.02], [0.01, 0. ]]) + + .. _references-entropic-partial-wasserstein: References ---------- .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. @@ -838,32 +861,34 @@ def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, numItermax=1000, tol=1e-7, log=False, verbose=False): r""" - Returns the partial Gromov-Wasserstein transport between (C1,p) and (C2,q) + Returns the partial Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: - GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot - \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_{\gamma} \quad \sum_{i,j,k,l} + L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + \mathrm{reg} \cdot\Omega(\gamma) + + .. math:: + s.t. \ \gamma &\geq 0 + + \gamma \mathbf{1} &\leq \mathbf{a} - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + \gamma^T \mathbf{1} &\leq \mathbf{b} + + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - C1 is the metric cost matrix in the source space - - C2 is the metric cost matrix in the target space - - p and q are the sample weights - - L : quadratic loss function - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - m is the amount of mass to be transported + - :math:`\mathbf{C_1}` is the metric cost matrix in the source space + - :math:`\mathbf{C_2}` is the metric cost matrix in the target space + - :math:`\mathbf{p}` and :math:`\mathbf{q}` are the sample weights + - `L`: quadratic loss function + - :math:`\Omega` is the entropic regularization term, :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - `m` is the amount of mass to be transported - The formulation of the GW problem has been proposed in [12]_ and the - partial GW in [29]_. 
+ The formulation of the GW problem has been proposed in :ref:`[12] ` and the partial GW in :ref:`[29] ` Parameters ---------- @@ -878,7 +903,7 @@ def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, reg: float entropic regularization parameter m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) G0 : ndarray, shape (ns, nt), optional Initialisation of the transportation matrix numItermax : int, optional @@ -913,17 +938,20 @@ def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, Returns ------- - :math: `gamma` : (dim_a x dim_b) ndarray + :math: `gamma` : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary returned only if `log` is `True` + + .. _references-entropic-partial-gromov-wassertein: References ---------- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. - .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal + + .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal Transport with Applications on Positive-Unlabeled Learning". NeurIPS. @@ -977,33 +1005,33 @@ def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, numItermax=1000, tol=1e-7, log=False, verbose=False): r""" - Returns the partial Gromov-Wasserstein discrepancy between (C1,p) and - (C2,q) + Returns the partial Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` The function solves the following optimization problem: .. math:: - GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot - \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + GW = \min_{\gamma} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + \mathrm{reg} \cdot\Omega(\gamma) + + .. math:: + s.t. \ \gamma &\geq 0 + + \gamma \mathbf{1} &\leq \mathbf{a} + + \gamma^T \mathbf{1} &\leq \mathbf{b} - s.t. - \gamma\geq 0 \\ - \gamma 1 \leq a\\ - \gamma^T 1 \leq b\\ - 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + \mathbf{1}^T \gamma^T \mathbf{1} = m &\leq \min\{\|\mathbf{a}\|_1, \|\mathbf{b}\|_1\} where : - - C1 is the metric cost matrix in the source space - - C2 is the metric cost matrix in the target space - - p and q are the sample weights - - L : quadratic loss function - - :math:`\Omega` is the entropic regularization term - :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - m is the amount of mass to be transported + - :math:`\mathbf{C_1}` is the metric cost matrix in the source space + - :math:`\mathbf{C_2}` is the metric cost matrix in the target space + - :math:`\mathbf{p}` and :math:`\mathbf{q}` are the sample weights + - `L` : quadratic loss function + - :math:`\Omega` is the entropic regularization term, :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - `m` is the amount of mass to be transported - The formulation of the GW problem has been proposed in [12]_ and the - partial GW in [29]_. 
+ The formulation of the GW problem has been proposed in :ref:`[12] ` and the partial GW in :ref:`[29] ` Parameters @@ -1019,7 +1047,7 @@ def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, reg: float entropic regularization parameter m : float, optional - Amount of mass to be transported (default: min (|p|_1, |q|_1)) + Amount of mass to be transported (default: :math:`\min\{\|\mathbf{p}\|_1, \|\mathbf{q}\|_1\}`) G0 : ndarray, shape (ns, nt), optional Initialisation of the transportation matrix numItermax : int, optional @@ -1052,11 +1080,14 @@ def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, >>> np.round(entropic_partial_gromov_wasserstein2(C1, C2, a, b,50), 2) 1.87 + + .. _references-entropic-partial-gromov-wassertein2: References ---------- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, "Gromov-Wasserstein averaging of kernel and distance matrices." International Conference on Machine Learning (ICML). 2016. + .. [29] Chapel, L., Alaya, M., Gasso, G. (2020). "Partial Optimal Transport with Applications on Positive-Unlabeled Learning". NeurIPS. diff --git a/ot/plot.py b/ot/plot.py index ad436b4..3e3bed7 100644 --- a/ot/plot.py +++ b/ot/plot.py @@ -18,10 +18,10 @@ from matplotlib import gridspec def plot1D_mat(a, b, M, title=''): - """ Plot matrix M with the source and target 1D distribution + """ Plot matrix :math:`\mathbf{M}` with the source and target 1D distribution - Creates a subplot with the source distribution a on the left and - target distribution b on the tot. The matrix M is shown in between. + Creates a subplot with the source distribution :math:`\mathbf{a}` on the left and + target distribution :math:`\mathbf{b}` on the top. The matrix :math:`\mathbf{M}` is shown in between. Parameters @@ -61,10 +61,10 @@ def plot1D_mat(a, b, M, title=''): def plot2D_samples_mat(xs, xt, G, thr=1e-8, **kwargs): - """ Plot matrix M in 2D with lines using alpha values + """ Plot matrix :math:`\mathbf{G}` in 2D with lines using alpha values Plot lines between source and target 2D samples with a color - proportional to the value of the matrix G between samples. + proportional to the value of the matrix :math:`\mathbf{G}` between samples. Parameters diff --git a/ot/sliced.py b/ot/sliced.py index d3dc3f2..7c09111 100644 --- a/ot/sliced.py +++ b/ot/sliced.py @@ -17,7 +17,7 @@ from .utils import list_to_array def get_random_projections(d, n_projections, seed=None, backend=None, type_as=None): r""" - Generates n_projections samples from the uniform on the unit sphere of dimension d-1: :math:`\mathcal{U}(\mathcal{S}^{d-1})` + Generates n_projections samples from the uniform on the unit sphere of dimension :math:`d-1`: :math:`\mathcal{U}(\mathcal{S}^{d-1})` Parameters ---------- @@ -67,11 +67,12 @@ def sliced_wasserstein_distance(X_s, X_t, a=None, b=None, n_projections=50, p=2, Computes a Monte-Carlo approximation of the p-Sliced Wasserstein distance .. 
math:: - \mathcal{SWD}_p(\mu, \nu) = \underset{\theta \sim \mathcal{U}(\mathbb{S}^{d-1})}{\mathbb{E}}[\mathcal{W}_p^p(\theta_\# \mu, \theta_\# \nu)]^{\frac{1}{p}} + \mathcal{SWD}_p(\mu, \nu) = \underset{\theta \sim \mathcal{U}(\mathbb{S}^{d-1})}{\mathbb{E}}\left(\mathcal{W}_p^p(\theta_\# \mu, \theta_\# \nu)\right)^{\frac{1}{p}} + where : - - :math:`\theta_\# \mu` stands for the pushforwars of the projection :math:`\mathbb{R}^d \ni X \mapsto \langle \theta, X \rangle` + - :math:`\theta_\# \mu` stands for the pushforwards of the projection :math:`X \in \mathbb{R}^d \mapsto \langle \theta, X \rangle` Parameters diff --git a/ot/smooth.py b/ot/smooth.py index ea26bae..6855005 100644 --- a/ot/smooth.py +++ b/ot/smooth.py @@ -47,15 +47,24 @@ from scipy.optimize import minimize def projection_simplex(V, z=1, axis=None): - """ Projection of x onto the simplex, scaled by z + r""" Projection of :math:`\mathbf{V}` onto the simplex, scaled by `z` - P(x; z) = argmin_{y >= 0, sum(y) = z} ||y - x||^2 + .. math:: + P\left(\mathbf{V}, z\right) = \mathop{\arg \min}_{\substack{\mathbf{y} >= 0 \\ \sum_i \mathbf{y}_i = z}} \quad \|\mathbf{y} - \mathbf{V}\|^2 + + Parameters + ---------- + V: ndarray, rank 2 z: float or array - If array, len(z) must be compatible with V + If array, len(z) must be compatible with :math:`\mathbf{V}` axis: None or int - - axis=None: project V by P(V.ravel(); z) - - axis=1: project each V[i] by P(V[i]; z[i]) - - axis=0: project each V[:, j] by P(V[:, j]; z[j]) + - axis=None: project :math:`\mathbf{V}` by :math:`P(\mathbf{V}.\mathrm{ravel}(), z)` + - axis=1: project each :math:`\mathbf{V}_i` by :math:`P(\mathbf{V}_i, z_i)` + - axis=0: project each :math:`\mathbf{V}_{:, j}` by :math:`P(\mathbf{V}_{:, j}, z_j)` + + Returns + ------- + projection: ndarray, shape :math:`\mathbf{V}`.shape """ if axis == 1: n_features = V.shape[1] @@ -77,12 +86,12 @@ def projection_simplex(V, z=1, axis=None): class Regularization(object): - """Base class for Regularization objects + r"""Base class for Regularization objects Notes ----- - This class is not intended for direct use but as aparent for true - regularizatiojn implementation. + This class is not intended for direct use but as apparent for true + regularization implementation. """ def __init__(self, gamma=1.0): @@ -98,40 +107,48 @@ class Regularization(object): self.gamma = gamma def delta_Omega(X): - """ - Compute delta_Omega(X[:, j]) for each X[:, j]. - delta_Omega(x) = sup_{y >= 0} y^T x - Omega(y). + r""" + Compute :math:`\delta_\Omega(\mathbf{X}_{:, j})` for each :math:`\mathbf{X}_{:, j}`. + + .. math:: + \delta_\Omega(\mathbf{x}) = \sup_{\mathbf{y} >= 0} \ + \mathbf{y}^T \mathbf{x} - \Omega(\mathbf{y}) Parameters ---------- - X: array, shape = len(a) x len(b) + X: array, shape = (len(a), len(b)) Input array. Returns ------- - v: array, len(b) - Values: v[j] = delta_Omega(X[:, j]) - G: array, len(a) x len(b) - Gradients: G[:, j] = nabla delta_Omega(X[:, j]) + v: array, (len(b), ) + Values: :math:`\mathbf{v}_j = \delta_\Omega(\mathbf{X}_{:, j})` + G: array, (len(a), len(b)) + Gradients: :math:`\mathbf{G}_{:, j} = \nabla \delta_\Omega(\mathbf{X}_{:, j})` """ raise NotImplementedError def max_Omega(X, b): - """ - Compute max_Omega_j(X[:, j]) for each X[:, j]. - max_Omega_j(x) = sup_{y >= 0, sum(y) = 1} y^T x - Omega(b[j] y) / b[j]. + r""" + Compute :math:`\mathrm{max}_{\Omega, j}(\mathbf{X}_{:, j})` for each :math:`\mathbf{X}_{:, j}`. + + .. 
math:: + \mathrm{max}_{\Omega, j}(\mathbf{x}) = + \sup_{\substack{\mathbf{y} >= 0 \ \sum_i \mathbf{y}_i = 1}} + \mathbf{y}^T \mathbf{x} - \frac{1}{\mathbf{b}_j} \Omega(\mathbf{b}_j \mathbf{y}) Parameters ---------- - X: array, shape = len(a) x len(b) + X: array, shape = (len(a), len(b)) Input array. + b: array, shape = (len(b), ) Returns ------- - v: array, len(b) - Values: v[j] = max_Omega_j(X[:, j]) - G: array, len(a) x len(b) - Gradients: G[:, j] = nabla max_Omega_j(X[:, j]) + v: array, (len(b), ) + Values: :math:`\mathbf{v}_j = \mathrm{max}_{\Omega, j}(\mathbf{X}_{:, j})` + G: array, (len(a), len(b)) + Gradients: :math:`\mathbf{G}_{:, j} = \nabla \mathrm{max}_{\Omega, j}(\mathbf{X}_{:, j})` """ raise NotImplementedError @@ -192,7 +209,7 @@ class SquaredL2(Regularization): def dual_obj_grad(alpha, beta, a, b, C, regul): - """ + r""" Compute objective value and gradients of dual objective. Parameters @@ -203,19 +220,19 @@ def dual_obj_grad(alpha, beta, a, b, C, regul): a: array, shape = len(a) b: array, shape = len(b) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. Returns ------- obj: float Objective value (higher is better). grad_alpha: array, shape = len(a) - Gradient w.r.t. alpha. + Gradient w.r.t. `alpha`. grad_beta: array, shape = len(b) - Gradient w.r.t. beta. + Gradient w.r.t. `beta`. """ obj = np.dot(alpha, a) + np.dot(beta, b) grad_alpha = a.copy() @@ -242,13 +259,13 @@ def solve_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Parameters ---------- - a: array, shape = len(a) - b: array, shape = len(b) + a: array, shape = (len(a), ) + b: array, shape = (len(b), ) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. method: str Solver to be used (passed to `scipy.optimize.minimize`). tol: float @@ -258,8 +275,8 @@ def solve_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Returns ------- - alpha: array, shape = len(a) - beta: array, shape = len(b) + alpha: array, shape = (len(a), ) + beta: array, shape = (len(b), ) Dual potentials. """ @@ -302,10 +319,10 @@ def semi_dual_obj_grad(alpha, a, b, C, regul): a: array, shape = len(a) b: array, shape = len(b) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a max_Omega(X) method. + Should implement a `max_Omega(X)` method. Returns ------- @@ -337,13 +354,13 @@ def solve_semi_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Parameters ---------- - a: array, shape = len(a) - b: array, shape = len(b) + a: array, shape = (len(a), ) + b: array, shape = (len(b), ) Input histograms (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a max_Omega(X) method. + Should implement a `max_Omega(X)` method. method: str Solver to be used (passed to `scipy.optimize.minimize`). 
tol: float @@ -353,7 +370,7 @@ def solve_semi_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, Returns ------- - alpha: array, shape = len(a) + alpha: array, shape = (len(a), ) Semi-dual potentials. """ @@ -371,7 +388,7 @@ def solve_semi_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500, def get_plan_from_dual(alpha, beta, C, regul): - """ + r""" Retrieve optimal transportation plan from optimal dual potentials. Parameters @@ -379,14 +396,14 @@ def get_plan_from_dual(alpha, beta, C, regul): alpha: array, shape = len(a) beta: array, shape = len(b) Optimal dual potentials. - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. Returns ------- - T: array, shape = len(a) x len(b) + T: array, shape = (len(a), len(b)) Optimal transportation plan. """ X = alpha[:, np.newaxis] + beta - C @@ -394,7 +411,7 @@ def get_plan_from_dual(alpha, beta, C, regul): def get_plan_from_semi_dual(alpha, b, C, regul): - """ + r""" Retrieve optimal transportation plan from optimal semi-dual potentials. Parameters @@ -403,14 +420,14 @@ def get_plan_from_semi_dual(alpha, b, C, regul): Optimal semi-dual potentials. b: array, shape = len(b) Second input histogram (should be non-negative and sum to 1). - C: array, shape = len(a) x len(b) + C: array, shape = (len(a), len(b)) Ground cost matrix. regul: Regularization object - Should implement a delta_Omega(X) method. + Should implement a `delta_Omega(X)` method. Returns ------- - T: array, shape = len(a) x len(b) + T: array, shape = (len(a), len(b)) Optimal transportation plan. """ X = alpha[:, np.newaxis] - C @@ -422,19 +439,21 @@ def smooth_ot_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9, r""" Solve the regularized OT problem in the dual and return the OT matrix - The function solves the smooth relaxed dual formulation (7) in [17]_ : + The function solves the smooth relaxed dual formulation (7) in + :ref:`[17] `: .. math:: - \max_{\alpha,\beta}\quad a^T\alpha+b^T\beta-\sum_j\delta_\Omega(\alpha+\beta_j-\mathbf{m}_j) + \max_{\alpha,\beta}\quad \mathbf{a}^T\alpha + \mathbf{b}^T\beta - + \sum_j \delta_\Omega \left(\alpha+\beta_j-\mathbf{m}_j \right) where : - - :math:`\mathbf{m}_j` is the jth column of the cost matrix + - :math:`\mathbf{m}_j` is the j-th column of the cost matrix - :math:`\delta_\Omega` is the convex conjugate of the regularization term :math:`\Omega` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The OT matrix can is reconstructed from the gradient of :math:`\delta_\Omega` - (See [17]_ Proposition 1). + (See :ref:`[17] ` Proposition 1). The optimization algorithm is using gradient decent (L-BFGS by default). 
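To make the dual formulation described above concrete, here is a minimal usage sketch (an editorial illustration, not part of the patch): it calls the `ot.smooth.smooth_ot_dual` solver documented in this diff on a toy 2x2 problem; the histograms, cost matrix and regularization value are arbitrary.

    import numpy as np
    import ot

    # toy problem: two uniform histograms and a symmetric cost matrix
    a = np.array([0.5, 0.5])
    b = np.array([0.5, 0.5])
    M = np.array([[0.0, 1.0],
                  [1.0, 0.0]])

    # squared-L2 smoothed OT, solved in the dual with L-BFGS-B (the default method)
    G = ot.smooth.smooth_ot_dual(a, b, M, reg=1.0, reg_type='l2')
    # G is the (ns, nt) plan recovered from the gradient of the conjugate regularizer;
    # its marginals G.sum(axis=1) and G.sum(axis=0) approximately match a and b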
@@ -444,15 +463,19 @@ def smooth_ot_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9, samples weights in the source domain b : np.ndarray (nt,) or np.ndarray (nt,nbb) samples in the target domain, compute sinkhorn with multiple targets - and fixed M if b is a matrix (return OT loss + dual variables in log) + and fixed :math:`\mathbf{M}` if :math:`\mathbf{b}` is a matrix + (return OT loss + dual variables in log) M : np.ndarray (ns,nt) loss matrix reg : float Regularization term >0 reg_type : str - Regularization type, can be the following (default ='l2'): - - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn [2]_) - - 'l2' : Squared Euclidean regularization + Regularization type, can be the following (default ='l2'): + + - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn + :ref:`[2] `) + + - 'l2' : Squared Euclidean regularization method : str Solver to use for scipy.optimize.minimize numItermax : int, optional @@ -467,15 +490,15 @@ def smooth_ot_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9, Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-smooth-ot-dual: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 .. [17] Blondel, M., Seguy, V., & Rolet, A. (2018). Smooth and Sparse Optimal Transport. Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AISTATS). @@ -514,21 +537,23 @@ def smooth_ot_semi_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr= r""" Solve the regularized OT problem in the semi-dual and return the OT matrix - The function solves the smooth relaxed dual formulation (10) in [17]_ : + The function solves the smooth relaxed dual formulation (10) in + :ref:`[17] `: .. math:: - \max_{\alpha}\quad a^T\alpha-OT_\Omega^*(\alpha,b) + \max_{\alpha}\quad \mathbf{a}^T\alpha- \mathrm{OT}_\Omega^*(\alpha, \mathbf{b}) where : .. math:: - OT_\Omega^*(\alpha,b)=\sum_j b_j + \mathrm{OT}_\Omega^*(\alpha,b)=\sum_j \mathbf{b}_j - - :math:`\mathbf{m}_j` is the jth column of the cost matrix - - :math:`OT_\Omega^*(\alpha,b)` is defined in Eq. (9) in [17] - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{m}_j` is the j-th column of the cost matrix + - :math:`\mathrm{OT}_\Omega^*(\alpha,b)` is defined in Eq. (9) in + :ref:`[17] ` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) - The OT matrix can is reconstructed using [17]_ Proposition 2. + The OT matrix can is reconstructed using :ref:`[17] ` Proposition 2. The optimization algorithm is using gradient decent (L-BFGS by default). 
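Likewise, a minimal sketch for the semi-dual solver documented just above (editorial illustration, not part of the patch); it reuses the same toy data and the 'kl' regularization type listed in the docstring.

    import numpy as np
    import ot

    a = np.array([0.5, 0.5])
    b = np.array([0.5, 0.5])
    M = np.array([[0.0, 1.0],
                  [1.0, 0.0]])

    # negentropy-regularized OT solved in the semi-dual: only alpha is optimized,
    # and the plan is rebuilt following Proposition 2 of [17]
    G = ot.smooth.smooth_ot_semi_dual(a, b, M, reg=1.0, reg_type='kl')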
@@ -538,15 +563,19 @@ def smooth_ot_semi_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr= samples weights in the source domain b : np.ndarray (nt,) or np.ndarray (nt,nbb) samples in the target domain, compute sinkhorn with multiple targets - and fixed M if b is a matrix (return OT loss + dual variables in log) + and fixed:math:`\mathbf{M}` if :math:`\mathbf{b}` is a matrix + (return OT loss + dual variables in log) M : np.ndarray (ns,nt) loss matrix reg : float Regularization term >0 reg_type : str - Regularization type, can be the following (default ='l2'): - - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn [2]_) - - 'l2' : Squared Euclidean regularization + Regularization type, can be the following (default ='l2'): + + - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn + :ref:`[2] `) + + - 'l2' : Squared Euclidean regularization method : str Solver to use for scipy.optimize.minimize numItermax : int, optional @@ -561,15 +590,15 @@ def smooth_ot_semi_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr= Returns ------- - gamma : (ns x nt) ndarray + gamma : (ns, nt) ndarray Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters + .. _references-smooth-ot-semi-dual: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 .. [17] Blondel, M., Seguy, V., & Rolet, A. (2018). Smooth and Sparse Optimal Transport. Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AISTATS). diff --git a/ot/stochastic.py b/ot/stochastic.py index 13ed9cc..693675f 100644 --- a/ot/stochastic.py +++ b/ot/stochastic.py @@ -18,22 +18,25 @@ import numpy as np def coordinate_grad_semi_dual(b, M, reg, beta, i): r''' - Compute the coordinate gradient update for regularized discrete distributions for (i, :) + Compute the coordinate gradient update for regularized discrete distributions for :math:`(i, :)` The function computes the gradient of the semi dual problem: .. math:: - \max_v \sum_i (\sum_j v_j * b_j - reg * log(\sum_j exp((v_j - M_{i,j})/reg) * b_j)) * a_i + \max_\mathbf{v} \ \sum_i \mathbf{a}_i \left[ \sum_j \mathbf{v}_j \mathbf{b}_j - \mathrm{reg} + \cdot \log \left( \sum_j \mathbf{b}_j + \exp \left( \frac{\mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} + \right) \right) \right] Where : - - M is the (ns,nt) metric cost matrix - - v is a dual variable in R^J + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\mathbf{v}` is a dual variable in :math:`\mathbb{R}^{nt}` - reg is the regularization term - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the ASGD & SAG algorithms - as proposed in [18]_ [alg.1 & alg.2] + as proposed in :ref:`[18] ` [alg.1 & alg.2] Parameters @@ -47,7 +50,7 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): v : ndarray, shape (nt,) Dual variable. i : int - Picked number i. + Picked number `i`. Returns ------- @@ -74,12 +77,10 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + .. 
_references-coordinate-grad-semi-dual: References ---------- - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). ''' r = M[i, :] - beta exp_beta = np.exp(-r / reg) * b @@ -88,29 +89,29 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): - r''' - Compute the SAG algorithm to solve the regularized discrete measures - optimal transport max problem + r""" + Compute the SAG algorithm to solve the regularized discrete measures optimal transport max problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1 = b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the SAG algorithm - as proposed in [18]_ [alg.1] + as proposed in :ref:`[18] ` [alg.1] Parameters @@ -131,7 +132,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): Returns ------- - v : ndarray, shape (nt,) + v : ndarray, shape (`nt`,) Dual variable. Examples @@ -154,14 +155,12 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-sag-entropic-transport: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. - ''' + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). + """ if lr is None: lr = 1. / max(a / reg) @@ -187,22 +186,23 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg}\cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. 
\gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the problem is the ASGD algorithm - as proposed in [18]_ [alg.2] + as proposed in :ref:`[18] ` [alg.2] Parameters @@ -220,7 +220,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): Returns ------- - ave_v : ndarray, shape (nt,) + ave_v : ndarray, shape (`nt`,) dual variable Examples @@ -243,13 +243,11 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-averaged-sgd-entropic-transport: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). ''' if lr is None: @@ -271,20 +269,21 @@ def c_transform_entropic(b, M, reg, beta): r''' The goal is to recover u from the c-transform. - The function computes the c_transform of a dual variable from the other + The function computes the c-transform of a dual variable from the other dual variable: .. math:: - u = v^{c,reg} = -reg \sum_j exp((v - M)/reg) b_j + \mathbf{u} = \mathbf{v}^{c,reg} = - \mathrm{reg} \sum_j \mathbf{b}_j + \exp\left( \frac{\mathbf{v} - \mathbf{M}}{\mathrm{reg}} \right) Where : - - M is the (ns,nt) metric cost matrix - - u, v are dual variables in R^IxR^J + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\mathbf{u}`, :math:`\mathbf{v}` are dual variables in :math:`\mathbb{R}^{ns} \times \mathbb{R}^{nt}` - reg is the regularization term It is used to recover an optimal u from optimal v solving the semi dual - problem, see Proposition 2.1 of [18]_ + problem, see Proposition 2.1 of :ref:`[18] ` Parameters @@ -300,7 +299,7 @@ def c_transform_entropic(b, M, reg, beta): Returns ------- - u : ndarray, shape (ns,) + u : ndarray, shape (`ns`,) Dual variable. Examples @@ -323,13 +322,11 @@ def c_transform_entropic(b, M, reg, beta): [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-c-transform-entropic: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). 
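As a short editorial illustration (not part of the patch) of how the ASGD dual variable and the c-transform above fit together: the sketch below uses the `averaged_sgd_entropic_transport` and `c_transform_entropic` signatures documented in this diff, together with the standard entropic primal-dual relation to rebuild the plan; the toy data, regularization and iteration count are arbitrary.

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    n_s, n_t = 7, 4
    a, b = ot.unif(n_s), ot.unif(n_t)
    M = ot.dist(rng.randn(n_s, 2), rng.randn(n_t, 2))
    reg = 1.0

    # semi-dual variable v obtained by ASGD, then u recovered via the c-transform
    v = ot.stochastic.averaged_sgd_entropic_transport(a, b, M, reg, numItermax=30000)
    u = ot.stochastic.c_transform_entropic(b, M, reg, v)
    # entropic primal-dual relation: pi_ij = a_i * b_j * exp((u_i + v_j - M_ij) / reg)
    pi = a[:, None] * b[None, :] * np.exp((u[:, None] + v[None, :] - M) / reg)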
''' n_source = np.shape(M)[0] @@ -345,27 +342,28 @@ def c_transform_entropic(b, M, reg, beta): def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, log=False): r''' - Compute the transportation matrix to solve the regularized discrete - measures optimal transport max problem + Compute the transportation matrix to solve the regularized discrete measures optimal transport max problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) + The algorithm used for solving the problem is the SAG or ASGD algorithms - as proposed in [18]_ + as proposed in :ref:`[18] ` Parameters @@ -419,13 +417,11 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + + .. _references-solve-semi-dual-entropic: References ---------- - - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + .. [18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) Stochastic Optimization for Large-scale Optimal Transport. Advances in Neural Information Processing Systems (2016). ''' if method.lower() == "sag": @@ -459,26 +455,30 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, r''' Computes the partial gradient of the dual optimal transport problem. - For each (i,j) in a batch of coordinates, the partial gradients are : + For each :math:`(i,j)` in a batch of coordinates, the partial gradients are : .. 
math:: - \partial_{u_i} F = u_i * b_s/l_{v} - \sum_{j \in B_v} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j + \partial_{\mathbf{u}_i} F = \frac{b_s}{l_v} \mathbf{u}_i - + \sum_{j \in B_v} \mathbf{a}_i \mathbf{b}_j + \exp\left( \frac{\mathbf{u}_i + \mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} \right) - \partial_{v_j} F = v_j * b_s/l_{u} - \sum_{i \in B_u} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j + \partial_{\mathbf{v}_j} F = \frac{b_s}{l_u} \mathbf{v}_j - + \sum_{i \in B_u} \mathbf{a}_i \mathbf{b}_j + \exp\left( \frac{\mathbf{u}_i + \mathbf{v}_j - \mathbf{M}_{i,j}}{\mathrm{reg}} \right) Where : - - M is the (ns,nt) metric cost matrix - - u, v are dual variables in R^ixR^J + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\mathbf{u}`, :math:`\mathbf{v}` are dual variables in :math:`\mathbb{R}^{ns} \times \mathbb{R}^{nt}` - reg is the regularization term - :math:`B_u` and :math:`B_v` are lists of index - - :math:`b_s` is the size of the batchs :math:`B_u` and :math:`B_v` - - :math:`l_u` and :math:`l_v` are the lenghts of :math:`B_u` and :math:`B_v` - - a and b are source and target weights (sum to 1) + - :math:`b_s` is the size of the batches :math:`B_u` and :math:`B_v` + - :math:`l_u` and :math:`l_v` are the lengths of :math:`B_u` and :math:`B_v` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) The algorithm used for solving the dual problem is the SGD algorithm - as proposed in [19]_ [alg.1] + as proposed in :ref:`[19] ` [alg.1] Parameters @@ -504,7 +504,7 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, Returns ------- - grad : ndarray, shape (ns,) + grad : ndarray, shape (`ns`,) partial grad F Examples @@ -533,12 +533,11 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, [5.06266486e-02, 2.16230494e-03, 2.26215141e-03, 6.81514609e-04], [6.06713990e-02, 3.98139808e-02, 5.46829338e-02, 8.62371424e-06]]) + + .. _references-batch-grad-dual: References ---------- - - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018) ''' G = - (np.exp((alpha[batch_alpha, None] + beta[None, batch_beta] - M[batch_alpha, :][:, batch_beta]) / reg) * @@ -555,25 +554,25 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): r''' - Compute the sgd algorithm to solve the regularized discrete measures - optimal transport dual problem + Compute the sgd algorithm to solve the regularized discrete measures optimal transport dual problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. 
\ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) Parameters ---------- @@ -632,9 +631,7 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): References ---------- - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018) ''' n_source = np.shape(M)[0] @@ -657,25 +654,25 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, log=False): r''' - Compute the transportation matrix to solve the regularized discrete measures - optimal transport dual problem + Compute the transportation matrix to solve the regularized discrete measures optimal transport dual problem The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + + \mathrm{reg} \cdot\Omega(\gamma) - s.t. \gamma 1 = a + s.t. \ \gamma \mathbf{1} = \mathbf{a} - \gamma^T 1= b + \gamma^T \mathbf{1} = \mathbf{b} \gamma \geq 0 Where : - - M is the (ns,nt) metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix + - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target weights (sum to 1) Parameters ---------- @@ -736,10 +733,7 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, References ---------- - - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + .. [19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. Large-scale Optimal Transport and Mapping Estimation. International Conference on Learning Representation (2018) ''' opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg, batch_size, diff --git a/ot/unbalanced.py b/ot/unbalanced.py index 6a61aa1..15e180b 100644 --- a/ot/unbalanced.py +++ b/ot/unbalanced.py @@ -23,29 +23,31 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + W = \min_\gamma \ \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
- \gamma\geq 0 + \gamma \geq 0 + where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization - term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b - If many, compute all the OT distances (a, b_i) + One or multiple unnormalized histograms of dimension `dim_b`. + If many, compute all the OT distances :math:`(\mathbf{a}, \mathbf{b}_i)_i` M : np.ndarray (dim_a, dim_b) loss matrix reg : float @@ -68,14 +70,14 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, Returns ------- if n_hists == 1: - gamma : (dim_a x dim_b) ndarray + - gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters - log : dict + - log : dict log dictionary returned only if `log` is `True` else: - ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` - log : dict + - ot_distance : (n_hists,) ndarray + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` + - log : dict log dictionary returned only if `log` is `True` Examples @@ -90,9 +92,9 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, [0.18807035, 0.51122823]]) + .. _references-sinkhorn-unbalanced: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 @@ -111,11 +113,11 @@ def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, See Also -------- - ot.unbalanced.sinkhorn_knopp_unbalanced : Unbalanced Classic Sinkhorn [10] + ot.unbalanced.sinkhorn_knopp_unbalanced : Unbalanced Classic Sinkhorn :ref:`[10] ` ot.unbalanced.sinkhorn_stabilized_unbalanced: - Unbalanced Stabilized sinkhorn [9][10] + Unbalanced Stabilized sinkhorn :ref:`[9, 10] ` ot.unbalanced.sinkhorn_reg_scaling_unbalanced: - Unbalanced Sinkhorn with epslilon scaling [9][10] + Unbalanced Sinkhorn with epslilon scaling :ref:`[9, 10] ` """ @@ -151,29 +153,30 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
\gamma\geq 0 where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization term - :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b - If many, compute all the OT distances (a, b_i) + One or multiple unnormalized histograms of dimension `dim_b`. + If many, compute all the OT distances :math:`(\mathbf{a}, \mathbf{b}_i)_i` M : np.ndarray (dim_a, dim_b) loss matrix reg : float @@ -196,7 +199,7 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', Returns ------- ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` log : dict log dictionary returned only if `log` is `True` @@ -211,10 +214,9 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', array([0.31912866]) - + .. _references-sinkhorn-unbalanced2: References ---------- - .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 @@ -232,9 +234,9 @@ def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', See Also -------- - ot.unbalanced.sinkhorn_knopp : Unbalanced Classic Sinkhorn [10] - ot.unbalanced.sinkhorn_stabilized: Unbalanced Stabilized sinkhorn [9][10] - ot.unbalanced.sinkhorn_reg_scaling: Unbalanced Sinkhorn with epslilon scaling [9][10] + ot.unbalanced.sinkhorn_knopp : Unbalanced Classic Sinkhorn :ref:`[10] ` + ot.unbalanced.sinkhorn_stabilized: Unbalanced Stabilized sinkhorn :ref:`[9, 10] ` + ot.unbalanced.sinkhorn_reg_scaling: Unbalanced Sinkhorn with epslilon scaling :ref:`[9, 10] ` """ b = np.asarray(b, dtype=np.float64) @@ -270,26 +272,29 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000, The function solves the following optimization problem: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \reg_m KL(\gamma 1, a) + \reg_m KL(\gamma^T 1, b) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
- \gamma\geq 0 + \gamma \geq 0 + where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence - The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b + One or multiple unnormalized histograms of dimension `dim_b` If many, compute all the OT distances (a, b_i) M : np.ndarray (dim_a, dim_b) loss matrix @@ -310,15 +315,16 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000, Returns ------- if n_hists == 1: - gamma : (dim_a x dim_b) ndarray + - gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters - log : dict + - log : dict log dictionary returned only if `log` is `True` else: - ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` - log : dict + - ot_distance : (n_hists,) ndarray + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` + - log : dict log dictionary returned only if `log` is `True` + Examples -------- @@ -330,9 +336,10 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000, array([[0.51122823, 0.18807035], [0.18807035, 0.51122823]]) + + .. _references-sinkhorn-knopp-unbalanced: References ---------- - .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. @@ -445,32 +452,34 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 problem and return the loss The function solves the following optimization problem using log-domain - stabilization as proposed in [10]: + stabilization as proposed in :ref:`[10] `: .. math:: - W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + W = \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F + \mathrm{reg}\cdot\Omega(\gamma) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma \mathbf{1}, \mathbf{a}) + + \mathrm{reg_m} \cdot \mathrm{KL}(\gamma^T \mathbf{1}, \mathbf{b}) s.t. 
- \gamma\geq 0 + \gamma \geq 0 + where : - - M is the (dim_a, dim_b) metric cost matrix - - :math:`\Omega` is the entropic regularization - term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target unbalanced distributions + - :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix + - :math:`\Omega` is the entropic regularization term, :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions - KL is the Kullback-Leibler divergence The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10, 25] ` Parameters ---------- a : np.ndarray (dim_a,) - Unnormalized histogram of dimension dim_a + Unnormalized histogram of dimension `dim_a` b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) - One or multiple unnormalized histograms of dimension dim_b - If many, compute all the OT distances (a, b_i) + One or multiple unnormalized histograms of dimension `dim_b`. + If many, compute all the OT distances :math:`(\mathbf{a}, \mathbf{b}_i)_i` M : np.ndarray (dim_a, dim_b) loss matrix reg : float @@ -492,14 +501,14 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 Returns ------- if n_hists == 1: - gamma : (dim_a x dim_b) ndarray + - gamma : (dim_a, dim_b) ndarray Optimal transportation matrix for the given parameters - log : dict + - log : dict log dictionary returned only if `log` is `True` else: - ot_distance : (n_hists,) ndarray - the OT distance between `a` and each of the histograms `b_i` - log : dict + - ot_distance : (n_hists,) ndarray + the OT distance between :math:`\mathbf{a}` and each of the histograms :math:`\mathbf{b}_i` + - log : dict log dictionary returned only if `log` is `True` Examples -------- @@ -512,9 +521,10 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 array([[0.51122823, 0.18807035], [0.18807035, 0.51122823]]) + + .. _references-sinkhorn-stabilized-unbalanced: References ---------- - .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. @@ -654,29 +664,27 @@ def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000 def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3, numItermax=1000, stopThr=1e-6, verbose=False, log=False): - r"""Compute the entropic unbalanced wasserstein barycenter of A with stabilization. + r"""Compute the entropic unbalanced wasserstein barycenter of :math:`\mathbf{A}` with stabilization. The function solves the following optimization problem: .. 
math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{u_{reg}}(\mathbf{a},\mathbf{a}_i) where : - - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized - Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) - - :math:`\mathbf{a}_i` are training distributions in the columns of - matrix :math:`\mathbf{A}` - - reg and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + - :math:`W_{u_{reg}}(\cdot,\cdot)` is the unbalanced entropic regularized Wasserstein distance (see :py:func:`ot.unbalanced.sinkhorn_unbalanced`) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT - reg_mis the marginal relaxation hyperparameter - The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10] ` Parameters ---------- A : np.ndarray (dim, n_hists) - `n_hists` training distributions a_i of dimension dim + `n_hists` training distributions :math:`\mathbf{a}_i` of dimension `dim` M : np.ndarray (dim, dim) ground metric matrix for OT. reg : float @@ -706,9 +714,9 @@ def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3, log dictionary return only if log==True in parameters + .. _references-barycenter-unbalanced-stabilized: References ---------- - .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. @@ -806,29 +814,27 @@ def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3, def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, numItermax=1000, stopThr=1e-6, verbose=False, log=False): - r"""Compute the entropic unbalanced wasserstein barycenter of A. + r"""Compute the entropic unbalanced wasserstein barycenter of :math:`\mathbf{A}`. - The function solves the following optimization problem with a + The function solves the following optimization problem with :math:`\mathbf{a}` .. 
math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{u_{reg}}(\mathbf{a},\mathbf{a}_i) where : - - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized - Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) - - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` - - reg and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + - :math:`W_{u_{reg}}(\cdot,\cdot)` is the unbalanced entropic regularized Wasserstein distance (see :py:func:`ot.unbalanced.sinkhorn_unbalanced`) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT - reg_mis the marginal relaxation hyperparameter + The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10] ` Parameters ---------- A : np.ndarray (dim, n_hists) - `n_hists` training distributions a_i of dimension dim + `n_hists` training distributions :math:`\mathbf{a}_i` of dimension `dim` M : np.ndarray (dim, dim) ground metric matrix for OT. reg : float @@ -856,9 +862,9 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, log dictionary return only if log==True in parameters + .. _references-barycenter-unbalanced-sinkhorn: References ---------- - .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. @@ -936,29 +942,27 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, numItermax=1000, stopThr=1e-6, verbose=False, log=False, **kwargs): - r"""Compute the entropic unbalanced wasserstein barycenter of A. + r"""Compute the entropic unbalanced wasserstein barycenter of :math:`\mathbf{A}`. - The function solves the following optimization problem with a + The function solves the following optimization problem with :math:`\mathbf{a}` .. 
math:: - \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \quad \sum_i W_{u_{reg}}(\mathbf{a},\mathbf{a}_i) where : - - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized - Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) - - :math:`\mathbf{a}_i` are training distributions in the columns of matrix - :math:`\mathbf{A}` - - reg and :math:`\mathbf{M}` are respectively the regularization term and - the cost matrix for OT + - :math:`W_{u_{reg}}(\cdot,\cdot)` is the unbalanced entropic regularized Wasserstein distance (see :py:func:`ot.unbalanced.sinkhorn_unbalanced`) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT - reg_mis the marginal relaxation hyperparameter + The algorithm used for solving the problem is the generalized - Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + Sinkhorn-Knopp matrix scaling algorithm as proposed in :ref:`[10] ` Parameters ---------- A : np.ndarray (dim, n_hists) - `n_hists` training distributions a_i of dimension dim + `n_hists` training distributions :math:`\mathbf{a}_i` of dimension `dim` M : np.ndarray (dim, dim) ground metric matrix for OT. reg : float @@ -986,9 +990,9 @@ def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, log dictionary return only if log==True in parameters + .. _references-barycenter-unbalanced: References ---------- - .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. diff --git a/ot/utils.py b/ot/utils.py index 0608aee..c878563 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -21,26 +21,26 @@ __time_tic_toc = time.time() def tic(): - """ Python implementation of Matlab tic() function """ + r""" Python implementation of Matlab tic() function """ global __time_tic_toc __time_tic_toc = time.time() def toc(message='Elapsed time : {} s'): - """ Python implementation of Matlab toc() function """ + r""" Python implementation of Matlab toc() function """ t = time.time() print(message.format(t - __time_tic_toc)) return t - __time_tic_toc def toq(): - """ Python implementation of Julia toc() function """ + r""" Python implementation of Julia toc() function """ t = time.time() return t - __time_tic_toc def kernel(x1, x2, method='gaussian', sigma=1, **kwargs): - """Compute kernel matrix""" + r"""Compute kernel matrix""" nx = get_backend(x1, x2) @@ -50,13 +50,13 @@ def kernel(x1, x2, method='gaussian', sigma=1, **kwargs): def laplacian(x): - """Compute Laplacian matrix""" + r"""Compute Laplacian matrix""" L = np.diag(np.sum(x, axis=0)) - x return L def list_to_array(*lst): - """ Convert a list if in numpy format """ + r""" Convert a list if in numpy format """ if len(lst) > 1: return [np.array(a) if isinstance(a, list) else a for a in lst] else: @@ -64,17 +64,18 @@ def list_to_array(*lst): def proj_simplex(v, z=1): - r""" compute the closest point (orthogonal projection) on the - generalized (n-1)-simplex of a vector v wrt. to the Euclidean + r"""Compute the closest point (orthogonal projection) on the + generalized `(n-1)`-simplex of a vector :math:`\mathbf{v}` wrt. to the Euclidean distance, thus solving: + .. 
math:: - \mathcal{P}(w) \in arg\min_\gamma || \gamma - v ||_2 + \mathcal{P}(w) \in \mathop{\arg \min}_\gamma \| \gamma - \mathbf{v} \|_2 - s.t. \gamma^T 1= z + s.t. \ \gamma^T \mathbf{1} = z - \gamma\geq 0 + \gamma \geq 0 - If v is a 2d array, compute all the projections wrt. axis 0 + If :math:`\mathbf{v}` is a 2d array, compute all the projections wrt. axis 0 .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -87,7 +88,7 @@ def proj_simplex(v, z=1): Returns ------- - h : ndarray, shape (n,d) + h : ndarray, shape (`n`, `d`) Array of projections on the simplex """ nx = get_backend(v) @@ -116,26 +117,24 @@ def proj_simplex(v, z=1): def unif(n): - """ return a uniform histogram of length n (simplex) + r""" + Return a uniform histogram of length `n` (simplex). Parameters ---------- - n : int number of bins in the histogram Returns ------- - h : np.array (n,) - histogram of length n such that h_i=1/n for all i - - + h : np.array (`n`,) + histogram of length `n` such that :math:`\forall i, \mathbf{h}_i = \frac{1}{n}` """ return np.ones((n,)) / n def clean_zeros(a, b, M): - """ Remove all components with zeros weights in a and b + r""" Remove all components with zeros weights in :math:`\mathbf{a}` and :math:`\mathbf{b}` """ M2 = M[a > 0, :][:, b > 0].copy() # copy force c style matrix (froemd) a2 = a[a > 0] @@ -144,8 +143,8 @@ def clean_zeros(a, b, M): def euclidean_distances(X, Y, squared=False): - """ - Considering the rows of X (and Y=X) as vectors, compute the + r""" + Considering the rows of :math:`\mathbf{X}` (and :math:`\mathbf{Y} = \mathbf{X}`) as vectors, compute the distance matrix between each pair of vectors. .. note:: This function is backend-compatible and will work on arrays @@ -153,14 +152,14 @@ def euclidean_distances(X, Y, squared=False): Parameters ---------- - X : {array-like}, shape (n_samples_1, n_features) - Y : {array-like}, shape (n_samples_2, n_features) + X : array-like, shape (n_samples_1, n_features) + Y : array-like, shape (n_samples_2, n_features) squared : boolean, optional Return squared Euclidean distances. Returns ------- - distances : {array}, shape (n_samples_1, n_samples_2) + distances : array-like, shape (`n_samples_1`, `n_samples_2`) """ nx = get_backend(X, Y) @@ -184,7 +183,7 @@ def euclidean_distances(X, Y, squared=False): def dist(x1, x2=None, metric='sqeuclidean', p=2): - """Compute distance between samples in x1 and x2 + r"""Compute distance between samples in :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}` .. note:: This function is backend-compatible and will work on arrays from all compatible backends. @@ -193,9 +192,9 @@ def dist(x1, x2=None, metric='sqeuclidean', p=2): ---------- x1 : array-like, shape (n1,d) - matrix with n1 samples of size d + matrix with `n1` samples of size `d` x2 : array-like, shape (n2,d), optional - matrix with n2 samples of size d (if None then x2=x1) + matrix with `n2` samples of size `d` (if None then :math:`\mathbf{x_2} = \mathbf{x_1}`) metric : str | callable, optional 'sqeuclidean' or 'euclidean' on all backends. 
        On numpy the function also accepts from the scipy.spatial.distance.cdist function : 'braycurtis',
@@ -208,7 +207,7 @@
 
     Returns
     -------
-    M : array-like, shape (n1, n2)
+    M : array-like, shape (`n1`, `n2`)
         distance matrix computed with given metric
     """
@@ -226,7 +225,7 @@
 
 
 def dist0(n, method='lin_square'):
-    """Compute standard cost matrices of size (n, n) for OT problems
+    r"""Compute standard cost matrices of size (`n`, `n`) for OT problems
 
     Parameters
     ----------
@@ -235,11 +234,11 @@
     method : str, optional
         Type of loss matrix chosen from:
 
-        * 'lin_square' : linear sampling between 0 and n-1, quadratic loss
+        * 'lin_square' : linear sampling between 0 and `n-1`, quadratic loss
 
     Returns
     -------
-    M : ndarray, shape (n1,n2)
+    M : ndarray, shape (`n`, `n`)
         Distance matrix computed with given metric.
     """
     res = 0
@@ -250,7 +249,7 @@
 
 
 def cost_normalization(C, norm=None):
-    """ Apply normalization to the loss matrix
+    r""" Apply normalization to the loss matrix
 
     Parameters
     ----------
@@ -262,7 +261,7 @@
 
     Returns
     -------
-    C : ndarray, shape (n1, n2)
+    C : ndarray, shape (`n1`, `n2`)
         The input cost matrix normalized according to given norm.
     """
@@ -284,23 +283,23 @@
 
 
 def dots(*args):
-    """ dots function for multiple matrix multiply """
+    r""" dots function for multiple matrix multiply """
     return reduce(np.dot, args)
 
 
 def label_normalization(y, start=0):
-    """ Transform labels to start at a given value
+    r""" Transform labels to start at a given value
 
     Parameters
     ----------
     y : array-like, shape (n, )
         The vector of labels to be normalized.
     start : int
-        Desired value for the smallest label in y (default=0)
+        Desired value for the smallest label in :math:`\mathbf{y}` (default=0)
 
     Returns
     -------
-    y : array-like, shape (n1, )
+    y : array-like, shape (`n`, )
         The input vector of labels normalized according to given start value.
     """
@@ -311,14 +310,14 @@
 
 
 def parmap(f, X, nprocs="default"):
-    """ paralell map for multiprocessing.
+    r""" parallel map for multiprocessing.
     The function has been deprecated and only performs a regular map.
     """
     return list(map(f, X))
 
 
 def check_params(**kwargs):
-    """check_params: check whether some parameters are missing
+    r"""check_params: check whether some parameters are missing
     """
 
     missing_params = []
@@ -339,14 +338,14 @@
 
 
 def check_random_state(seed):
-    """Turn seed into a np.random.RandomState instance
+    r"""Turn `seed` into a np.random.RandomState instance
 
     Parameters
     ----------
     seed : None | int | instance of RandomState
-        If seed is None, return the RandomState singleton used by np.random.
-        If seed is an int, return a new RandomState instance seeded with seed.
-        If seed is already a RandomState instance, return it.
+        If `seed` is None, return the RandomState singleton used by np.random.
+        If `seed` is an int, return a new RandomState instance seeded with `seed`.
+        If `seed` is already a RandomState instance, return it.
         Otherwise raise ValueError.
     """
     if seed is None or seed is np.random:
@@ -360,18 +359,21 @@
 
 
 class deprecated(object):
-    """Decorator to mark a function or class as deprecated.
+    r"""Decorator to mark a function or class as deprecated.
     deprecated class from scikit-learn package
     https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/deprecation.py
 
     Issue a warning when the function is called/the class is instantiated and
     adds a warning to the docstring.
 
     The optional extra argument will be appended to the deprecation message
-    and the docstring. Note: to use this with the default value for extra, put
-    in an empty of parentheses:
-    >>> from ot.deprecation import deprecated  # doctest: +SKIP
-    >>> @deprecated()  # doctest: +SKIP
-    ... def some_function(): pass  # doctest: +SKIP
+    and the docstring.
+
+    .. note::
+        To use this with the default value for extra, use empty parentheses:
+
+        >>> from ot.deprecation import deprecated  # doctest: +SKIP
+        >>> @deprecated()  # doctest: +SKIP
+        ... def some_function(): pass  # doctest: +SKIP
 
     Parameters
     ----------
@@ -386,7 +388,7 @@
         self.extra = extra
 
     def __call__(self, obj):
-        """Call method
+        r"""Call method
 
         Parameters
         ----------
         obj : object
@@ -417,7 +419,7 @@
         return cls
 
     def _decorate_fun(self, fun):
-        """Decorate function fun"""
+        r"""Decorate function fun"""
 
         msg = "Function %s is deprecated" % fun.__name__
         if self.extra:
@@ -443,7 +445,7 @@
 
 
 def _is_deprecated(func):
-    """Helper to check if func is wraped by our deprecated decorator"""
+    r"""Helper to check if func is wrapped by our deprecated decorator"""
     if sys.version_info < (3, 5):
         raise NotImplementedError("This is only available for python3.5 "
                                   "or above")
@@ -457,7 +459,7 @@
 
 
 class BaseEstimator(object):
-    """Base class for most objects in POT
+    r"""Base class for most objects in POT
 
     Code adapted from sklearn BaseEstimator class
 
@@ -470,7 +472,7 @@
 
     @classmethod
     def _get_param_names(cls):
-        """Get parameter names for the estimator"""
+        r"""Get parameter names for the estimator"""
 
         # fetch the constructor or the original constructor before
         # deprecation wrapping if any
@@ -497,7 +499,7 @@
         return sorted([p.name for p in parameters])
 
     def get_params(self, deep=True):
-        """Get parameters for this estimator.
+        r"""Get parameters for this estimator.
 
         Parameters
         ----------
@@ -534,7 +536,7 @@
         return out
 
     def set_params(self, **params):
-        """Set the parameters of this estimator.
+        r"""Set the parameters of this estimator.
 
         The method works on simple estimators as well as on nested
         objects (such as pipelines). The latter have parameters of the form
@@ -574,7 +576,7 @@
 
 
 class UndefinedParameter(Exception):
-    """
+    r"""
     Aim at raising an Exception when a undefined parameter is called
     """
-- 
cgit v1.2.3
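
The barycenter_unbalanced docstring touched above states the optimization problem but no call pattern; the following is a minimal usage sketch, assuming POT is installed, with illustrative toy histograms and illustrative values for the reg and reg_m hyperparameters documented in that docstring::

    import numpy as np
    import ot

    n = 50                                     # number of bins
    x = np.arange(n, dtype=np.float64)

    # two normalized histograms stacked as columns of A (dim, n_hists)
    a1 = np.exp(-0.5 * ((x - 15.) / 3.) ** 2)
    a2 = np.exp(-0.5 * ((x - 35.) / 3.) ** 2)
    A = np.vstack((a1, a2)).T
    A /= A.sum(axis=0)

    # quadratic ground cost on the line, normalized
    M = ot.utils.dist0(n)
    M /= M.max()

    # unbalanced Wasserstein barycenter via generalized Sinkhorn iterations
    bary = ot.unbalanced.barycenter_unbalanced(A, M, reg=1e-2, reg_m=1.)
    print(bary.shape)                          # (50,)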
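A short sketch of how ot.utils.proj_simplex behaves, matching the projection problem stated in its docstring above; the input vectors are arbitrary illustrative data::

    import numpy as np
    import ot

    rng = np.random.RandomState(0)

    v = rng.randn(5)                  # arbitrary vector
    p = ot.utils.proj_simplex(v)      # closest point with p >= 0 and p.sum() == 1
    print(p.min() >= 0, np.isclose(p.sum(), 1))

    # with a 2d input, every column is projected independently (axis 0)
    V = rng.randn(5, 3)
    P = ot.utils.proj_simplex(V, z=1)
    print(P.sum(axis=0))              # approximately [1. 1. 1.]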
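The dist and unif helpers documented above are typically combined to set up a small OT problem; a minimal sketch with random samples (the sample sizes and the final call to ot.emd are only an illustration, not part of this patch)::

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    x1 = rng.randn(4, 2)                        # 4 source samples in R^2
    x2 = rng.randn(5, 2)                        # 5 target samples in R^2

    M = ot.dist(x1, x2, metric='sqeuclidean')   # (4, 5) cost matrix
    a, b = ot.unif(4), ot.unif(5)               # uniform histograms on the samples

    G = ot.emd(a, b, M)                         # exact OT plan between the uniform marginals
    print(G.shape, G.sum())                     # (4, 5) and total mass 1.0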
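A sketch of the check_random_state and deprecated utilities whose docstrings are touched above; old_function and the replacement named in the extra message are hypothetical and only serve to trigger the warning::

    import warnings

    from ot.utils import check_random_state, deprecated

    rng = check_random_state(0)             # int -> new RandomState seeded with 0
    assert check_random_state(rng) is rng   # an existing RandomState is returned as-is

    @deprecated("use some_new_function instead")   # hypothetical replacement name
    def old_function():
        return 42

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        old_function()                      # warns, appending the extra message
        print(caught[-1].category.__name__)  # DeprecationWarning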
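The get_params/set_params methods documented on BaseEstimator above are inherited by the POT transport estimators; a sketch on one of them, where SinkhornTransport and the reg_e values are only an illustration::

    from ot.da import SinkhornTransport

    st = SinkhornTransport(reg_e=1e-1)
    print(st.get_params())             # dict of constructor parameters, including reg_e
    st.set_params(reg_e=1e-2)          # update a parameter in place
    print(st.get_params()['reg_e'])    # 0.01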