11 files changed, 205 insertions, 104 deletions
diff --git a/ot/__init__.py b/ot/__init__.py
index 1b3c2fb..35ae6fc 100644
--- a/ot/__init__.py
+++ b/ot/__init__.py
@@ -1,6 +1,46 @@
-"""Python Optimal Transport toolbox
+"""
+
+This is the main module of the POT toolbox. It provides easy access to 
+a number of sub-modules and functions described below. 
+
+.. note::
+
+
+    Here is a list of the submodules and short description of what they contain.
+
+    - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems.
+    - :any:`ot.bregman` contains OT solvers for the entropic OT problems using
+      Bregman projections.
+    - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems.
+    - :any:`ot.smooth` contains OT solvers for the regularized (l2 and kl) smooth OT
+      problems.
+    - :any:`ot.gromov` contains solvers for Gromov-Wasserstein and Fused Gromov 
+      Wasserstein problems.
+    - :any:`ot.optim` contains generic solvers OT based optimization problems 
+    - :any:`ot.da` contains classes and function related to Monge mapping
+      estimation and Domain Adaptation (DA).
+    - :any:`ot.gpu` contains GPU (cupy) implementation of some OT solvers
+    - :any:`ot.dr` contains Dimension Reduction (DR) methods such as Wasserstein 
+      Discriminant Analysis.
+    - :any:`ot.utils` contains utility functions such as distance computation and 
+      timing.  
+    - :any:`ot.datasets` contains toy dataset generation functions.
+    - :any:`ot.plot` contains visualization functions
+    - :any:`ot.stochastic` contains stochastic solvers for regularized OT.
+    - :any:`ot.unbalanced` contains solvers for regularized unbalanced OT.
+
+.. warning::
+    The list of automatically imported sub-modules is as follows: 
+    :py:mod:`ot.lp`, :py:mod:`ot.bregman`, :py:mod:`ot.optim`
+    :py:mod:`ot.utils`, :py:mod:`ot.datasets`,
+    :py:mod:`ot.gromov`, :py:mod:`ot.smooth`
+    :py:mod:`ot.stochastic`    
 
+    The following sub-modules are not imported due to additional dependencies:
 
+    - :any:`ot.dr` : depends on :code:`pymanopt` and :code:`autograd`.
+    - :any:`ot.gpu` : depends on :code:`cupy` and a CUDA GPU.
+    - :any:`ot.plot` : depends on :code:`matplotlib`
 
 """
 
diff --git a/ot/da.py b/ot/da.py
index 479e698..83f9027 100644
--- a/ot/da.py
+++ b/ot/da.py
@@ -41,15 +41,15 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
     where :
 
     - M is the (ns,nt) metric cost matrix
-    - :math:`\Omega_e` is the entropic regularization term
-        :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-    - :math:`\Omega_g` is the group lasso  regulaization term
+    - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e
+      (\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - :math:`\Omega_g` is the group lasso  regularization term
       :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1`
       where  :math:`\mathcal{I}_c` are the index of samples from class c
       in the source domain.
     - a and b are source and target weights (sum to 1)
 
-    The algorithm used for solving the problem is the generalised conditional
+    The algorithm used for solving the problem is the generalized conditional
     gradient as proposed in  [5]_ [7]_
 
 
diff --git a/ot/dr.py b/ot/dr.py
index d30ab30..d2bf6e2 100644
--- a/ot/dr.py
+++ b/ot/dr.py
@@ -1,6 +1,12 @@
 # -*- coding: utf-8 -*-
 """
 Dimension reduction with optimal transport
+
+
+.. warning::
+    Note that by default the module is not import in :mod:`ot`. In order to
+    use it you need to explicitely import :mod:`ot.dr`
+
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py
index deda6b1..1ab95bb 100644
--- a/ot/gpu/__init__.py
+++ b/ot/gpu/__init__.py
@@ -5,11 +5,15 @@ This module provides GPU implementation for several OT solvers and utility
 functions. The GPU backend in handled by `cupy 
 <https://cupy.chainer.org/>`_.
 
+.. warning::
+    Note that by default the module is not import in :mod:`ot`. In order to 
+    use it you need to explicitely import :mod:`ot.gpu` .
+
 By default, the functions in this module accept and return numpy arrays 
 in order to proide drop-in replacement for the other POT function but
 the transfer between CPU en GPU comes with a significant overhead.
 
-In order to get the best erformances, we recommend to give only cupy 
+In order to get the best performances, we recommend to give only cupy 
 arrays to the functions and desactivate the conversion to numpy of the 
 result of the function with parameter ``to_numpy=False``.
 
diff --git a/ot/gromov.py b/ot/gromov.py
index ca96b31..3a7e24c 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -72,8 +72,8 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
 
@@ -137,8 +137,8 @@ def tensor_product(constC, hC1, hC2, T):
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
     A = -np.dot(hC1, T).dot(hC2.T)
@@ -172,8 +172,8 @@ def gwloss(constC, hC1, hC2, T):
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
 
@@ -207,8 +207,8 @@ def gwggrad(constC, hC1, hC2, T):
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
     return 2 * tensor_product(constC, hC1, hC2,
@@ -277,15 +277,15 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
     The function solves the following optimization problem:
 
     .. math::
-        \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+        GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
 
     Where :
-        C1 : Metric cost matrix in the source space
-        C2 : Metric cost matrix in the target space
-        p  : distribution in the source space
-        q  : distribution in the target space
-        L  : loss function to account for the misfit between the similarity matrices
-        H  : entropy
+    - C1 : Metric cost matrix in the source space
+    - C2 : Metric cost matrix in the target space
+    - p  : distribution in the source space
+    - q  : distribution in the target space
+    - L  : loss function to account for the misfit between the similarity matrices
+    - H  : entropy
 
     Parameters
     ----------
@@ -312,7 +312,7 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
         If True the steps of the line-search is found via an armijo research. Else closed form is used.
         If there is convergence issues use False.
     **kwargs : dict
-        parameters can be directly pased to the ot.optim.cg solver
+        parameters can be directly passed to the ot.optim.cg solver
 
     Returns
     -------
@@ -355,25 +355,31 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
 def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
     """
     Computes the FGW transport between two graphs see [24]
+
     .. math::
-        \gamma = arg\min_\gamma (1-\alpha)*<\gamma,M>_F + alpha* \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+        \gamma = arg\min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
+        L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+
         s.t. \gamma 1 = p
              \gamma^T 1= q
              \gamma\geq 0
+
     where :
     - M is the (ns,nt) metric cost matrix
     - :math:`f` is the regularization term ( and df is its gradient)
     - a and b are source and target weights (sum to 1)
     - L is a loss function to account for the misfit between the similarity matrices
-    The algorithm used for solving the problem is conditional gradient as discussed in  [1]_
+
+    The algorithm used for solving the problem is conditional gradient as discussed in  [24]_
+
     Parameters
     ----------
     M  : ndarray, shape (ns, nt)
          Metric cost matrix between features across domains
     C1 : ndarray, shape (ns, ns)
-         Metric cost matrix respresentative of the structure in the source space
+         Metric cost matrix representative of the structure in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric cost matrix espresentative of the structure in the target space
+         Metric cost matrix representative of the structure in the target space
     p :  ndarray, shape (ns,)
          distribution in the source space
     q :  ndarray, shape (nt,)
@@ -392,19 +398,23 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
         If True the steps of the line-search is found via an armijo research. Else closed form is used.
         If there is convergence issues use False.
     **kwargs : dict
-        parameters can be directly pased to the ot.optim.cg solver
+        parameters can be directly passed to the ot.optim.cg solver
+
     Returns
     -------
     gamma : (ns x nt) ndarray
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
+
+
     References
     ----------
     .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
-          and Courty Nicolas
-        "Optimal Transport for structured data with application on graphs"
-        International Conference on Machine Learning (ICML). 2019.
+        and Courty Nicolas "Optimal Transport for structured data with
+        application on graphs", International Conference on Machine Learning
+        (ICML). 2019.
+
     """
 
     constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -428,17 +438,23 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
 def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
     """
     Computes the FGW distance between two graphs see [24]
+
     .. math::
-        \gamma = arg\min_\gamma (1-\alpha)*<\gamma,M>_F + alpha* \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+        \min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
+        L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+
+
         s.t. \gamma 1 = p
              \gamma^T 1= q
              \gamma\geq 0
+
     where :
     - M is the (ns,nt) metric cost matrix
     - :math:`f` is the regularization term ( and df is its gradient)
     - a and b are source and target weights (sum to 1)
     - L is a loss function to account for the misfit between the similarity matrices
     The algorithm used for solving the problem is conditional gradient as discussed in  [1]_
+
     Parameters
     ----------
     M  : ndarray, shape (ns, nt)
@@ -466,16 +482,18 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5
         If there is convergence issues use False.
     **kwargs : dict
         parameters can be directly pased to the ot.optim.cg solver
+
     Returns
     -------
     gamma : (ns x nt) ndarray
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary return only if log==True in parameters
+
     References
     ----------
     .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
-          and Courty Nicolas
+        and Courty Nicolas
         "Optimal Transport for structured data with application on graphs"
         International Conference on Machine Learning (ICML). 2019.
     """
@@ -506,22 +524,22 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
     The function solves the following optimization problem:
 
     .. math::
-        \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+        GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
 
     Where :
-        C1 : Metric cost matrix in the source space
-        C2 : Metric cost matrix in the target space
-        p  : distribution in the source space
-        q  : distribution in the target space
-        L  : loss function to account for the misfit between the similarity matrices
-        H  : entropy
+    - C1 : Metric cost matrix in the source space
+    - C2 : Metric cost matrix in the target space
+    - p  : distribution in the source space
+    - q  : distribution in the target space
+    - L  : loss function to account for the misfit between the similarity matrices
+    - H  : entropy
 
     Parameters
     ----------
     C1 : ndarray, shape (ns, ns)
          Metric cost matrix in the source space
     C2 : ndarray, shape (nt, nt)
-         Metric costfr matrix in the target space
+         Metric cost matrix in the target space
     p :  ndarray, shape (ns,)
          distribution in the source space
     q :  ndarray, shape (nt,)
@@ -587,21 +605,21 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
     The function solves the following optimization problem:
 
     .. math::
-        \GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+        GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
 
-        s.t. \GW 1 = p
+        s.t. T 1 = p
 
-             \GW^T 1= q
+             T^T 1= q
 
-             \GW\geq 0
+             T\geq 0
 
     Where :
-        C1 : Metric cost matrix in the source space
-        C2 : Metric cost matrix in the target space
-        p  : distribution in the source space
-        q  : distribution in the target space
-        L  : loss function to account for the misfit between the similarity matrices
-        H  : entropy
+    - C1 : Metric cost matrix in the source space
+    - C2 : Metric cost matrix in the target space
+    - p  : distribution in the source space
+    - q  : distribution in the target space
+    - L  : loss function to account for the misfit between the similarity matrices
+    - H  : entropy
 
     Parameters
     ----------
@@ -629,14 +647,13 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
     Returns
     -------
     T : ndarray, shape (ns, nt)
-        coupling between the two spaces that minimizes :
-            \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+        Optimal coupling between the two spaces
 
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
 
@@ -695,15 +712,15 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
     The function solves the following optimization problem:
 
     .. math::
-        \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+        GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
 
     Where :
-        C1 : Metric cost matrix in the source space
-        C2 : Metric cost matrix in the target space
-        p  : distribution in the source space
-        q  : distribution in the target space
-        L  : loss function to account for the misfit between the similarity matrices
-        H  : entropy
+    - C1 : Metric cost matrix in the source space
+    - C2 : Metric cost matrix in the target space
+    - p  : distribution in the source space
+    - q  : distribution in the target space
+    - L  : loss function to account for the misfit between the similarity matrices
+    - H  : entropy
 
     Parameters
     ----------
@@ -736,8 +753,8 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
 
@@ -762,13 +779,13 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
     The function solves the following optimization problem:
 
     .. math::
-        C = argmin_C\in R^{NxN} \sum_s \lambda_s GW(C,Cs,p,ps)
+        C = argmin_{C\in R^{NxN}} \sum_s \lambda_s GW(C,C_s,p,p_s)
 
 
     Where :
 
-        Cs : metric cost matrix
-        ps  : distribution
+    - :math:`C_s` : metric cost matrix
+    - :math:`p_s`  : distribution
 
     Parameters
     ----------
@@ -806,8 +823,8 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
 
@@ -876,8 +893,8 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
 
     Where :
 
-        Cs : metric cost matrix
-        ps  : distribution
+    - Cs : metric cost matrix
+    - ps  : distribution
 
     Parameters
     ----------
@@ -913,8 +930,8 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
     References
     ----------
     .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
-    "Gromov-Wasserstein averaging of kernel and distance matrices."
-    International Conference on Machine Learning (ICML). 2016.
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
 
     """
 
@@ -972,6 +989,8 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
                     verbose=False, log=False, init_C=None, init_X=None):
     """
     Compute the fgw barycenter as presented eq (5) in [24].
+
+    Parameters
     ----------
     N : integer
         Desired number of samples of the target barycenter
@@ -993,8 +1012,9 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
               initialization for the barycenters' structure matrix. If not set random init
     init_X :  ndarray, shape (N,d), optional
               initialization for the barycenters' features. If not set random init
+
     Returns
-    ----------
+    -------
     X : ndarray, shape (N,d)
         Barycenters' features
     C : ndarray, shape (N,N)
@@ -1002,11 +1022,13 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
     log_: dictionary
         Only returned when log=True
         T : list of (N,ns) transport matrices
-        Ms : all distance matrices between the feature of the barycenter and the other features dist(X,Ys) shape (N,ns)
+        Ms : all distance matrices between the feature of the barycenter and the
+        other features dist(X,Ys) shape (N,ns)
+
     References
     ----------
     .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
-          and Courty Nicolas
+        and Courty Nicolas
         "Optimal Transport for structured data with application on graphs"
         International Conference on Machine Learning (ICML). 2019.
     """
@@ -1107,6 +1129,7 @@ def update_sructure_matrix(p, lambdas, T, Cs):
     """
     Updates C according to the L2 Loss kernel with the S Ts couplings
     calculated at each iteration
+
     Parameters
     ----------
     p  : ndarray, shape (N,)
@@ -1117,6 +1140,7 @@ def update_sructure_matrix(p, lambdas, T, Cs):
         the S Ts couplings calculated at each iteration
     Cs : list of S ndarray, shape(ns,ns)
          Metric cost matrices
+
     Returns
     ----------
     C : ndarray, shape (nt,nt)
@@ -1132,6 +1156,7 @@ def update_feature_matrix(lambdas, Ys, Ts, p):
     """
     Updates the feature with respect to the S Ts couplings. See "Solving the barycenter problem with Block Coordinate Descent (BCD)" in [24]
     calculated at each iteration
+
     Parameters
     ----------
     p  : ndarray, shape (N,)
@@ -1142,6 +1167,7 @@ def update_feature_matrix(lambdas, Ys, Ts, p):
         the S Ts couplings calculated at each iteration
     Ys : list of S ndarray, shape(d,ns)
          The features
+
     Returns
     ----------
     X : ndarray, shape (d,N)
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 8ec286b..17f1731 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 """
 Solvers for the original linear program OT problem
+
+
+
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
@@ -39,26 +42,30 @@ def emd(a, b, M, numItermax=100000, log=False):
     - M is the metric cost matrix
     - a and b are the sample weights
 
+    .. warning::
+        Note that the M matrix needs to be a C-order numpy.array in float64 
+        format.
+
     Uses the algorithm proposed in [1]_
 
     Parameters
     ----------
-    a : (ns,) ndarray, float64
-        Source histogram (uniform weigth if empty list)
-    b : (nt,) ndarray, float64
-        Target histogram (uniform weigth if empty list)
-    M : (ns,nt) ndarray, float64
-        loss matrix
+    a : (ns,) numpy.ndarray, float64
+        Source histogram (uniform weight if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target histogram (uniform weight if empty list)
+    M : (ns,nt) numpy.ndarray, float64
+        Loss matrix (c-order array with type float64)
     numItermax : int, optional (default=100000)
         The maximum number of iterations before stopping the optimization
         algorithm if it has not converged.
-    log: boolean, optional (default=False)
+    log: bool, optional (default=False)
         If True, returns a dictionary containing the cost and dual
         variables. Otherwise returns only the optimal transportation matrix.
 
     Returns
     -------
-    gamma: (ns x nt) ndarray
+    gamma: (ns x nt) numpy.ndarray
         Optimal transportation matrix for the given parameters
     log: dict
         If input log is true, a dictionary containing the cost and dual
@@ -130,16 +137,20 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     - M is the metric cost matrix
     - a and b are the sample weights
 
+    .. warning::
+        Note that the M matrix needs to be a C-order numpy.array in float64 
+        format.
+
     Uses the algorithm proposed in [1]_
 
     Parameters
     ----------
-    a : (ns,) ndarray, float64
-        Source histogram (uniform weigth if empty list)
-    b : (nt,) ndarray, float64
-        Target histogram (uniform weigth if empty list)
-    M : (ns,nt) ndarray, float64
-        loss matrix
+    a : (ns,) numpy.ndarray, float64
+        Source histogram (uniform weight if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target histogram (uniform weight if empty list)
+    M : (ns,nt) numpy.ndarray, float64
+        Loss matrix (c-order array with type float64)
     numItermax : int, optional (default=100000)
         The maximum number of iterations before stopping the optimization
         algorithm if it has not converged.
@@ -153,7 +164,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     -------
     gamma: (ns x nt) ndarray
         Optimal transportation matrix for the given parameters
-    log: dict
+    log: dictnp
         If input log is true, a dictionary containing the cost and dual
         variables and exit status
 
@@ -233,9 +244,9 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
 
     Parameters
     ----------
-    measures_locations : list of (k_i,d) np.ndarray
+    measures_locations : list of (k_i,d) numpy.ndarray
         The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list)
-    measures_weights : list of (k_i,) np.ndarray
+    measures_weights : list of (k_i,) numpy.ndarray
         Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure
 
     X_init : (k,d) np.ndarray
@@ -248,7 +259,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
     numItermax : int, optional
         Max number of iterations
     stopThr : float, optional
-        Stop threshol on error (>0)
+        Stop threshold on error (>0)
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
@@ -533,14 +544,13 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.):
     r"""Solves the p-Wasserstein distance problem between 1d measures and returns
     the distance
 
-
     .. math::
-        \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij}
-            |x_a[i] - x_b[j]|^p \\right)^{1/p}
+        \min_\gamma \left( \sum_i \sum_j \gamma_{ij} \|x_a[i] - x_b[j]\|^p \right)^{1/p}
 
         s.t. \gamma 1 = a,
              \gamma^T 1= b,
              \gamma\geq 0
+
     where :
 
     - x_a and x_b are the samples
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 8a4aec9..2b6c495 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -58,13 +58,16 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
     - M is the metric cost matrix
     - a and b are the sample weights
 
+    .. warning::
+        Note that the M matrix needs to be a C-order :py.cls:`numpy.array`
+
     Parameters
     ----------
-    a : (ns,) ndarray, float64
+    a : (ns,) numpy.ndarray, float64
         source histogram
-    b : (nt,) ndarray, float64
+    b : (nt,) numpy.ndarray, float64
         target histogram
-    M : (ns,nt) ndarray, float64
+    M : (ns,nt) numpy.ndarray, float64
         loss matrix
     max_iter : int
         The maximum number of iterations before stopping the optimization
@@ -73,7 +76,7 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
 
     Returns
     -------
-    gamma: (ns x nt) ndarray
+    gamma: (ns x nt) numpy.ndarray
         Optimal transportation matrix for the given parameters
 
     """
diff --git a/ot/plot.py b/ot/plot.py
index 784a372..a409d4a 100644
--- a/ot/plot.py
+++ b/ot/plot.py
@@ -1,5 +1,11 @@
 """
 Functions for plotting OT matrices
+
+.. warning::
+    Note that by default the module is not import in :mod:`ot`. In order to
+    use it you need to explicitely import :mod:`ot.plot`
+
+
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
diff --git a/ot/stochastic.py b/ot/stochastic.py
index bf3e7a7..5754968 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -1,3 +1,9 @@
+"""
+Stochastic solvers for regularized OT.
+
+
+"""
+
 # Author: Kilian Fatras <kilian.fatras@gmail.com>
 #
 # License: MIT License
diff --git a/ot/unbalanced.py b/ot/unbalanced.py
index 44ab411..50ec03c 100644
--- a/ot/unbalanced.py
+++ b/ot/unbalanced.py
@@ -20,7 +20,7 @@ def sinkhorn_unbalanced(a, b, M, reg, alpha, method='sinkhorn', numItermax=1000,
     The function solves the following optimization problem:
 
     .. math::
-        W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \alpha KL(\gamma 1, a) + \alpha KL(\gamma^T 1, b)
+        W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \\alpha KL(\gamma 1, a) + \\alpha KL(\gamma^T 1, b)
 
         s.t.
              \gamma\geq 0
@@ -129,7 +129,7 @@ def sinkhorn_unbalanced2(a, b, M, reg, alpha, method='sinkhorn',
     The function solves the following optimization problem:
 
     .. math::
-        W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \alpha KL(\gamma 1, a) + \alpha KL(\gamma^T 1, b)
+        W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \\alpha KL(\gamma 1, a) + \\alpha KL(\gamma^T 1, b)
 
         s.t.
              \gamma\geq 0
@@ -240,7 +240,7 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, alpha, numItermax=1000,
     The function solves the following optimization problem:
 
     .. math::
-        W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \alpha KL(\gamma 1, a) + \alpha KL(\gamma^T 1, b)
+        W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \\alpha KL(\gamma 1, a) + \\alpha KL(\gamma^T 1, b)
 
         s.t.
              \gamma\geq 0
diff --git a/ot/utils.py b/ot/utils.py
index f21ceb9..e8249ef 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Various function that can be usefull
+Various useful functions
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>