Diffstat (limited to 'ot')
-rw-r--r-- | ot/__init__.py | 52
-rw-r--r-- | ot/bregman.py | 227
-rw-r--r-- | ot/da.py | 8
-rw-r--r-- | ot/datasets.py | 32
-rw-r--r-- | ot/dr.py | 63
-rw-r--r-- | ot/gpu/__init__.py | 6
-rw-r--r-- | ot/gpu/bregman.py | 11
-rw-r--r-- | ot/gromov.py | 514
-rw-r--r-- | ot/lp/__init__.py | 351
-rw-r--r-- | ot/lp/emd_wrap.pyx | 100
-rw-r--r-- | ot/optim.py | 32
-rw-r--r-- | ot/plot.py | 12
-rw-r--r-- | ot/stochastic.py | 407
-rw-r--r-- | ot/unbalanced.py | 519
-rw-r--r-- | ot/utils.py | 55
15 files changed, 1672 insertions, 717 deletions
diff --git a/ot/__init__.py b/ot/__init__.py index b74b924..35ae6fc 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -1,6 +1,46 @@ -"""Python Optimal Transport toolbox +""" + +This is the main module of the POT toolbox. It provides easy access to +a number of sub-modules and functions described below. + +.. note:: + + + Here is a list of the submodules and a short description of what they contain. + + - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. + - :any:`ot.bregman` contains OT solvers for the entropic OT problems using + Bregman projections. + - :any:`ot.smooth` contains OT solvers for the regularized (l2 and kl) smooth OT + problems. + - :any:`ot.gromov` contains solvers for Gromov-Wasserstein and Fused Gromov + Wasserstein problems. + - :any:`ot.optim` contains generic solvers for OT based optimization problems + - :any:`ot.da` contains classes and functions related to Monge mapping + estimation and Domain Adaptation (DA). + - :any:`ot.gpu` contains GPU (cupy) implementation of some OT solvers + - :any:`ot.dr` contains Dimension Reduction (DR) methods such as Wasserstein + Discriminant Analysis. + - :any:`ot.utils` contains utility functions such as distance computation and + timing. + - :any:`ot.datasets` contains toy dataset generation functions. + - :any:`ot.plot` contains visualization functions + - :any:`ot.stochastic` contains stochastic solvers for regularized OT. + - :any:`ot.unbalanced` contains solvers for regularized unbalanced OT. + +.. warning:: + The list of automatically imported sub-modules is as follows: + :py:mod:`ot.lp`, :py:mod:`ot.bregman`, :py:mod:`ot.optim`, + :py:mod:`ot.utils`, :py:mod:`ot.datasets`, + :py:mod:`ot.gromov`, :py:mod:`ot.smooth`, + :py:mod:`ot.stochastic`. + The following sub-modules are not imported due to additional dependencies: + - :any:`ot.dr` : depends on :code:`pymanopt` and :code:`autograd`. + - :any:`ot.gpu` : depends on :code:`cupy` and a CUDA GPU. + - :any:`ot.plot` : depends on :code:`matplotlib` """ @@ -20,10 +60,12 @@ from . import da from . import gromov from . import smooth from . import stochastic +from .
import unbalanced # OT functions -from .lp import emd, emd2 +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d from .bregman import sinkhorn, sinkhorn2, barycenter +from .unbalanced import sinkhorn_unbalanced, barycenter_unbalanced from .da import sinkhorn_lpl1_mm # utils functions @@ -31,6 +73,8 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" -__all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets', +__all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', - 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] + 'emd_1d', 'emd2_1d', 'wasserstein_1d', + 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim', + 'sinkhorn_unbalanced', "barycenter_unbalanced"] diff --git a/ot/bregman.py b/ot/bregman.py index 321712b..70e4208 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -16,7 +16,7 @@ from .utils import unif, dist def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The function solves the following optimization problem: @@ -40,12 +40,12 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (nt,) or ndarray, shape (nt, nbb) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg : float Regularization term >0 @@ -64,7 +64,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -73,12 +73,12 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -131,7 +131,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the entropic regularization optimal transport problem and return the loss The function solves the following optimization problem: @@ -155,12 +155,12 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (nt,) or ndarray, shape (nt, nbb) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg : float Regularization term >0 @@ -176,7 +176,6 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, log : bool, optional record log if True - Returns ------- W : (nt) ndarray or float @@ -188,11 +187,11 @@ def 
sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn2(a,b,M,1) - array([ 0.26894142]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn2(a, b, M, 1) + array([0.26894142]) @@ -241,14 +240,14 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, b = np.asarray(b, dtype=np.float64) if len(b.shape) < 2: - b = b.reshape((-1, 1)) + b = b[:, None] return sink() def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - """ + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The function solves the following optimization problem: @@ -272,12 +271,12 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (nt,) or ndarray, shape (nt, nbb) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg : float Regularization term >0 @@ -290,10 +289,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -302,12 +300,12 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -422,7 +420,7 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log=False): - """ + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The algorithm used is based on the paper @@ -453,12 +451,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (nt,) or ndarray, shape (nt, nbb) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg : float Regularization term >0 @@ -469,10 +467,9 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -481,12 +478,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.bregman.greenkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + 
>>> ot.bregman.greenkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -576,7 +573,7 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=20, log=False, **kwargs): - """ + r""" Solve the entropic regularization OT problem with log stabilization The function solves the following optimization problem: @@ -602,11 +599,11 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (nt,) samples in the target domain - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg : float Regularization term >0 @@ -623,10 +620,9 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -639,8 +635,8 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, >>> b=[.5,.5] >>> M=[[0.,1.],[1.,0.]] >>> ot.bregman.sinkhorn_stabilized(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -769,10 +765,14 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, cpt = cpt + 1 - # print('err=',err,' cpt=',cpt) if log: - log['logu'] = alpha / reg + np.log(u) - log['logv'] = beta / reg + np.log(v) + if nbb: + alpha = alpha[:, None] + beta = beta[:, None] + logu = alpha / reg + np.log(u) + logv = beta / reg + np.log(v) + log['logu'] = logu + log['logv'] = logv log['alpha'] = alpha + reg * np.log(u) log['beta'] = beta + reg * np.log(v) log['warmstart'] = (log['alpha'], log['beta']) @@ -796,7 +796,7 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInnerItermax=100, tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=10, log=False, **kwargs): - """ + r""" Solve the entropic regularization optimal transport problem with log stabilization and epsilon scaling. 
@@ -823,11 +823,11 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (nt,) samples in the target domain - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg : float Regularization term >0 @@ -835,7 +835,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne thershold for max value in u or v for log scaling tau : float thershold for max value in u or v for log scaling - warmstart : tible of vectors + warmstart : tuple of vectors if given then sarting values for alpha an beta log scalings numItermax : int, optional Max number of iterations @@ -850,10 +850,9 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -862,12 +861,12 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.bregman.sinkhorn_epsilon_scaling(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.bregman.sinkhorn_epsilon_scaling(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -989,7 +988,7 @@ def projC(gamma, q): def barycenter(A, M, reg, weights=None, numItermax=1000, stopThr=1e-4, verbose=False, log=False): - """Compute the entropic regularized wasserstein barycenter of distributions A + r"""Compute the entropic regularized wasserstein barycenter of distributions A The function solves the following optimization problem: @@ -1006,13 +1005,13 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, Parameters ---------- - A : np.ndarray (d,n) + A : ndarray, shape (d,n) n training distributions a_i of size d - M : np.ndarray (d,d) + M : ndarray, shape (d,d) loss matrix for OT reg : float Regularization term >0 - weights : np.ndarray (n,) + weights : ndarray, shape (n,) Weights of each histogram a_i on the simplex (barycentric coodinates) numItermax : int, optional Max number of iterations @@ -1084,7 +1083,7 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1e-9, stabThr=1e-30, verbose=False, log=False): - """Compute the entropic regularized wasserstein barycenter of distributions A + r"""Compute the entropic regularized wasserstein barycenter of distributions A where A is a collection of 2D images. 
The function solves the following optimization problem: @@ -1102,11 +1101,11 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1 Parameters ---------- - A : np.ndarray (n,w,h) + A : ndarray, shape (n, w, h) n distributions (2D images) of size w x h reg : float Regularization term >0 - weights : np.ndarray (n,) + weights : ndarray, shape (n,) Weights of each image on the simplex (barycentric coodinates) numItermax : int, optional Max number of iterations @@ -1119,15 +1118,13 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1 log : bool, optional record log if True - Returns ------- - a : (w,h) ndarray + a : ndarray, shape (w, h) 2D Wasserstein barycenter log : dict log dictionary return only if log==True in parameters - References ---------- @@ -1195,7 +1192,7 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1 def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, stopThr=1e-3, verbose=False, log=False): - """ + r""" Compute the unmixing of an observation with a given dictionary using Wasserstein distance The function solve the following optimization problem: @@ -1217,15 +1214,15 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, Parameters ---------- - a : np.ndarray (d) + a : ndarray, shape (d) observed distribution - D : np.ndarray (d,n) + D : ndarray, shape (d, n) dictionary matrix - M : np.ndarray (d,d) + M : ndarray, shape (d, d) loss matrix - M0 : np.ndarray (n,n) + M0 : ndarray, shape (n, n) loss matrix - h0 : np.ndarray (n,) + h0 : ndarray, shape (n,) prior on h reg : float Regularization term >0 (Wasserstein data fitting) @@ -1245,7 +1242,7 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, Returns ------- - a : (d,) ndarray + a : ndarray, shape (d,) Wasserstein barycenter log : dict log dictionary return only if log==True in parameters @@ -1302,7 +1299,7 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): - ''' + r''' Solve the entropic regularization optimal transport problem and return the OT matrix from empirical data @@ -1325,15 +1322,15 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numI Parameters ---------- - X_s : np.ndarray (ns, d) + X_s : ndarray, shape (ns, d) samples in the source domain - X_t : np.ndarray (nt, d) + X_t : ndarray, shape (nt, d) samples in the target domain reg : float Regularization term >0 - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (nt,) samples weights in the target domain numItermax : int, optional Max number of iterations @@ -1347,7 +1344,7 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numI Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Regularized optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -1360,10 +1357,9 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numI >>> reg = 0.1 >>> X_s = np.reshape(np.arange(n_s), (n_s, 1)) >>> X_t = np.reshape(np.arange(0, n_t), (n_t, 1)) - >>> emp_sinkhorn = empirical_sinkhorn(X_s, X_t, reg, verbose=False) - >>> print(emp_sinkhorn) - >>> [[4.99977301e-01 2.26989344e-05] - [2.26989344e-05 4.99977301e-01]] + >>> empirical_sinkhorn(X_s, X_t, 
reg, verbose=False) # doctest: +NORMALIZE_WHITESPACE + array([[4.99977301e-01, 2.26989344e-05], + [2.26989344e-05, 4.99977301e-01]]) References @@ -1392,7 +1388,7 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numI def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): - ''' + r''' Solve the entropic regularization optimal transport problem from empirical data and return the OT loss @@ -1416,15 +1412,15 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num Parameters ---------- - X_s : np.ndarray (ns, d) + X_s : ndarray, shape (ns, d) samples in the source domain - X_t : np.ndarray (nt, d) + X_t : ndarray, shape (nt, d) samples in the target domain reg : float Regularization term >0 - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (nt,) samples weights in the target domain numItermax : int, optional Max number of iterations @@ -1438,7 +1434,7 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Regularized optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -1451,9 +1447,8 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num >>> reg = 0.1 >>> X_s = np.reshape(np.arange(n_s), (n_s, 1)) >>> X_t = np.reshape(np.arange(0, n_t), (n_t, 1)) - >>> loss_sinkhorn = empirical_sinkhorn2(X_s, X_t, reg, verbose=False) - >>> print(loss_sinkhorn) - >>> [4.53978687e-05] + >>> empirical_sinkhorn2(X_s, X_t, reg, verbose=False) + array([4.53978687e-05]) References @@ -1482,7 +1477,7 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): - ''' + r''' Compute the sinkhorn divergence loss from empirical data The function solves the following optimization problems and return the @@ -1525,15 +1520,15 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli Parameters ---------- - X_s : np.ndarray (ns, d) + X_s : ndarray, shape (ns, d) samples in the source domain - X_t : np.ndarray (nt, d) + X_t : ndarray, shape (nt, d) samples in the target domain reg : float Regularization term >0 - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (nt,) samples weights in the target domain numItermax : int, optional Max number of iterations @@ -1544,30 +1539,26 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Regularized optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters Examples -------- - >>> n_s = 2 >>> n_t = 4 >>> reg = 0.1 >>> X_s = np.reshape(np.arange(n_s), (n_s, 1)) >>> X_t = np.reshape(np.arange(0, n_t), (n_t, 1)) - >>> emp_sinkhorn_div = empirical_sinkhorn_divergence(X_s, X_t, reg) - >>> print(emp_sinkhorn_div) - >>> [2.99977435] + >>> empirical_sinkhorn_divergence(X_s, X_t, reg) # doctest: +ELLIPSIS + array([1.499...]) References ---------- - .. 
[23] Aude Genevay, Gabriel Peyré, Marco Cuturi, Learning Generative Models with Sinkhorn Divergences, Proceedings of the Twenty-First International Conference on Artficial Intelligence and Statistics, (AISTATS) 21, 2018 ''' if log: @@ -41,15 +41,15 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, where : - M is the (ns,nt) metric cost matrix - - :math:`\Omega_e` is the entropic regularization term - :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - :math:`\Omega_g` is the group lasso regulaization term + - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e + (\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\Omega_g` is the group lasso regularization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1` where :math:`\mathcal{I}_c` are the index of samples from class c in the source domain. - a and b are source and target weights (sum to 1) - The algorithm used for solving the problem is the generalised conditional + The algorithm used for solving the problem is the generalized conditional gradient as proposed in [5]_ [7]_ diff --git a/ot/datasets.py b/ot/datasets.py index e76e75d..ba0cfd9 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -17,7 +17,6 @@ def make_1D_gauss(n, m, s): Parameters ---------- - n : int number of bins in the histogram m : float @@ -25,12 +24,10 @@ def make_1D_gauss(n, m, s): s : float standard deviaton of the gaussian distribution - Returns ------- - h : np.array (n,) - 1D histogram for a gaussian distribution - + h : ndarray (n,) + 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) h = np.exp(-(x - m)**2 / (2 * s**2)) @@ -44,16 +41,15 @@ def get_1D_gauss(n, m, sigma): def make_2D_samples_gauss(n, m, sigma, random_state=None): - """return n samples drawn from 2D gaussian N(m,sigma) + """Return n samples drawn from 2D gaussian N(m,sigma) Parameters ---------- - n : int number of samples to make - m : np.array (2,) + m : ndarray, shape (2,) mean value of the gaussian distribution - sigma : np.array (2,2) + sigma : ndarray, shape (2, 2) covariance matrix of the gaussian distribution random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -63,9 +59,8 @@ def make_2D_samples_gauss(n, m, sigma, random_state=None): Returns ------- - X : np.array (n,2) - n samples drawn from N(m,sigma) - + X : ndarray, shape (n, 2) + n samples drawn from N(m, sigma). """ generator = check_random_state(random_state) @@ -86,11 +81,10 @@ def get_2D_samples_gauss(n, m, sigma, random_state=None): def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): - """ dataset generation for classification problems + """Dataset generation for classification problems Parameters ---------- - dataset : str type of classification problem (see code) n : int @@ -105,13 +99,11 @@ def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): Returns ------- - X : np.array (n,d) - n observation of size d - y : np.array (n,) - labels of the samples - + X : ndarray, shape (n, d) + n observation of size d + y : ndarray, shape (n,) + labels of the samples. """ - generator = check_random_state(random_state) if dataset.lower() == '3gauss': @@ -1,6 +1,12 @@ # -*- coding: utf-8 -*- """ Dimension reduction with optimal transport + + +.. warning:: + Note that by default the module is not import in :mod:`ot`. 
In order to + use it you need to explicitely import :mod:`ot.dr` + """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -43,30 +49,25 @@ def split_classes(X, y): def fda(X, y, p=2, reg=1e-16): - """ - Fisher Discriminant Analysis - + """Fisher Discriminant Analysis Parameters ---------- - X : numpy.ndarray (n,d) - Training samples - y : np.ndarray (n,) - labels for training samples + X : ndarray, shape (n, d) + Training samples. + y : ndarray, shape (n,) + Labels for training samples. p : int, optional - size of dimensionnality reduction + Size of dimensionnality reduction. reg : float, optional Regularization term >0 (ridge regularization) - Returns ------- - P : (d x p) ndarray + P : ndarray, shape (d, p) Optimal transportation matrix for the given parameters - proj : fun + proj : callable projection function including mean centering - - """ mx = np.mean(X) @@ -124,37 +125,33 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None): Parameters ---------- - X : numpy.ndarray (n,d) - Training samples - y : np.ndarray (n,) - labels for training samples + X : ndarray, shape (n, d) + Training samples. + y : ndarray, shape (n,) + Labels for training samples. p : int, optional - size of dimensionnality reduction + Size of dimensionnality reduction. reg : float, optional Regularization term >0 (entropic regularization) - solver : str, optional - None for steepest decsent or 'TrustRegions' for trust regions algorithm - else shoudl be a pymanopt.solvers - P0 : numpy.ndarray (d,p) - Initial starting point for projection + solver : None | str, optional + None for steepest descent or 'TrustRegions' for trust regions algorithm + else should be a pymanopt.solvers + P0 : ndarray, shape (d, p) + Initial starting point for projection. verbose : int, optional - Print information along iterations - - + Print information along iterations. Returns ------- - P : (d x p) ndarray + P : ndarray, shape (d, p) Optimal transportation matrix for the given parameters - proj : fun - projection function including mean centering - + proj : callable + Projection function including mean centering. References ---------- - - .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). Wasserstein Discriminant Analysis. arXiv preprint arXiv:1608.08063. - + .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). + Wasserstein Discriminant Analysis. arXiv preprint arXiv:1608.08063. """ # noqa mx = np.mean(X) diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py index deda6b1..1ab95bb 100644 --- a/ot/gpu/__init__.py +++ b/ot/gpu/__init__.py @@ -5,11 +5,15 @@ This module provides GPU implementation for several OT solvers and utility functions. The GPU backend in handled by `cupy <https://cupy.chainer.org/>`_. +.. warning:: + Note that by default the module is not import in :mod:`ot`. In order to + use it you need to explicitely import :mod:`ot.gpu` . + By default, the functions in this module accept and return numpy arrays in order to proide drop-in replacement for the other POT function but the transfer between CPU en GPU comes with a significant overhead. -In order to get the best erformances, we recommend to give only cupy +In order to get the best performances, we recommend to give only cupy arrays to the functions and desactivate the conversion to numpy of the result of the function with parameter ``to_numpy=False``. 
diff --git a/ot/gpu/bregman.py b/ot/gpu/bregman.py index 978b307..2e2df83 100644 --- a/ot/gpu/bregman.py +++ b/ot/gpu/bregman.py @@ -70,17 +70,6 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, log : dict log dictionary return only if log==True in parameters - Examples - -------- - - >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) - References ---------- diff --git a/ot/gromov.py b/ot/gromov.py index ca96b31..699ae4c 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -1,9 +1,6 @@ -
# -*- coding: utf-8 -*-
"""
Gromov-Wasserstein transport method
-
-
"""
# Author: Erwan Vautier <erwan.vautier@gmail.com>
@@ -22,7 +19,7 @@ from .optim import cg def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
- """ Return loss matrices and tensors for Gromov-Wasserstein fast computation
+ """Return loss matrices and tensors for Gromov-Wasserstein fast computation
Returns the value of \mathcal{L}(C1,C2) \otimes T with the selected loss
function as the loss function of Gromov-Wasserstein discrepancy.
@@ -51,29 +48,27 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'): Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
+ Metric cost matrix in the target space
T : ndarray, shape (ns, nt)
- Coupling between source and target spaces
+ Coupling between source and target spaces
p : ndarray, shape (ns,)
-
Returns
-------
-
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ h2(C) matrix in Eq. (6)
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
@@ -114,31 +109,29 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'): def tensor_product(constC, hC1, hC2, T):
- """ Return the tensor for Gromov-Wasserstein fast computation
+ """Return the tensor for Gromov-Wasserstein fast computation
The tensor is computed as described in Proposition 1 Eq. (6) in [12].
Parameters
----------
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
-
+ h2(C) matrix in Eq. (6)
Returns
-------
-
tens : ndarray, shape (ns, nt)
- \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
+ \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
A = -np.dot(hC1, T).dot(hC2.T)
@@ -148,32 +141,31 @@ def tensor_product(constC, hC1, hC2, T): def gwloss(constC, hC1, hC2, T):
- """ Return the Loss for Gromov-Wasserstein
+ """Return the Loss for Gromov-Wasserstein
The loss is computed as described in Proposition 1 Eq. (6) in [12].
Parameters
----------
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ h2(C) matrix in Eq. (6)
T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ Current value of transport matrix T
Returns
-------
-
loss : float
- Gromov Wasserstein loss
+ Gromov Wasserstein loss
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
@@ -183,32 +175,31 @@ def gwloss(constC, hC1, hC2, T): def gwggrad(constC, hC1, hC2, T):
- """ Return the gradient for Gromov-Wasserstein
+ """Return the gradient for Gromov-Wasserstein
The gradient is computed as described in Proposition 2 in [12].
Parameters
----------
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ h2(C) matrix in Eq. (6)
T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ Current value of transport matrix T
Returns
-------
-
grad : ndarray, shape (ns, nt)
Gromov Wasserstein gradient
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
return 2 * tensor_product(constC, hC1, hC2,
@@ -222,19 +213,19 @@ def update_square_loss(p, lambdas, T, Cs): Parameters
----------
- p : ndarray, shape (N,)
- masses in the targeted barycenter
+ p : ndarray, shape (N,)
+ Masses in the targeted barycenter.
lambdas : list of float
- list of the S spaces' weights
- T : list of S np.ndarray(ns,N)
- the S Ts couplings calculated at each iteration
+ List of the S spaces' weights.
+ T : list of S np.ndarray of shape (ns,N)
+ The S Ts couplings calculated at each iteration.
Cs : list of S ndarray, shape(ns,ns)
- Metric cost matrices
+ Metric cost matrices.
Returns
----------
- C : ndarray, shape (nt,nt)
- updated C matrix
+ C : ndarray, shape (nt, nt)
+ Updated C matrix.
"""
tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s])
for s in range(len(T))])
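A small shape sketch (toy couplings assumed): the update divides sum_s lambda_s T_s^T C_s T_s by the outer product p p^T, giving an (N, N) structure matrix:

>>> import numpy as np
>>> import ot
>>> p = ot.unif(4)
>>> Cs = [ot.dist(np.random.randn(6, 2)), ot.dist(np.random.randn(5, 2))]
>>> T = [np.outer(ot.unif(6), p), np.outer(ot.unif(5), p)]  # (ns, N) couplings
>>> C = ot.gromov.update_square_loss(p, [0.5, 0.5], T, Cs)
>>> C.shape
(4, 4)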
@@ -251,12 +242,12 @@ def update_kl_loss(p, lambdas, T, Cs): Parameters
----------
p : ndarray, shape (N,)
- weights in the targeted barycenter
+ Weights in the targeted barycenter.
lambdas : list of the S spaces' weights
- T : list of S np.ndarray(ns,N)
- the S Ts couplings calculated at each iteration
+ T : list of S np.ndarray of shape (ns,N)
+ The S Ts couplings calculated at each iteration.
Cs : list of S ndarray, shape(ns,ns)
- Metric cost matrices
+ Metric cost matrices.
Returns
----------
@@ -277,27 +268,27 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs The function solves the following optimization problem:
.. math::
- \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+ - H : entropy
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
- distribution in the source space
- q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string
+ Metric cost matrix in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space
+ q : ndarray, shape (nt,)
+ Distribution in the target space
+ loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
max_iter : int, optional
@@ -312,15 +303,15 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs If True the steps of the line-search is found via an armijo research. Else closed form is used.
If there is convergence issues use False.
**kwargs : dict
- parameters can be directly pased to the ot.optim.cg solver
+ parameters can be directly passed to the ot.optim.cg solver
Returns
-------
T : ndarray, shape (ns, nt)
- coupling between the two spaces that minimizes :
+ Coupling between the two spaces that minimizes:
\sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
log : dict
- convergence information and loss
+ Convergence information and loss.
References
----------
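A minimal usage sketch (the toy point clouds and uniform weights are assumptions for illustration; normalizing the cost matrices is a common precaution, not a requirement of the patch):

>>> import numpy as np
>>> import ot
>>> C1 = ot.dist(np.random.randn(10, 2)); C1 /= C1.max()
>>> C2 = ot.dist(np.random.randn(8, 3)); C2 /= C2.max()
>>> p, q = ot.unif(10), ot.unif(8)
>>> T = ot.gromov.gromov_wasserstein(C1, C2, p, q, 'square_loss')
>>> T.shape  # coupling with marginals p and q
(10, 8)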
@@ -355,31 +346,37 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
"""
Computes the FGW transport between two graphs, see [24]
+
.. math::
- \gamma = arg\min_\gamma (1-\alpha)*<\gamma,M>_F + alpha* \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \gamma = arg\min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
+ L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+
s.t. \gamma 1 = p
\gamma^T 1= q
\gamma\geq 0
+
where :
- M is the (ns,nt) metric cost matrix
- :math:`f` is the regularization term ( and df is its gradient)
- a and b are source and target weights (sum to 1)
- L is a loss function to account for the misfit between the similarity matrices
- The algorithm used for solving the problem is conditional gradient as discussed in [1]_
+
+ The algorithm used for solving the problem is conditional gradient as discussed in [24]_
+
Parameters
----------
- M : ndarray, shape (ns, nt)
- Metric cost matrix between features across domains
+ M : ndarray, shape (ns, nt)
+ Metric cost matrix between features across domains
C1 : ndarray, shape (ns, ns)
- Metric cost matrix respresentative of the structure in the source space
+ Metric cost matrix representative of the structure in the source space
C2 : ndarray, shape (nt, nt)
- Metric cost matrix espresentative of the structure in the target space
- p : ndarray, shape (ns,)
- distribution in the source space
- q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string,optional
- loss function used for the solver
+ Metric cost matrix representative of the structure in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space
+ q : ndarray, shape (nt,)
+ Distribution in the target space
+ loss_fun : str, optional
+ Loss function used for the solver
max_iter : int, optional
Max number of iterations
tol : float, optional
@@ -392,19 +389,21 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, If True the steps of the line-search is found via an armijo research. Else closed form is used.
If there is convergence issues use False.
**kwargs : dict
- parameters can be directly pased to the ot.optim.cg solver
+ parameters can be directly passed to the ot.optim.cg solver
+
Returns
-------
- gamma : (ns x nt) ndarray
- Optimal transportation matrix for the given parameters
+ gamma : ndarray, shape (ns, nt)
+ Optimal transportation matrix for the given parameters.
log : dict
- log dictionary return only if log==True in parameters
+ Log dictionary return only if log==True in parameters.
+
References
----------
.. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
- and Courty Nicolas
- "Optimal Transport for structured data with application on graphs"
- International Conference on Machine Learning (ICML). 2019.
+ and Courty Nicolas "Optimal Transport for structured data with
+ application on graphs", International Conference on Machine Learning
+ (ICML). 2019.
"""
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
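A minimal usage sketch (toy graphs with random features; all data below is illustrative, not from the patch). M compares node features across the two domains while C1 and C2 encode within-domain structure, and alpha trades the two terms off:

>>> import numpy as np
>>> import ot
>>> ys, yt = np.random.randn(10, 3), np.random.randn(8, 3)  # node features
>>> M = ot.dist(ys, yt)  # feature cost across domains
>>> C1, C2 = ot.dist(np.random.randn(10, 2)), ot.dist(np.random.randn(8, 2))
>>> p, q = ot.unif(10), ot.unif(8)
>>> gamma = ot.gromov.fused_gromov_wasserstein(M, C1, C2, p, q, 'square_loss', alpha=0.5)
>>> gamma.shape
(10, 8)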
@@ -428,31 +427,37 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
"""
Computes the FGW distance between two graphs, see [24]
+
.. math::
- \gamma = arg\min_\gamma (1-\alpha)*<\gamma,M>_F + alpha* \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
+ L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+
+
s.t. \gamma 1 = p
\gamma^T 1= q
\gamma\geq 0
+
where :
- M is the (ns,nt) metric cost matrix
- :math:`f` is the regularization term ( and df is its gradient)
- a and b are source and target weights (sum to 1)
- L is a loss function to account for the misfit between the similarity matrices
The algorithm used for solving the problem is conditional gradient as discussed in [1]_
+
Parameters
----------
- M : ndarray, shape (ns, nt)
- Metric cost matrix between features across domains
+ M : ndarray, shape (ns, nt)
+ Metric cost matrix between features across domains
C1 : ndarray, shape (ns, ns)
- Metric cost matrix respresentative of the structure in the source space
+ Metric cost matrix representative of the structure in the source space.
C2 : ndarray, shape (nt, nt)
- Metric cost matrix espresentative of the structure in the target space
+ Metric cost matrix representative of the structure in the target space.
p : ndarray, shape (ns,)
- distribution in the source space
+ Distribution in the source space.
q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string,optional
- loss function used for the solver
+ Distribution in the target space.
+ loss_fun : str, optional
+ Loss function used for the solver.
max_iter : int, optional
Max number of iterations
tol : float, optional
@@ -460,22 +465,24 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 verbose : bool, optional
Print information along iterations
log : bool, optional
- record log if True
+ Record log if True.
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
+ If True the step of the line-search is found via an armijo search.
+ Else closed form is used. If there are convergence issues use False.
**kwargs : dict
- parameters can be directly pased to the ot.optim.cg solver
+ Parameters can be directly passed to the ot.optim.cg solver.
+
Returns
-------
- gamma : (ns x nt) ndarray
- Optimal transportation matrix for the given parameters
+ gamma : ndarray, shape (ns, nt)
+ Optimal transportation matrix for the given parameters.
log : dict
- log dictionary return only if log==True in parameters
+ Log dictionary return only if log==True in parameters.
+
References
----------
.. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
- and Courty Nicolas
+ and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
@@ -506,29 +513,28 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg The function solves the following optimization problem:
.. math::
- \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+ - H : entropy
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
- distribution in the source space
+ Metric cost matrix in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space.
q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string
+ Distribution in the target space.
+ loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
-
max_iter : int, optional
Max number of iterations
tol : float, optional
@@ -540,6 +546,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg armijo : bool, optional
If True the steps of the line-search is found via an armijo research. Else closed form is used.
If there is convergence issues use False.
+
Returns
-------
gw_dist : float
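Unlike gromov_wasserstein, this variant returns the discrepancy value itself. A toy sanity sketch (data assumed; since GW is non-convex the solver is only expected, not guaranteed, to reach the global optimum):

>>> import numpy as np
>>> import ot
>>> C1 = ot.dist(np.random.randn(6, 2)); C1 /= C1.max()
>>> p = ot.unif(6)
>>> gw_dist = ot.gromov.gromov_wasserstein2(C1, C1.copy(), p, p, 'square_loss')
>>> # expected to be close to zero for two copies of the same metric space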
@@ -587,56 +594,55 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon, The function solves the following optimization problem:
.. math::
- \GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
- s.t. \GW 1 = p
+ s.t. T 1 = p
- \GW^T 1= q
+ T^T 1= q
- \GW\geq 0
+ T\geq 0
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+ - H : entropy
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
+ Metric cost matrix in the target space
p : ndarray, shape (ns,)
- distribution in the source space
+ Distribution in the source space
q : ndarray, shape (nt,)
- distribution in the target space
+ Distribution in the target space
loss_fun : string
- loss function used for the solver either 'square_loss' or 'kl_loss'
+ Loss function used for the solver either 'square_loss' or 'kl_loss'
epsilon : float
Regularization term >0
max_iter : int, optional
- Max number of iterations
+ Max number of iterations
tol : float, optional
Stop threshold on error (>0)
verbose : bool, optional
Print information along iterations
log : bool, optional
- record log if True
+ Record log if True.
Returns
-------
T : ndarray, shape (ns, nt)
- coupling between the two spaces that minimizes :
- \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ Optimal coupling between the two spaces
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
@@ -695,28 +701,28 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, The function solves the following optimization problem:
.. math::
- \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+ - H : entropy
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
+ Metric cost matrix in the target space
p : ndarray, shape (ns,)
- distribution in the source space
+ Distribution in the source space
q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string
- loss function used for the solver either 'square_loss' or 'kl_loss'
+ Distribution in the target space
+ loss_fun : str
+ Loss function used for the solver either 'square_loss' or 'kl_loss'
epsilon : float
Regularization term >0
max_iter : int, optional
@@ -726,7 +732,7 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, verbose : bool, optional
Print information along iterations
log : bool, optional
- record log if True
+ Record log if True.
Returns
-------
@@ -736,11 +742,10 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon, References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
-
gw, logv = entropic_gromov_wasserstein(
C1, C2, p, q, loss_fun, epsilon, max_iter, tol, verbose, log=True)
@@ -762,29 +767,31 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, The function solves the following optimization problem:
.. math::
- C = argmin_C\in R^{NxN} \sum_s \lambda_s GW(C,Cs,p,ps)
+ C = argmin_{C\in R^{NxN}} \sum_s \lambda_s GW(C,C_s,p,p_s)
Where :
- Cs : metric cost matrix
- ps : distribution
+ - :math:`C_s` : metric cost matrix
+ - :math:`p_s` : distribution
Parameters
----------
- N : Integer
- Size of the targeted barycenter
- Cs : list of S np.ndarray(ns,ns)
- Metric cost matrices
- ps : list of S np.ndarray(ns,)
- sample weights in the S spaces
- p : ndarray, shape(N,)
- weights in the targeted barycenter
+ N : int
+ Size of the targeted barycenter
+ Cs : list of S np.ndarray of shape (ns,ns)
+ Metric cost matrices
+ ps : list of S np.ndarray of shape (ns,)
+ Sample weights in the S spaces
+ p : ndarray, shape(N,)
+ Weights in the targeted barycenter
lambdas : list of float
- list of the S spaces' weights
- loss_fun : tensor-matrix multiplication function based on specific loss function
- update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
- with the S Ts couplings calculated at each iteration
+ List of the S spaces' weights.
+ loss_fun : callable
+ Tensor-matrix multiplication function based on specific loss function.
+ update : callable
+ function(p,lambdas,T,Cs) that updates C according to a specific Kernel
+ with the S Ts couplings calculated at each iteration
epsilon : float
Regularization term >0
max_iter : int, optional
@@ -792,11 +799,11 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, tol : float, optional
Stop threshold on error (>0)
verbose : bool, optional
- Print information along iterations
+ Print information along iterations.
log : bool, optional
- record log if True
- init_C : bool, ndarray, shape(N,N)
- random initial value for the C matrix provided by user
+ Record log if True.
+ init_C : bool | ndarray, shape (N, N)
+ Random initial value for the C matrix provided by user.
Returns
-------
@@ -806,9 +813,8 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
S = len(Cs)
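A minimal usage sketch with two toy spaces of different sizes (all data below is assumed for illustration):

>>> import numpy as np
>>> import ot
>>> Cs = [ot.dist(np.random.randn(n, 2)) for n in (6, 7)]
>>> Cs = [C / C.max() for C in Cs]
>>> ps = [ot.unif(C.shape[0]) for C in Cs]
>>> p = ot.unif(5)
>>> C = ot.gromov.entropic_gromov_barycenters(5, Cs, ps, p, [0.5, 0.5],
...                                           'square_loss', epsilon=5e-2, max_iter=50)
>>> C.shape
(5, 5)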
@@ -818,6 +824,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, # Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
+ # XXX use random state
xalea = np.random.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
@@ -829,7 +836,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon, error = []
- while(err > tol and cpt < max_iter):
+ while (err > tol) and (cpt < max_iter):
Cprev = C
T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon,
@@ -873,37 +880,36 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, .. math::
C = argmin_C\in R^NxN \sum_s \lambda_s GW(C,Cs,p,ps)
-
Where :
- Cs : metric cost matrix
- ps : distribution
+ - Cs : metric cost matrix
+ - ps : distribution
Parameters
----------
- N : Integer
- Size of the targeted barycenter
- Cs : list of S np.ndarray(ns,ns)
- Metric cost matrices
- ps : list of S np.ndarray(ns,)
- sample weights in the S spaces
- p : ndarray, shape(N,)
- weights in the targeted barycenter
+ N : int
+ Size of the targeted barycenter
+ Cs : list of S np.ndarray of shape (ns, ns)
+ Metric cost matrices
+ ps : list of S np.ndarray of shape (ns,)
+ Sample weights in the S spaces
+ p : ndarray, shape (N,)
+ Weights in the targeted barycenter
lambdas : list of float
- list of the S spaces' weights
+ List of the S spaces' weights
loss_fun : tensor-matrix multiplication function based on specific loss function
update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
with the S Ts couplings calculated at each iteration
max_iter : int, optional
Max number of iterations
tol : float, optional
- Stop threshol on error (>0)
+ Stop threshold on error (>0).
verbose : bool, optional
- Print information along iterations
+ Print information along iterations.
log : bool, optional
- record log if True
- init_C : bool, ndarray, shape(N,N)
- random initial value for the C matrix provided by user
+ Record log if True.
+ init_C : bool | ndarray, shape(N,N)
+ Random initial value for the C matrix provided by user.
Returns
-------
@@ -913,11 +919,10 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
-
S = len(Cs)
Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
@@ -925,6 +930,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, # Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
+ # XXX : should use a random state and not use the global seed
xalea = np.random.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
@@ -970,47 +976,52 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False,
p=None, loss_fun='square_loss', max_iter=100, tol=1e-9,
verbose=False, log=False, init_C=None, init_X=None):
- """
- Compute the fgw barycenter as presented eq (5) in [24].
+ """Compute the fgw barycenter as presented eq (5) in [24].
+
+ Parameters
----------
N : integer
Desired number of samples of the target barycenter
Ys: list of ndarray, each element has shape (ns,d)
Features of all samples
Cs : list of ndarray, each element has shape (ns,ns)
- Structure matrices of all samples
+ Structure matrices of all samples
ps : list of ndarray, each element has shape (ns,)
- masses of all samples
+ Masses of all samples.
lambdas : list of float
- list of the S spaces' weights
+ List of the S spaces' weights
alpha : float
- Alpha parameter for the fgw distance
- fixed_structure : bool
- Wether to fix the structure of the barycenter during the updates
- fixed_features : bool
- Wether to fix the feature of the barycenter during the updates
- init_C : ndarray, shape (N,N), optional
- initialization for the barycenters' structure matrix. If not set random init
- init_X : ndarray, shape (N,d), optional
- initialization for the barycenters' features. If not set random init
+ Alpha parameter for the fgw distance
+ fixed_structure : bool
+ Whether to fix the structure of the barycenter during the updates
+ fixed_features : bool
+ Whether to fix the features of the barycenter during the updates
+ init_C : ndarray, shape (N,N), optional
+ Initialization for the barycenters' structure matrix. If not set
+ a random init is used.
+ init_X : ndarray, shape (N,d), optional
+ Initialization for the barycenters' features. If not set a
+ random init is used.
+
Returns
- ----------
- X : ndarray, shape (N,d)
+ -------
+ X : ndarray, shape (N, d)
Barycenters' features
- C : ndarray, shape (N,N)
+ C : ndarray, shape (N, N)
Barycenters' structure matrix
- log_: dictionary
- Only returned when log=True
+ log_: dict
+ Only returned when log=True. It contains the keys:
T : list of (N,ns) transport matrices
- Ms : all distance matrices between the feature of the barycenter and the other features dist(X,Ys) shape (N,ns)
+ Ms : all distance matrices between the feature of the barycenter and the
+ other features dist(X,Ys) shape (N,ns)
+
References
----------
.. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
- and Courty Nicolas
+ and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
-
S = len(Cs)
d = Ys[0].shape[1] # dimension on the node features
if p is None:
@@ -1073,7 +1084,8 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ T_temp = [t.T for t in T]
C = update_sructure_matrix(p, lambdas, T_temp, Cs)
- T = [fused_gromov_wasserstein((1 - alpha) * Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha, numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
+ T = [fused_gromov_wasserstein((1 - alpha) * Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
+ numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
# T is N,ns
err_feature = np.linalg.norm(X - Xprev.reshape(N, d))
@@ -1092,6 +1104,7 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ print('{:5d}|{:8e}|'.format(cpt, err_feature))
cpt += 1
+
if log:
log_['T'] = T # from target to Ys
log_['p'] = p
@@ -1104,23 +1117,25 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ def update_sructure_matrix(p, lambdas, T, Cs):
- """
- Updates C according to the L2 Loss kernel with the S Ts couplings
- calculated at each iteration
+ """Updates C according to the L2 Loss kernel with the S Ts couplings.
+
+ The couplings are calculated at each iteration.
+
Parameters
----------
- p : ndarray, shape (N,)
- masses in the targeted barycenter
+ p : ndarray, shape (N,)
+ Masses in the targeted barycenter.
lambdas : list of float
- list of the S spaces' weights
- T : list of S np.ndarray(ns,N)
- the S Ts couplings calculated at each iteration
- Cs : list of S ndarray, shape(ns,ns)
- Metric cost matrices
+ List of the S spaces' weights.
+ T : list of S ndarray of shape (ns, N)
+ The S Ts couplings calculated at each iteration.
+ Cs : list of S ndarray, shape (ns, ns)
+ Metric cost matrices.
+
Returns
- ----------
- C : ndarray, shape (nt,nt)
- updated C matrix
+ -------
+ C : ndarray, shape (nt, nt)
+ Updated C matrix.
"""
tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s]) for s in range(len(T))])
ppt = np.outer(p, p)
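For reference, tmpsum and ppt are the numerator and denominator of the closed-form L2 update of [24]; the elementwise division itself sits in the unchanged remainder of the function:

.. math::
    C \leftarrow \frac{\sum_s \lambda_s T_s^\top C_s T_s}{p p^\top}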
@@ -1129,22 +1144,26 @@ def update_sructure_matrix(p, lambdas, T, Cs): def update_feature_matrix(lambdas, Ys, Ts, p):
- """
- Updates the feature with respect to the S Ts couplings. See "Solving the barycenter problem with Block Coordinate Descent (BCD)" in [24]
- calculated at each iteration
+ """Updates the feature with respect to the S Ts couplings.
+
+
+ See "Solving the barycenter problem with Block Coordinate Descent (BCD)"
+ in [24] calculated at each iteration
+
Parameters
----------
- p : ndarray, shape (N,)
- masses in the targeted barycenter
+ p : ndarray, shape (N,)
+ Masses in the targeted barycenter.
lambdas : list of float
- list of the S spaces' weights
+ List of the S spaces' weights
Ts : list of S np.ndarray(ns,N)
the S Ts couplings calculated at each iteration
Ys : list of S ndarray, shape(d,ns)
- The features
+ The features.
+
Returns
- ----------
- X : ndarray, shape (d,N)
+ -------
+ X : ndarray, shape (d, N)
References
----------
@@ -1153,9 +1172,8 @@ def update_feature_matrix(lambdas, Ys, Ts, p): "Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
- p = np.diag(np.array(1 / p).reshape(-1,))
-
- tmpsum = sum([lambdas[s] * np.dot(Ys[s], Ts[s].T).dot(p) for s in range(len(Ts))])
+ p = np.array(1. / p).reshape(-1,)
+ tmpsum = sum([lambdas[s] * np.dot(Ys[s], Ts[s].T) * p[None, :] for s in range(len(Ts))])
return tmpsum
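The rewrite above drops the dense diag(1/p) factor in favour of broadcasting; both forms compute Ys[s].dot(Ts[s].T).dot(diag(1/p)), i.e. a column-wise rescaling, which a quick check confirms (shapes are illustrative only):

>>> import numpy as np
>>> rng = np.random.RandomState(0)
>>> Y, T, p = rng.randn(3, 5), rng.rand(4, 5), rng.rand(4)  # (d, ns), (N, ns), (N,)
>>> old = Y.dot(T.T).dot(np.diag(1. / p))
>>> new = Y.dot(T.T) * (1. / p)[None, :]
>>> np.allclose(old, new)
True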
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 02cbd8c..17f1731 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- """ Solvers for the original linear program OT problem + + + """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -10,20 +13,22 @@ Solvers for the original linear program OT problem import multiprocessing import numpy as np +from scipy.sparse import coo_matrix from .import cvx # import compiled emd -from .emd_wrap import emd_c, check_result +from .emd_wrap import emd_c, check_result, emd_1d_sorted from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx'] +__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', + 'emd_1d', 'emd2_1d', 'wasserstein_1d'] def emd(a, b, M, numItermax=100000, log=False): - """Solves the Earth Movers distance problem and returns the OT matrix + r"""Solves the Earth Movers distance problem and returns the OT matrix .. math:: @@ -37,26 +42,30 @@ def emd(a, b, M, numItermax=100000, log=False): - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order numpy.array in float64 + format. + Uses the algorithm proposed in [1]_ Parameters ---------- - a : (ns,) ndarray, float64 - Source histogram (uniform weigth if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weigth if empty list) - M : (ns,nt) ndarray, float64 - loss matrix + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) numItermax : int, optional (default=100000) The maximum number of iterations before stopping the optimization algorithm if it has not converged. - log: boolean, optional (default=False) + log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. Returns ------- - gamma: (ns x nt) ndarray + gamma: (ns x nt) numpy.ndarray Optimal transportation matrix for the given parameters log: dict If input log is true, a dictionary containing the cost and dual @@ -74,8 +83,8 @@ def emd(a, b, M, numItermax=100000, log=False): >>> b=[.5,.5] >>> M=[[0.,1.],[1.,0.]] >>> ot.emd(a,b,M) - array([[ 0.5, 0. ], - [ 0. , 0.5]]) + array([[0.5, 0. ], + [0. , 0.5]]) References ---------- @@ -94,7 +103,7 @@ def emd(a, b, M, numItermax=100000, log=False): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use unifor distributions + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] if len(b) == 0: @@ -115,7 +124,7 @@ def emd(a, b, M, numItermax=100000, log=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), numItermax=100000, log=False, return_matrix=False): - """Solves the Earth Movers distance problem and returns the loss + r"""Solves the Earth Movers distance problem and returns the loss .. math:: \gamma = arg\min_\gamma <\gamma,M>_F @@ -128,16 +137,20 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order numpy.array in float64 + format. 
+ Uses the algorithm proposed in [1]_
Parameters
----------
- a : (ns,) ndarray, float64
- Source histogram (uniform weigth if empty list)
- b : (nt,) ndarray, float64
- Target histogram (uniform weigth if empty list)
- M : (ns,nt) ndarray, float64
- loss matrix
+ a : (ns,) numpy.ndarray, float64
+ Source histogram (uniform weight if empty list)
+ b : (nt,) numpy.ndarray, float64
+ Target histogram (uniform weight if empty list)
+ M : (ns,nt) numpy.ndarray, float64
+ Loss matrix (c-order array with type float64)
numItermax : int, optional (default=100000)
The maximum number of iterations before stopping the optimization
algorithm if it has not converged.
@@ -151,7 +164,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
-------
gamma: (ns x nt) ndarray
Optimal transportation matrix for the given parameters
log: dict
If input log is true, a dictionary containing the cost and dual
variables and exit status
@@ -187,7 +200,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
b = np.asarray(b, dtype=np.float64)
M = np.asarray(M, dtype=np.float64)
- # if empty array given then use unifor distributions
+ # if empty array given then use uniform distributions
if len(a) == 0:
a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
if len(b) == 0:
@@ -231,9 +244,9 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
Parameters
----------
- measures_locations : list of (k_i,d) np.ndarray
+ measures_locations : list of (k_i,d) numpy.ndarray
The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list)
- measures_weights : list of (k_i,) np.ndarray
+ measures_weights : list of (k_i,) numpy.ndarray
Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure
X_init : (k,d) np.ndarray
@@ -246,7 +259,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
numItermax : int, optional
Max number of iterations
stopThr : float, optional
- Stop threshol on error (>0)
+ Stop threshold on error (>0)
verbose : bool, optional
Print information along iterations
log : bool, optional
@@ -308,4 +321,288 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
log_dict['displacement_square_norms'] = displacement_square_norms
return X, log_dict
else:
- return X
\ No newline at end of file
+ return X
+
+
+def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+ log=False):
+ r"""Solves the Earth Movers distance problem between 1d measures and returns
+ the OT matrix
+
+
+ .. math::
+ \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+ where :
+
+ - d is the metric
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (nt, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ metric: str, optional (default='sqeuclidean')
+ Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+ Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply if metric='minkowski'
+ dense: boolean, optional (default=True)
+ If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format. Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ log: boolean, optional (default=False)
+ If True, returns a dictionary containing the cost.
+ Otherwise returns only the optimal transportation matrix.
+
+ Returns
+ -------
+ gamma: (ns, nt) ndarray
+ Optimal transportation matrix for the given parameters
+ log: dict
+ If input log is True, a dictionary containing the cost
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function emd_1d accepts lists and
+ performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.emd_1d(x_a, x_b, a, b)
+ array([[0. , 0.5],
+ [0.5, 0. ]])
+ >>> ot.emd_1d(x_a, x_b)
+ array([[0. , 0.5],
+ [0.5, 0. ]])
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+ Transport", 2018.
+
+ See Also
+ --------
+ ot.lp.emd : EMD for multidimensional distributions
+ ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the
+ transportation matrix)
+ """
+ a = np.asarray(a, dtype=np.float64)
+ b = np.asarray(b, dtype=np.float64)
+ x_a = np.asarray(x_a, dtype=np.float64)
+ x_b = np.asarray(x_b, dtype=np.float64)
+
+ assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \
+ "emd_1d should only be used with monodimensional data"
+ assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \
+ "emd_1d should only be used with monodimensional data"
+
+ # if empty array given then use uniform distributions
+ if a.ndim == 0 or len(a) == 0:
+ a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0]
+ if b.ndim == 0 or len(b) == 0:
+ b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]
+
+ x_a_1d = x_a.reshape((-1, ))
+ x_b_1d = x_b.reshape((-1, ))
+ perm_a = np.argsort(x_a_1d)
+ perm_b = np.argsort(x_b_1d)
+
+ G_sorted, indices, cost = emd_1d_sorted(a, b,
+ x_a_1d[perm_a], x_b_1d[perm_b],
+ metric=metric, p=p)
+ G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
+ shape=(a.shape[0], b.shape[0]))
+ if dense:
+ G = G.toarray()
+ if log:
+ log = {'cost': cost}
+ return G, log
+ return G
+
+
+def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+ log=False):
+ r"""Solves the Earth Movers distance problem between 1d measures and returns
+ the loss
+
+
+ .. math::
+ \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+ where :
+
+ - d is the metric
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (nt, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ metric: str, optional (default='sqeuclidean')
+ Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+ Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply if metric='minkowski'
+ dense: boolean, optional (default=True)
+ If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format. Only used if log is set to True. Due to implementation details,
+ this function runs faster when dense is set to False.
+ log: boolean, optional (default=False)
+ If True, returns a dictionary containing the transportation matrix.
+ Otherwise returns only the loss.
+
+ Returns
+ -------
+ loss: float
+ Cost associated to the optimal transportation
+ log: dict
+ If input log is True, a dictionary containing the Optimal transportation
+ matrix for the given parameters
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function emd2_1d accepts lists and
+ performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.emd2_1d(x_a, x_b, a, b)
+ 0.5
+ >>> ot.emd2_1d(x_a, x_b)
+ 0.5
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+ Transport", 2018.
+
+ See Also
+ --------
+ ot.lp.emd2 : EMD for multidimensional distributions
+ ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix
+ instead of the cost)
+ """
+ # If we do not return G (log==False), then we should not cast it to dense
+ # (useless overhead)
+ G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p,
+ dense=dense and log, log=True)
+ cost = log_emd['cost']
+ if log:
+ log_emd = {'G': G}
+ return cost, log_emd
+ return cost
+
+
+def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.):
+ r"""Solves the p-Wasserstein distance problem between 1d measures and returns
+ the distance
+
+ .. math::
+ \min_\gamma \left( \sum_i \sum_j \gamma_{ij} \|x_a[i] - x_b[j]\|^p \right)^{1/p}
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+
+ where :
+
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (nt, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ p: float, optional (default=1.0)
+ The order of the p-Wasserstein distance to be computed
+
+ Returns
+ -------
+ dist: float
+ p-Wasserstein distance
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function wasserstein_1d accepts
+ lists and performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.wasserstein_1d(x_a, x_b, a, b)
+ 0.5
+ >>> ot.wasserstein_1d(x_a, x_b)
+ 0.5
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+ Transport", 2018.
+
+ See Also
+ --------
+ ot.lp.emd_1d : EMD for 1d distributions
+ """
+ cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+ dense=False, log=False)
+ return np.power(cost_emd, 1. / p)
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 83ee6aa..2b6c495 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -10,7 +10,10 @@ Cython linker with C solver
import numpy as np
cimport numpy as np
+from ..utils import dist
+
cimport cython
+cimport libc.math as math
import warnings
@@ -55,13 +58,16 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
- M is the metric cost matrix
- a and b are the sample weights
+
+ .. warning::
+ Note that the M matrix needs to be a C-order :py:class:`numpy.ndarray`
+
Parameters
----------
- a : (ns,) ndarray, float64
+ a : (ns,) numpy.ndarray, float64
source histogram
- b : (nt,) ndarray, float64
+ b : (nt,) numpy.ndarray, float64
target histogram
- M : (ns,nt) ndarray, float64
+ M : (ns,nt) numpy.ndarray, float64
loss matrix
max_iter : int
The maximum number of iterations before stopping the optimization
@@ -70,7 +76,7 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
Returns
-------
- gamma: (ns x nt) ndarray
+ gamma: (ns x nt) numpy.ndarray
Optimal transportation matrix for the given parameters
"""
@@ -93,3 +99,89 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
cdef int result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
return G, cost, alpha, beta, result_code
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
+ np.ndarray[double, ndim=1, mode="c"] v_weights,
+ np.ndarray[double, ndim=1, mode="c"] u,
+ np.ndarray[double, ndim=1, mode="c"] v,
+ str metric='sqeuclidean',
+ double p=1.):
+ r"""
+ Solves the Earth Movers distance problem between sorted 1d measures and
+ returns the OT matrix and the associated cost
+
+ Parameters
+ ----------
+ u_weights : (ns,) ndarray, float64
+ Source histogram
+ v_weights : (nt,) ndarray, float64
+ Target histogram
+ u : (ns,) ndarray, float64
+ Source dirac locations (on the real line)
+ v : (nt,) ndarray, float64
+ Target dirac locations (on the real line)
+ metric: str, optional (default='sqeuclidean')
+ Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+ Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply if metric='minkowski'
+
+ Returns
+ -------
+ gamma: (n, ) ndarray, float64
+ Values in the Optimal transportation matrix
+ indices: (n, 2) ndarray, int64
+ Indices of the values stored in gamma for the Optimal transportation
+ matrix
+ cost
+ cost associated to the optimal transportation
+ """
+ cdef double cost = 0.
+ cdef int n = u_weights.shape[0]
+ cdef int m = v_weights.shape[0]
+
+ cdef int i = 0
+ cdef double w_i = u_weights[0]
+ cdef int j = 0
+ cdef double w_j = v_weights[0]
+
+ cdef double m_ij = 0.
+
+ cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ),
+ dtype=np.float64)
+ cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2),
+ dtype=np.int)
+ cdef int cur_idx = 0
+ while i < n and j < m:
+ if metric == 'sqeuclidean':
+ m_ij = (u[i] - v[j]) * (u[i] - v[j])
+ elif metric == 'cityblock' or metric == 'euclidean':
+ m_ij = math.fabs(u[i] - v[j])
+ elif metric == 'minkowski':
+ m_ij = math.pow(math.fabs(u[i] - v[j]), p)
+ else:
+ m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)),
+ metric=metric)[0, 0]
+ if w_i < w_j or j == m - 1:
+ cost += m_ij * w_i
+ G[cur_idx] = w_i
+ indices[cur_idx, 0] = i
+ indices[cur_idx, 1] = j
+ i += 1
+ w_j -= w_i
+ w_i = u_weights[i]
+ else:
+ cost += m_ij * w_j
+ G[cur_idx] = w_j
+ indices[cur_idx, 0] = i
+ indices[cur_idx, 1] = j
+ j += 1
+ w_i -= w_j
+ w_j = v_weights[j]
+ cur_idx += 1
+ return G[:cur_idx], indices[:cur_idx], cost
diff --git a/ot/optim.py b/ot/optim.py
index f94aceb..0abd9e9 100644
--- a/ot/optim.py
+++ b/ot/optim.py
@@ -26,14 +26,13 @@ def line_search_armijo(f, xk, pk, gfk, old_fval,
Parameters
----------
-
- f : function
+ f : callable
loss function
- xk : np.ndarray
+ xk : ndarray
initial position
- pk : np.ndarray
+ pk : ndarray
descent direction
- gfk : np.ndarray
+ gfk : ndarray
gradient of f at xk
old_fval : float
loss value at xk
@@ -161,15 +160,15 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200,
Parameters
----------
- a : np.ndarray (ns,)
+ a : ndarray, shape (ns,)
samples weights in the source domain
- b : np.ndarray (nt,)
+ b : ndarray, shape (nt,)
samples in the target domain
- M : np.ndarray (ns,nt)
+ M : ndarray, shape (ns, nt)
loss matrix
reg : float
Regularization term >0
- G0 : np.ndarray (ns,nt), optional
+ G0 : ndarray, shape (ns,nt), optional
initial guess (default is indep joint density)
numItermax : int, optional
Max number of iterations
@@ -299,17 +298,17 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10,
Parameters
----------
- a : np.ndarray (ns,)
+ a : ndarray, shape (ns,)
samples weights in the source domain
- b : np.ndarray (nt,)
+ b : ndarray, shape (nt,)
samples in the target domain
- M : np.ndarray (ns,nt)
+ M : ndarray, shape (ns, nt)
loss matrix
reg1 : float
Entropic Regularization term >0
reg2 : float
Second Regularization term >0
- G0 : np.ndarray (ns,nt), optional
+ G0 : ndarray, shape (ns, nt), optional
initial guess (default is indep joint density)
numItermax : int, optional
Max number of iterations
@@ -326,15 +325,13 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10,
Returns
-------
- gamma : (ns x nt) ndarray
+ gamma : ndarray, shape (ns, nt)
Optimal transportation matrix for the given parameters
log : dict
log dictionary return only if log==True in parameters
-
References
----------
-
.. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
.. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567.
@@ -422,13 +419,12 @@ def solve_1d_linesearch_quad(a, b, c):
Parameters
----------
a,b,c : float
- The coefficients of the quadratic function
+ The coefficients of the quadratic function
Returns
-------
x : float
The optimal value which leads to the minimal cost
-
"""
f0 = c
df0 = b
diff --git a/ot/plot.py b/ot/plot.py
--- a/ot/plot.py
+++ b/ot/plot.py
@@ -1,5 +1,11 @@
"""
Functions for plotting OT matrices
+
+.. warning::
+ Note that by default the module is not imported in :mod:`ot`. In order to
+ use it you need to explicitly import :mod:`ot.plot`
+
+
"""
# Author: Remi Flamary <remi.flamary@unice.fr>
@@ -20,11 +26,11 @@ def plot1D_mat(a, b, M, title=''):
Parameters
----------
- a : np.array, shape (na,)
+ a : ndarray, shape (na,)
Source distribution
- b : np.array, shape (nb,)
+ b : ndarray, shape (nb,)
Target distribution
- M : np.array, shape (na,nb)
+ M : ndarray, shape (na, nb)
Matrix to plot
"""
na, nb = M.shape
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 85c4230..13ed9cc 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -1,3 +1,9 @@
+"""
+Stochastic solvers for regularized OT.
+
+
+"""
+
# Author: Kilian Fatras <kilian.fatras@gmail.com>
#
# License: MIT License
@@ -11,7 +17,7 @@ import numpy as np
def coordinate_grad_semi_dual(b, M, reg, beta, i):
- '''
+ r'''
Compute the coordinate gradient update for regularized discrete distributions for (i, :)
The function computes the gradient of the semi dual problem:
@@ -32,51 +38,49 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
Parameters
----------
-
- b : np.ndarray(nt,)
- target measure
- M : np.ndarray(ns, nt)
- cost matrix
- reg : float nu
- Regularization term > 0
- v : np.ndarray(nt,)
- dual variable
- i : number int
- picked number i
+ b : ndarray, shape (nt,)
+ Target measure.
+ M : ndarray, shape (ns, nt)
+ Cost matrix.
+ reg : float
+ Regularization term > 0.
+ v : ndarray, shape (nt,)
+ Dual variable.
+ i : int
+ Picked number i.
Returns
-------
-
- coordinate gradient : np.ndarray(nt,)
+ coordinate gradient : ndarray, shape (nt,)
Examples
--------
-
+ >>> import ot
+ >>> np.random.seed(0)
>>> n_source = 7
>>> n_target = 4
- >>> reg = 1
- >>> numItermax = 300000
>>> a = ot.utils.unif(n_source)
>>> b = ot.utils.unif(n_target)
- >>> rng = np.random.RandomState(0)
- >>> X_source = rng.randn(n_source, 2)
- >>> Y_target = rng.randn(n_target, 2)
+ >>> X_source = np.random.randn(n_source, 2)
+ >>> Y_target = np.random.randn(n_target, 2)
>>> M = ot.dist(X_source, Y_target)
- >>> method = "ASGD"
- >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
- method, numItermax)
- >>> print(asgd_pi)
+ >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000)
+ array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06],
+ [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03],
+ [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07],
+ [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04],
+ [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01],
+ [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01],
+ [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]])
+
References
----------
-
[Genevay et al., 2016] :
- Stochastic Optimization for Large-scale Optimal Transport,
- Advances in Neural Information Processing Systems (2016),
- arXiv preprint arxiv:1605.08527.
-
+ Stochastic Optimization for Large-scale Optimal Transport,
+ Advances in Neural Information Processing Systems (2016),
+ arXiv preprint arxiv:1605.08527.
''' - r = M[i, :] - beta exp_beta = np.exp(-r / reg) * b khi = exp_beta / (np.sum(exp_beta)) @@ -84,7 +88,7 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): - ''' + r''' Compute the SAG algorithm to solve the regularized discrete measures optimal transport max problem @@ -112,42 +116,43 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): Parameters ---------- - a : np.ndarray(ns,), - source measure - b : np.ndarray(nt,), - target measure - M : np.ndarray(ns, nt), - cost matrix - reg : float number, + a : ndarray, shape (ns,), + Source measure. + b : ndarray, shape (nt,), + Target measure. + M : ndarray, shape (ns, nt), + Cost matrix. + reg : float Regularization term > 0 - numItermax : int number - number of iteration - lr : float number - learning rate + numItermax : int + Number of iteration. + lr : float + Learning rate. Returns ------- - - v : np.ndarray(nt,) - dual variable + v : ndarray, shape (nt,) + Dual variable. Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- @@ -176,7 +181,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): - ''' + r''' Compute the ASGD algorithm to solve the regularized semi continous measures optimal transport max problem The function solves the following optimization problem: @@ -202,50 +207,49 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): Parameters ---------- - - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - numItermax : int number - number of iteration - lr : float number - learning rate - + numItermax : int + Number of iteration. + lr : float + Learning rate. 
Returns ------- - - ave_v : np.ndarray(nt,) + ave_v : ndarray, shape (nt,) dual variable Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + Stochastic Optimization for Large-scale Optimal Transport, + Advances in Neural Information Processing Systems (2016), + arXiv preprint arxiv:1605.08527. ''' if lr is None: @@ -264,7 +268,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): def c_transform_entropic(b, M, reg, beta): - ''' + r''' The goal is to recover u from the c-transform. The function computes the c_transform of a dual variable from the other @@ -285,47 +289,47 @@ def c_transform_entropic(b, M, reg, beta): Parameters ---------- - - b : np.ndarray(nt,) - target measure - M : np.ndarray(ns, nt) - cost matrix + b : ndarray, shape (nt,) + Target measure + M : ndarray, shape (ns, nt) + Cost matrix reg : float - regularization term > 0 - v : np.ndarray(nt,) - dual variable + Regularization term > 0 + v : ndarray, shape (nt,) + Dual variable. Returns ------- - - u : np.ndarray(ns,) - dual variable + u : ndarray, shape (ns,) + Dual variable. 
Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + Stochastic Optimization for Large-scale Optimal Transport, + Advances in Neural Information Processing Systems (2016), + arXiv preprint arxiv:1605.08527. ''' n_source = np.shape(M)[0] @@ -340,7 +344,7 @@ def c_transform_entropic(b, M, reg, beta): def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, log=False): - ''' + r''' Compute the transportation matrix to solve the regularized discrete measures optimal transport max problem @@ -367,52 +371,53 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, Parameters ---------- - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 methode : str used method (SAG or ASGD) - numItermax : int number + numItermax : int number of iteration - lr : float number + lr : float learning rate - n_source : int number + n_source : int size of the source measure - n_target : int number + n_target : int size of the target measure log : bool, optional record log if True Returns ------- - - pi : np.ndarray(ns, nt) + pi : ndarray, shape (ns, nt) transportation matrix log : dict log dictionary return only if log==True in parameters Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 
1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- @@ -451,7 +456,7 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, batch_beta): - ''' + r''' Computes the partial gradient of the dual optimal transport problem. For each (i,j) in a batch of coordinates, the partial gradients are : @@ -478,53 +483,55 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, Parameters ---------- - - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - alpha : np.ndarray(ns,) + alpha : ndarray, shape (ns,) dual variable - beta : np.ndarray(nt,) + beta : ndarray, shape (nt,) dual variable - batch_size : int number + batch_size : int size of the batch - batch_alpha : np.ndarray(bs,) + batch_alpha : ndarray, shape (bs,) batch of index of alpha - batch_beta : np.ndarray(bs,) + batch_beta : ndarray, shape (bs,) batch of index of beta Returns ------- - - grad : np.ndarray(ns,) + grad : ndarray, shape (ns,) partial grad F Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 20000 - >>> lr = 0.1 - >>> batch_size = 3 - >>> log = True >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg, - batch_size, - numItermax, lr, log) - >>> print(log['alpha'], log['beta']) - >>> print(sgd_dual_pi) + >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg=1, batch_size=3, numItermax=30000, lr=0.1, log=True) + >>> log['alpha'] + array([0.71759102, 1.57057384, 0.85576566, 0.1208211 , 0.59190466, + 1.197148 , 0.17805133]) + >>> log['beta'] + array([0.49741367, 0.57478564, 1.40075528, 2.75890102]) + >>> sgd_dual_pi + array([[2.09730063e-02, 8.38169324e-02, 7.50365455e-03, 8.72731415e-09], + [5.58432437e-03, 5.89881299e-04, 3.09558411e-05, 8.35469849e-07], + [3.26489515e-03, 7.15536035e-02, 2.99778211e-02, 3.02601593e-10], + [4.05390622e-02, 5.31085068e-02, 6.65191787e-02, 1.55812785e-06], + [7.82299812e-02, 6.12099102e-03, 4.44989098e-02, 2.37719187e-03], + [5.06266486e-02, 2.16230494e-03, 2.26215141e-03, 6.81514609e-04], + [6.06713990e-02, 3.98139808e-02, 5.46829338e-02, 8.62371424e-06]]) References ---------- @@ -533,7 +540,6 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, International Conference on Learning Representation (2018), arXiv preprint arxiv:1711.02283. 
''' - G = - (np.exp((alpha[batch_alpha, None] + beta[None, batch_beta] - M[batch_alpha, :][:, batch_beta]) / reg) * a[batch_alpha, None] * b[None, batch_beta]) @@ -548,7 +554,7 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): - ''' + r''' Compute the sgd algorithm to solve the regularized discrete measures optimal transport dual problem @@ -571,33 +577,31 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): Parameters ---------- - - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - batch_size : int number + batch_size : int size of the batch - numItermax : int number + numItermax : int number of iteration - lr : float number + lr : float learning rate Returns ------- - - alpha : np.ndarray(ns,) + alpha : ndarray, shape (ns,) dual variable - beta : np.ndarray(nt,) + beta : ndarray, shape (nt,) dual variable Examples -------- - + >>> import ot >>> n_source = 7 >>> n_target = 4 >>> reg = 1 @@ -611,18 +615,26 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): >>> X_source = rng.randn(n_source, 2) >>> Y_target = rng.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg, - batch_size, - numItermax, lr, log) - >>> print(log['alpha'], log['beta']) - >>> print(sgd_dual_pi) + >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size, numItermax, lr, log) + >>> log['alpha'] + array([0.64171798, 1.27932201, 0.78132257, 0.15638935, 0.54888354, + 1.03663469, 0.20595781]) + >>> log['beta'] + array([0.51207194, 0.58033189, 1.28922676, 2.26859736]) + >>> sgd_dual_pi + array([[1.97276541e-02, 7.81248547e-02, 6.22136048e-03, 4.95442423e-09], + [4.23494310e-03, 4.43286263e-04, 2.06927079e-05, 3.82389139e-07], + [3.07542414e-03, 6.67897769e-02, 2.48904999e-02, 1.72030247e-10], + [4.26271990e-02, 5.53375455e-02, 6.16535024e-02, 9.88812650e-07], + [7.60423265e-02, 5.89585256e-03, 3.81267087e-02, 1.39458256e-03], + [4.37557504e-02, 1.85189176e-03, 1.72335760e-03, 3.55491279e-04], + [6.33096109e-02, 4.11683954e-02, 5.02962051e-02, 5.43097516e-06]]) References ---------- - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + International Conference on Learning Representation (2018), + arXiv preprint arxiv:1711.02283. 
''' n_source = np.shape(M)[0] @@ -644,7 +656,7 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, log=False): - ''' + r''' Compute the transportation matrix to solve the regularized discrete measures optimal transport dual problem @@ -667,35 +679,33 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, Parameters ---------- - - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - batch_size : int number + batch_size : int size of the batch - numItermax : int number + numItermax : int number of iteration - lr : float number + lr : float learning rate log : bool, optional record log if True Returns ------- - - pi : np.ndarray(ns, nt) + pi : ndarray, shape (ns, nt) transportation matrix log : dict log dictionary return only if log==True in parameters Examples -------- - + >>> import ot >>> n_source = 7 >>> n_target = 4 >>> reg = 1 @@ -709,18 +719,27 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, >>> X_source = rng.randn(n_source, 2) >>> Y_target = rng.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg, - batch_size, - numItermax, lr, log) - >>> print(log['alpha'], log['beta']) - >>> print(sgd_dual_pi) + >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size, numItermax, lr, log) + >>> log['alpha'] + array([0.64057733, 1.2683513 , 0.75610161, 0.16024284, 0.54926534, + 1.0514201 , 0.19958936]) + >>> log['beta'] + array([0.51372571, 0.58843489, 1.27993921, 2.24344807]) + >>> sgd_dual_pi + array([[1.97377795e-02, 7.86706853e-02, 6.15682001e-03, 4.82586997e-09], + [4.19566963e-03, 4.42016865e-04, 2.02777272e-05, 3.68823708e-07], + [3.00379244e-03, 6.56562018e-02, 2.40462171e-02, 1.63579656e-10], + [4.28626062e-02, 5.60031599e-02, 6.13193826e-02, 9.67977735e-07], + [7.61972739e-02, 5.94609051e-03, 3.77886693e-02, 1.36046648e-03], + [4.44810042e-02, 1.89476742e-03, 1.73285847e-03, 3.51826036e-04], + [6.30118293e-02, 4.12398660e-02, 4.95148998e-02, 5.26247246e-06]]) References ---------- [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + International Conference on Learning Representation (2018), + arXiv preprint arxiv:1711.02283. ''' opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg, batch_size, diff --git a/ot/unbalanced.py b/ot/unbalanced.py new file mode 100644 index 0000000..467fda2 --- /dev/null +++ b/ot/unbalanced.py @@ -0,0 +1,519 @@ +# -*- coding: utf-8 -*- +""" +Regularized Unbalanced OT +""" + +# Author: Hicham Janati <hicham.janati@inria.fr> +# License: MIT License + +from __future__ import division +import warnings +import numpy as np +# from .utils import unif, dist + + +def sinkhorn_unbalanced(a, b, M, reg, alpha, method='sinkhorn', numItermax=1000, + stopThr=1e-9, verbose=False, log=False, **kwargs): + r""" + Solve the unbalanced entropic regularization optimal transport problem and return the loss + + The function solves the following optimization problem: + + .. math:: + W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \\alpha KL(\gamma 1, a) + \\alpha KL(\gamma^T 1, b) + + s.t. 
+ \gamma\geq 0
+ where :
+
+ - M is the (ns, nt) metric cost matrix
+ - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+ - a and b are source and target weights
+ - KL is the Kullback-Leibler divergence
+
+ The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_
+
+
+ Parameters
+ ----------
+ a : np.ndarray (ns,)
+ samples weights in the source domain
+ b : np.ndarray (nt,) or np.ndarray (nt,n_hists)
+ samples in the target domain, compute sinkhorn with multiple targets
+ and fixed M if b is a matrix (return OT loss + dual variables in log)
+ M : np.ndarray (ns, nt)
+ loss matrix
+ reg : float
+ Entropy regularization term > 0
+ alpha : float
+ Marginal relaxation term > 0
+ method : str
+ method used for the solver either 'sinkhorn', 'sinkhorn_stabilized' or
+ 'sinkhorn_epsilon_scaling', see those functions for specific parameters
+ numItermax : int, optional
+ Max number of iterations
+ stopThr : float, optional
+ Stop threshold on error (> 0)
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ record log if True
+
+
+ Returns
+ -------
+ W : (nt) ndarray or float
+ Optimal transportation matrix for the given parameters
+ log : dict
+ log dictionary return only if log==True in parameters
+
+ Examples
+ --------
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> M=[[0., 1.], [1., 0.]]
+ >>> ot.sinkhorn_unbalanced(a, b, M, 1, 1)
+ array([[0.51122823, 0.18807035],
+ [0.18807035, 0.51122823]])
+
+
+ References
+ ----------
+
+ .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013
+
+ .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519.
+
+ .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816.
+
+ .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T. : Learning with a Wasserstein Loss, Advances in Neural Information Processing Systems (NIPS) 2015
+
+
+ See Also
+ --------
+ ot.unbalanced.sinkhorn_knopp_unbalanced : Unbalanced Classic Sinkhorn [10]
+ ot.unbalanced.sinkhorn_stabilized_unbalanced: Unbalanced Stabilized sinkhorn [9][10]
+ ot.unbalanced.sinkhorn_epsilon_scaling_unbalanced: Unbalanced Sinkhorn with epsilon scaling [9][10]
+
+ """
+
+ if method.lower() == 'sinkhorn':
+ def sink():
+ return sinkhorn_knopp_unbalanced(a, b, M, reg, alpha,
+ numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose,
+ log=log, **kwargs)
+
+ elif method.lower() in ['sinkhorn_stabilized', 'sinkhorn_epsilon_scaling']:
+ warnings.warn('Method not implemented yet. Using classic Sinkhorn Knopp')
+
+ def sink():
+ return sinkhorn_knopp_unbalanced(a, b, M, reg, alpha,
+ numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose,
+ log=log, **kwargs)
+ else:
+ raise ValueError("Unknown method '%s'." % method)
+
+ return sink()
+
+
+def sinkhorn_unbalanced2(a, b, M, reg, alpha, method='sinkhorn',
+ numItermax=1000, stopThr=1e-9, verbose=False,
+ log=False, **kwargs):
+ r"""
+ Solve the entropic regularization unbalanced optimal transport problem and return the loss
+
+ The function solves the following optimization problem:
+
+ .. math::
+ W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \\alpha KL(\gamma 1, a) + \\alpha KL(\gamma^T 1, b)
+
+ s.t.
+ \gamma\geq 0
+ where :
+
+ - M is the (ns, nt) metric cost matrix
+ - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+ - a and b are source and target weights
+ - KL is the Kullback-Leibler divergence
+
+ The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_
+
+
+ Parameters
+ ----------
+ a : np.ndarray (ns,)
+ samples weights in the source domain
+ b : np.ndarray (nt,) or np.ndarray (nt, n_hists)
+ samples in the target domain, compute sinkhorn with multiple targets
+ and fixed M if b is a matrix (return OT loss + dual variables in log)
+ M : np.ndarray (ns,nt)
+ loss matrix
+ reg : float
+ Entropy regularization term > 0
+ alpha : float
+ Marginal relaxation term > 0
+ method : str
+ method used for the solver either 'sinkhorn', 'sinkhorn_stabilized' or
+ 'sinkhorn_epsilon_scaling', see those functions for specific parameters
+ numItermax : int, optional
+ Max number of iterations
+ stopThr : float, optional
+ Stop threshold on error (>0)
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ record log if True
+
+
+ Returns
+ -------
+ W : (nt) ndarray or float
+ Optimal transportation matrix for the given parameters
+ log : dict
+ log dictionary return only if log==True in parameters
+
+ Examples
+ --------
+
+ >>> import ot
+ >>> a=[.5, .10]
+ >>> b=[.5, .5]
+ >>> M=[[0., 1.],[1., 0.]]
+ >>> ot.unbalanced.sinkhorn_unbalanced2(a, b, M, 1., 1.)
+ array([0.31912866])
+
+
+
+ References
+ ----------
+
+ .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013
+
+ .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519.
+
+ .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816.
+
+ .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T. : Learning with a Wasserstein Loss, Advances in Neural Information Processing Systems (NIPS) 2015
+
+ See Also
+ --------
+ ot.unbalanced.sinkhorn_knopp : Unbalanced Classic Sinkhorn [10]
+ ot.unbalanced.sinkhorn_stabilized: Unbalanced Stabilized sinkhorn [9][10]
+ ot.unbalanced.sinkhorn_epsilon_scaling: Unbalanced Sinkhorn with epsilon scaling [9][10]
+
+ """
+
+ if method.lower() == 'sinkhorn':
+ def sink():
+ return sinkhorn_knopp_unbalanced(a, b, M, reg, alpha,
+ numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose,
+ log=log, **kwargs)
+
+ elif method.lower() in ['sinkhorn_stabilized', 'sinkhorn_epsilon_scaling']:
+ warnings.warn('Method not implemented yet. Using classic Sinkhorn Knopp')
+
+ def sink():
+ return sinkhorn_knopp_unbalanced(a, b, M, reg, alpha,
+ numItermax=numItermax,
+ stopThr=stopThr, verbose=verbose,
+ log=log, **kwargs)
+ else:
+ raise ValueError("Unknown method '%s'." % method)
+
+ b = np.asarray(b, dtype=np.float64)
+ if len(b.shape) < 2:
+ b = b[:, None]
+
+ return sink()
+
+
+def sinkhorn_knopp_unbalanced(a, b, M, reg, alpha, numItermax=1000,
+ stopThr=1e-9, verbose=False, log=False, **kwargs):
+ r"""
+ Solve the entropic regularization unbalanced optimal transport problem and return the loss
+
+ The function solves the following optimization problem:
+
+ .. math::
+ W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \\alpha KL(\gamma 1, a) + \\alpha KL(\gamma^T 1, b)
+
+ s.t.
+ \gamma\geq 0
+ where :
+
+ - M is the (ns, nt) metric cost matrix
+ - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+ - a and b are source and target weights
+ - KL is the Kullback-Leibler divergence
+
+ The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_
+
+
+ Parameters
+ ----------
+ a : np.ndarray (ns,)
+ samples weights in the source domain
+ b : np.ndarray (nt,) or np.ndarray (nt, n_hists)
+ samples in the target domain, compute sinkhorn with multiple targets
+ and fixed M if b is a matrix (return OT loss + dual variables in log)
+ M : np.ndarray (ns,nt)
+ loss matrix
+ reg : float
+ Entropy regularization term > 0
+ alpha : float
+ Marginal relaxation term > 0
+ numItermax : int, optional
+ Max number of iterations
+ stopThr : float, optional
+ Stop threshold on error (>0)
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ record log if True
+
+
+ Returns
+ -------
+ gamma : (ns x nt) ndarray
+ Optimal transportation matrix for the given parameters
+ log : dict
+ log dictionary return only if log==True in parameters
+
+ Examples
+ --------
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> M=[[0., 1.],[1., 0.]]
+ >>> ot.unbalanced.sinkhorn_knopp_unbalanced(a, b, M, 1., 1.)
+ array([[0.51122823, 0.18807035],
+ [0.18807035, 0.51122823]])
+
+ References
+ ----------
+
+ .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816.
+
+ .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T. : Learning with a Wasserstein Loss, Advances in Neural Information Processing Systems (NIPS) 2015
+
+ See Also
+ --------
+ ot.lp.emd : Unregularized OT
+ ot.optim.cg : General regularized OT
+
+ """
+
+ a = np.asarray(a, dtype=np.float64)
+ b = np.asarray(b, dtype=np.float64)
+ M = np.asarray(M, dtype=np.float64)
+
+ n_a, n_b = M.shape
+
+ if len(a) == 0:
+ a = np.ones(n_a, dtype=np.float64) / n_a
+ if len(b) == 0:
+ b = np.ones(n_b, dtype=np.float64) / n_b
+
+ if len(b.shape) > 1:
+ n_hists = b.shape[1]
+ else:
+ n_hists = 0
+
+ if log:
+ log = {'err': []}
+
+ # we assume that no distances are null except those of the diagonal of
+ # distances
+ if n_hists:
+ u = np.ones((n_a, 1)) / n_a
+ v = np.ones((n_b, n_hists)) / n_b
+ a = a.reshape(n_a, 1)
+ else:
+ u = np.ones(n_a) / n_a
+ v = np.ones(n_b) / n_b
+
+ # print(reg)
+ # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute
+ K = np.empty(M.shape, dtype=M.dtype)
+ np.divide(M, -reg, out=K)
+ np.exp(K, out=K)
+
+ # print(np.min(K))
+ fi = alpha / (alpha + reg)
+
+ cpt = 0
+ err = 1.
+
+ while (err > stopThr and cpt < numItermax):
+ uprev = u
+ vprev = v
+
+ Kv = K.dot(v)
+ u = (a / Kv) ** fi
+ Ktu = K.T.dot(u)
+ v = (b / Ktu) ** fi
+
+ if (np.any(Ktu == 0.)
+
+
+def barycenter_unbalanced(A, M, reg, alpha, weights=None, numItermax=1000,
+                          stopThr=1e-4, verbose=False, log=False):
+    r"""Compute the entropic regularized unbalanced Wasserstein barycenter of distributions A
+
+    The function solves the following optimization problem:
+
+    .. math::
+        \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i)
+
+    where :
+
+    - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced)
+    - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}`
+    - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT
+    - alpha is the marginal relaxation hyperparameter
+
+    The algorithm used for solving the problem is the generalized
+    Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_
+
+    Parameters
+    ----------
+    A : np.ndarray (d, n)
+        n training distributions a_i of size d
+    M : np.ndarray (d, d)
+        loss matrix for OT
+    reg : float
+        Entropy regularization term > 0
+    alpha : float
+        Marginal relaxation term > 0
+    weights : np.ndarray (n,)
+        Weights of each histogram a_i on the simplex (barycentric coordinates)
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshold on error (> 0)
+    verbose : bool, optional
+        Print information along iterations
+    log : bool, optional
+        record log if True
+
+    Returns
+    -------
+    a : (d,) ndarray
+        Unbalanced Wasserstein barycenter
+    log : dict
+        log dictionary returned only if log is True
+
+    References
+    ----------
+
+    .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138.
+
+    .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816.
+
+    """
+    p, n_hists = A.shape
+    if weights is None:
+        weights = np.ones(n_hists) / n_hists
+    else:
+        assert(len(weights) == A.shape[1])
+
+    if log:
+        log = {'err': []}
+
+    K = np.exp(- M / reg)
+
+    fi = alpha / (alpha + reg)
+
+    v = np.ones((p, n_hists)) / p
+    u = np.ones((p, 1)) / p
+
+    cpt = 0
+    err = 1.
+
+    while (err > stopThr and cpt < numItermax):
+        uprev = u
+        vprev = v
+
+        Kv = K.dot(v)
+        u = (A / Kv) ** fi
+        Ktu = K.T.dot(u)
+        q = ((Ktu ** (1 - fi)).dot(weights))
+        q = q ** (1 / (1 - fi))
+        Q = q[:, None]
+        v = (Q / Ktu) ** fi
+
+        if (np.any(Ktu == 0.)
+                or np.any(np.isnan(u)) or np.any(np.isnan(v))
+                or np.any(np.isinf(u)) or np.any(np.isinf(v))):
+            # we have reached the machine precision
+            # come back to previous solution and quit loop
+            warnings.warn('Numerical errors at iteration %s' % cpt)
+            u = uprev
+            v = vprev
+            break
+        if cpt % 10 == 0:
+            # we can speed up the process by checking for the error only all
+            # the 10th iterations
+            err = np.sum((u - uprev) ** 2) / np.sum(u ** 2) + \
+                np.sum((v - vprev) ** 2) / np.sum(v ** 2)
+            if log:
+                log['err'].append(err)
+            if verbose:
+                if cpt % 50 == 0:
+                    print(
+                        '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
+                print('{:5d}|{:8e}|'.format(cpt, err))
+
+        cpt += 1
+    if log:
+        log['niter'] = cpt
+        log['u'] = u
+        log['v'] = v
+        return q, log
+    else:
+        return q
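A minimal usage sketch for the barycenter (illustrative values, not part of the commit); A stacks the input histograms column-wise and M is the cost over their common support:

    import numpy as np
    import ot

    # two histograms of size 4, stacked as columns of A
    a1 = np.array([1., 0., 0., 0.])
    a2 = np.array([0., 0., 0., 1.])
    A = np.vstack([a1, a2]).T

    M = ot.utils.dist0(4)  # quadratic cost on a 1D grid
    M /= M.max()

    bary = ot.unbalanced.barycenter_unbalanced(A, M, 1e-1, 1.)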
diff --git a/ot/utils.py b/ot/utils.py
index efd1288..8419c83 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Various function that can be usefull
+Various useful functions
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
@@ -111,12 +111,12 @@ def dist(x1, x2=None, metric='sqeuclidean'):
 
     Parameters
     ----------
-    x1 : np.array (n1,d)
+    x1 : ndarray, shape (n1,d)
         matrix with n1 samples of size d
-    x2 : np.array (n2,d), optional
+    x2 : ndarray, shape (n2,d), optional
         matrix with n2 samples of size d (if None then x2=x1)
-    metric : str, fun, optional
-        name of the metric to be computed (full list in the doc of scipy), If a string,
+    metric : str | callable, optional
+        Name of the metric to be computed (full list in the doc of scipy). If a string,
         the distance function can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
         'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski',
         'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
@@ -138,26 +138,21 @@ def dist(x1, x2=None, metric='sqeuclidean'):
 
 def dist0(n, method='lin_square'):
-    """Compute standard cost matrices of size (n,n) for OT problems
+    """Compute standard cost matrices of size (n, n) for OT problems
 
     Parameters
     ----------
-
     n : int
-        size of the cost matrix
+        Size of the cost matrix.
     method : str, optional
         Type of loss matrix chosen from:
 
         * 'lin_square' : linear sampling between 0 and n-1, quadratic loss
 
-
     Returns
     -------
-
-    M : np.array (n1,n2)
-        distance matrix computed with given metric
-
-
+    M : ndarray, shape (n1, n2)
+        Distance matrix computed with given metric.
     """
     res = 0
     if method == 'lin_square':
@@ -169,22 +164,18 @@ def dist0(n, method='lin_square'):
 
 def cost_normalization(C, norm=None):
     """ Apply normalization to the loss matrix
 
-
     Parameters
     ----------
-    C : np.array (n1, n2)
+    C : ndarray, shape (n1, n2)
         The cost matrix to normalize.
     norm : str
-        type of normalization from 'median','max','log','loglog'. Any other
-        value do not normalize.
-
+        Type of normalization from 'median', 'max', 'log', 'loglog'. Any
+        other value leaves the matrix unchanged.
 
     Returns
     -------
-
-    C : np.array (n1, n2)
+    C : ndarray, shape (n1, n2)
         The input cost matrix normalized according to given norm.
-
     """
 
     if norm == "median":
@@ -194,7 +185,7 @@ def cost_normalization(C, norm=None):
     elif norm == "log":
         C = np.log(1 + C)
     elif norm == "loglog":
-        C = np.log(1 + np.log(1 + C))
+        C = np.log1p(np.log1p(C))
     return C
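np.log1p(x) evaluates log(1 + x) accurately when x is close to zero, so the new 'loglog' branch returns the same values as before with better precision for small costs. A quick check (a sketch, not part of the commit):

    import numpy as np

    C = np.array([[0., 1e-12], [2.5, 10.]])
    old = np.log(1 + np.log(1 + C))
    new = np.log1p(np.log1p(C))
    print(np.max(np.abs(old - new)))  # agrees up to floating-point error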
- """ if norm == "median": @@ -194,7 +185,7 @@ def cost_normalization(C, norm=None): elif norm == "log": C = np.log(1 + C) elif norm == "loglog": - C = np.log(1 + np.log(1 + C)) + C = np.log1p(np.log1p(C)) return C @@ -256,6 +247,7 @@ def check_params(**kwargs): def check_random_state(seed): """Turn seed into a np.random.RandomState instance + Parameters ---------- seed : None | int | instance of RandomState @@ -275,7 +267,6 @@ def check_random_state(seed): class deprecated(object): - """Decorator to mark a function or class as deprecated. deprecated class from scikit-learn package @@ -285,14 +276,14 @@ class deprecated(object): The optional extra argument will be appended to the deprecation message and the docstring. Note: to use this with the default value for extra, put in an empty of parentheses: - >>> from ot.deprecation import deprecated - >>> @deprecated() - ... def some_function(): pass + >>> from ot.deprecation import deprecated # doctest: +SKIP + >>> @deprecated() # doctest: +SKIP + ... def some_function(): pass # doctest: +SKIP Parameters ---------- - extra : string - to be added to the deprecation messages + extra : str + To be added to the deprecation messages. """ # Adapted from http://wiki.python.org/moin/PythonDecoratorLibrary, @@ -373,9 +364,9 @@ def _is_deprecated(func): class BaseEstimator(object): - """Base class for most objects in POT - adapted from sklearn BaseEstimator class + + Code adapted from sklearn BaseEstimator class Notes ----- @@ -417,7 +408,7 @@ class BaseEstimator(object): Parameters ---------- - deep : boolean, optional + deep : bool, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. |