Diffstat (limited to 'ot'):

 ot/__init__.py                 |   55
 ot/bregman.py                  | 1272
 ot/da.py                       |  252
 ot/datasets.py                 |   32
 ot/dr.py                       |   63
 ot/externals/funcsigs.py       |   46
 ot/gpu/__init__.py             |    6
 ot/gpu/bregman.py              |   11
 ot/gromov.py                   |  738
 ot/lp/EMD.h                    |    5
 ot/lp/EMD_wrapper.cpp          |  191
 ot/lp/__init__.py              |  602
 ot/lp/emd_wrap.pyx             |  150
 ot/lp/network_simplex_simple.h |    2
 ot/optim.py                    |  179
 ot/plot.py                     |   12
 ot/stochastic.py               |  418
 ot/unbalanced.py               | 1023
 ot/utils.py                    |  103

 19 files changed, 4259 insertions(+), 901 deletions(-)
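A quick orientation before the per-file hunks: the ot/__init__.py diff below adds emd_1d, emd2_1d, wasserstein_1d and the new ot.unbalanced solvers to the top-level namespace and bumps the version to 0.6.0. Here is a minimal usage sketch of that surface; the toy histograms, sample positions and regularization values are illustrative assumptions, not part of the patch.

# Sketch of the top-level API after this commit; the function names come
# from the imports added in ot/__init__.py below, all data here is toy data.
import numpy as np
import ot

a = np.array([.5, .5])               # source histogram
b = np.array([.5, .5])               # target histogram
M = np.array([[0., 1.], [1., 0.]])   # ground cost matrix

G = ot.sinkhorn(a, b, M, 1.)         # entropic OT plan (ot.bregman)

x_a = np.array([0., 1.])             # 1D positions of the source samples
x_b = np.array([0.25, 1.5])          # 1D positions of the target samples
G1 = ot.emd_1d(x_a, x_b, a, b)       # exact 1D OT plan, newly exported
w1 = ot.wasserstein_1d(x_a, x_b, a, b)   # 1D Wasserstein loss, newly exported

# Unbalanced OT (new ot.unbalanced module): the fifth argument is the
# marginal-relaxation term, passed positionally here.
G_u = ot.sinkhorn_unbalanced(a, b, M, 1., 1.)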
diff --git a/ot/__init__.py b/ot/__init__.py index b74b924..89c7936 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -1,6 +1,46 @@ -"""Python Optimal Transport toolbox +""" + +This is the main module of the POT toolbox. It provides easy access to +a number of sub-modules and functions described below. + +.. note:: + + + Here is a list of the submodules and short description of what they contain. + + - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. + - :any:`ot.bregman` contains OT solvers for the entropic OT problems using + Bregman projections. + - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. + - :any:`ot.smooth` contains OT solvers for the regularized (l2 and kl) smooth OT + problems. + - :any:`ot.gromov` contains solvers for Gromov-Wasserstein and Fused Gromov + Wasserstein problems. + - :any:`ot.optim` contains generic solvers OT based optimization problems + - :any:`ot.da` contains classes and function related to Monge mapping + estimation and Domain Adaptation (DA). + - :any:`ot.gpu` contains GPU (cupy) implementation of some OT solvers + - :any:`ot.dr` contains Dimension Reduction (DR) methods such as Wasserstein + Discriminant Analysis. + - :any:`ot.utils` contains utility functions such as distance computation and + timing. + - :any:`ot.datasets` contains toy dataset generation functions. + - :any:`ot.plot` contains visualization functions + - :any:`ot.stochastic` contains stochastic solvers for regularized OT. + - :any:`ot.unbalanced` contains solvers for regularized unbalanced OT. + +.. warning:: + The list of automatically imported sub-modules is as follows: + :py:mod:`ot.lp`, :py:mod:`ot.bregman`, :py:mod:`ot.optim` + :py:mod:`ot.utils`, :py:mod:`ot.datasets`, + :py:mod:`ot.gromov`, :py:mod:`ot.smooth` + :py:mod:`ot.stochastic` + The following sub-modules are not imported due to additional dependencies: + - :any:`ot.dr` : depends on :code:`pymanopt` and :code:`autograd`. + - :any:`ot.gpu` : depends on :code:`cupy` and a CUDA GPU. + - :any:`ot.plot` : depends on :code:`matplotlib` """ @@ -20,17 +60,22 @@ from . import da from . import gromov from . import smooth from . import stochastic +from . import unbalanced # OT functions -from .lp import emd, emd2 +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d from .bregman import sinkhorn, sinkhorn2, barycenter +from .unbalanced import sinkhorn_unbalanced, barycenter_unbalanced, sinkhorn_unbalanced2 from .da import sinkhorn_lpl1_mm # utils functions from .utils import dist, unif, tic, toc, toq -__version__ = "0.5.1" +__version__ = "0.6.0" -__all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets', +__all__ = ['emd', 'emd2', 'emd_1d', 'sinkhorn', 'sinkhorn2', 'utils', 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', - 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] + 'emd_1d', 'emd2_1d', 'wasserstein_1d', + 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim', + 'sinkhorn_unbalanced', 'barycenter_unbalanced', + 'sinkhorn_unbalanced2'] diff --git a/ot/bregman.py b/ot/bregman.py index d1057ff..2707b7c 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -5,15 +5,22 @@ Bregman projections for regularized OT # Author: Remi Flamary <remi.flamary@unice.fr> # Nicolas Courty <ncourty@irisa.fr> +# Kilian Fatras <kilian.fatras@irisa.fr> +# Titouan Vayer <titouan.vayer@irisa.fr> +# Hicham Janati <hicham.janati@inria.fr> +# Mokhtar Z. 
Alaya <mokhtarzahdi.alaya@gmail.com> # # License: MIT License import numpy as np +import warnings +from .utils import unif, dist +from scipy.optimize import fmin_l_bfgs_b def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The function solves the following optimization problem: @@ -28,21 +35,21 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - M is the (dim_a, dim_b) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - a and b are source and target weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_ Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (dim_a,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (dim_b,) or ndarray, shape (dim_b, n_hists) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (dim_a, dim_b) loss matrix reg : float Regularization term >0 @@ -61,7 +68,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (dim_a, dim_b) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -70,12 +77,12 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -100,34 +107,28 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, """ if method.lower() == 'sinkhorn': - def sink(): - return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax, - stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, log=log, + **kwargs) elif method.lower() == 'greenkhorn': - def sink(): - return greenkhorn(a, b, M, reg, numItermax=numItermax, - stopThr=stopThr, verbose=verbose, log=log) + return greenkhorn(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, log=log) elif method.lower() == 'sinkhorn_stabilized': - def sink(): - return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax, - stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) elif method.lower() == 'sinkhorn_epsilon_scaling': - def sink(): - return sinkhorn_epsilon_scaling( - a, b, M, reg, numItermax=numItermax, - stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_epsilon_scaling(a, b, M, reg, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) else: - print('Warning : unknown method using classic Sinkhorn Knopp') - - def sink(): - return sinkhorn_knopp(a, b, M, reg, **kwargs) - - 
return sink() + raise ValueError("Unknown method '%s'." % method) def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the entropic regularization optimal transport problem and return the loss The function solves the following optimization problem: @@ -142,21 +143,21 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - M is the (dim_a, dim_b) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - a and b are source and target weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_ Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (dim_a,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (dim_b,) or ndarray, shape (dim_b, n_hists) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (dim_a, dim_b) loss matrix reg : float Regularization term >0 @@ -172,11 +173,10 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, log : bool, optional record log if True - Returns ------- - W : (nt) ndarray or float - Optimal transportation matrix for the given parameters + W : (n_hists) ndarray or float + Optimal transportation loss for the given parameters log : dict log dictionary return only if log==True in parameters @@ -184,11 +184,11 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn2(a,b,M,1) - array([ 0.26894142]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn2(a, b, M, 1) + array([0.26894142]) @@ -215,36 +215,28 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, ot.bregman.sinkhorn_epsilon_scaling: Sinkhorn with epslilon scaling [9][10] """ - + b = np.asarray(b, dtype=np.float64) + if len(b.shape) < 2: + b = b[:, None] if method.lower() == 'sinkhorn': - def sink(): - return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax, - stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, log=log, + **kwargs) elif method.lower() == 'sinkhorn_stabilized': - def sink(): - return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax, - stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, log=log, + **kwargs) elif method.lower() == 'sinkhorn_epsilon_scaling': - def sink(): - return sinkhorn_epsilon_scaling( - a, b, M, reg, numItermax=numItermax, - stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) else: - print('Warning : unknown method using classic Sinkhorn Knopp') - - def sink(): - return sinkhorn_knopp(a, b, M, reg, **kwargs) - - b = np.asarray(b, dtype=np.float64) - if len(b.shape) < 2: - b = b.reshape((-1, 1)) - - return sink() + raise ValueError("Unknown method '%s'." 
% method) def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - """ + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The function solves the following optimization problem: @@ -259,21 +251,21 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - M is the (dim_a, dim_b) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - a and b are source and target weights (histograms, both sum to 1) The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_ Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (dim_a,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (dim_b,) or ndarray, shape (dim_b, n_hists) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (dim_a, dim_b) loss matrix reg : float Regularization term >0 @@ -286,10 +278,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (dim_a, dim_b) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -298,12 +289,12 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -329,25 +320,25 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] # init data - Nini = len(a) - Nfin = len(b) + dim_a = len(a) + dim_b = len(b) if len(b.shape) > 1: - nbb = b.shape[1] + n_hists = b.shape[1] else: - nbb = 0 + n_hists = 0 if log: log = {'err': []} # we assume that no distances are null except those of the diagonal of # distances - if nbb: - u = np.ones((Nini, nbb)) / Nini - v = np.ones((Nfin, nbb)) / Nfin + if n_hists: + u = np.ones((dim_a, n_hists)) / dim_a + v = np.ones((dim_b, n_hists)) / dim_b else: - u = np.ones(Nini) / Nini - v = np.ones(Nfin) / Nfin + u = np.ones(dim_a) / dim_a + v = np.ones(dim_b) / dim_b # print(reg) @@ -370,9 +361,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, v = np.divide(b, KtransposeU) u = 1. 
/ np.dot(Kp, v) - if (np.any(KtransposeU == 0) or - np.any(np.isnan(u)) or np.any(np.isnan(v)) or - np.any(np.isinf(u)) or np.any(np.isinf(v))): + if (np.any(KtransposeU == 0) + or np.any(np.isnan(u)) or np.any(np.isnan(v)) + or np.any(np.isinf(u)) or np.any(np.isinf(v))): # we have reached the machine precision # come back to previous solution and quit loop print('Warning: numerical errors at iteration', cpt) @@ -382,13 +373,12 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, if cpt % 10 == 0: # we can speed up the process by checking for the error only all # the 10th iterations - if nbb: - err = np.sum((u - uprev)**2) / np.sum((u)**2) + \ - np.sum((v - vprev)**2) / np.sum((v)**2) + if n_hists: + np.einsum('ik,ij,jk->jk', u, K, v, out=tmp2) else: # compute right marginal tmp2= (diag(u)Kdiag(v))^T1 np.einsum('i,ij,j->j', u, K, v, out=tmp2) - err = np.linalg.norm(tmp2 - b)**2 # violation of marginal + err = np.linalg.norm(tmp2 - b) # violation of marginal if log: log['err'].append(err) @@ -402,7 +392,7 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, log['u'] = u log['v'] = v - if nbb: # return only loss + if n_hists: # return only loss res = np.einsum('ik,ij,jk,ij->k', u, K, v, M) if log: return res, log @@ -417,8 +407,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, return u.reshape((-1, 1)) * K * v.reshape((1, -1)) -def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log=False): - """ +def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, + log=False): + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The algorithm used is based on the paper @@ -441,20 +432,20 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - M is the (dim_a, dim_b) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - a and b are source and target weights (histograms, both sum to 1) Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (dim_a,) samples weights in the source domain - b : np.ndarray (nt,) or np.ndarray (nt,nbb) + b : ndarray, shape (dim_b,) or ndarray, shape (dim_b, n_hists) samples in the target domain, compute sinkhorn with multiple targets and fixed M if b is a matrix (return OT loss + dual variables in log) - M : np.ndarray (ns,nt) + M : ndarray, shape (dim_a, dim_b) loss matrix reg : float Regularization term >0 @@ -465,10 +456,9 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (dim_a, dim_b) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -477,12 +467,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.bregman.greenkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.bregman.greenkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -499,22 +489,33 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= """ - n = a.shape[0] - m = b.shape[0] + a = 
np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + + if len(a) == 0: + a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] + if len(b) == 0: + b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + + dim_a = a.shape[0] + dim_b = b.shape[0] # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute K = np.empty_like(M) np.divide(M, -reg, out=K) np.exp(K, out=K) - u = np.full(n, 1. / n) - v = np.full(m, 1. / m) + u = np.full(dim_a, 1. / dim_a) + v = np.full(dim_b, 1. / dim_b) G = u[:, np.newaxis] * K * v[np.newaxis, :] viol = G.sum(1) - a viol_2 = G.sum(0) - b stopThr_val = 1 + if log: + log = dict() log['u'] = u log['v'] = v @@ -560,8 +561,9 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, - warmstart=None, verbose=False, print_period=20, log=False, **kwargs): - """ + warmstart=None, verbose=False, print_period=20, + log=False, **kwargs): + r""" Solve the entropic regularization OT problem with log stabilization The function solves the following optimization problem: @@ -576,9 +578,10 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - M is the (dim_a, dim_b) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - a and b are source and target weights (histograms, both sum to 1) + The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_ but with the log stabilization @@ -587,11 +590,11 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (dim_a,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (dim_b,) samples in the target domain - M : np.ndarray (ns,nt) + M : ndarray, shape (dim_a, dim_b) loss matrix reg : float Regularization term >0 @@ -608,10 +611,9 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (dim_a, dim_b) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -623,9 +625,9 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, >>> a=[.5,.5] >>> b=[.5,.5] >>> M=[[0.,1.],[1.,0.]] - >>> ot.bregman.sinkhorn_stabilized(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> ot.bregman.sinkhorn_stabilized(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -656,14 +658,14 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, # test if multiple target if len(b.shape) > 1: - nbb = b.shape[1] + n_hists = b.shape[1] a = a[:, np.newaxis] else: - nbb = 0 + n_hists = 0 # init data - na = len(a) - nb = len(b) + dim_a = len(a) + dim_b = len(b) cpt = 0 if log: @@ -672,24 +674,25 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, # we assume that no distances are null except those of the diagonal of # distances if warmstart is None: - alpha, beta = np.zeros(na), np.zeros(nb) + alpha, beta = np.zeros(dim_a), np.zeros(dim_b) else: alpha, beta = 
warmstart - if nbb: - u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb + if n_hists: + u = np.ones((dim_a, n_hists)) / dim_a + v = np.ones((dim_b, n_hists)) / dim_b else: - u, v = np.ones(na) / na, np.ones(nb) / nb + u, v = np.ones(dim_a) / dim_a, np.ones(dim_b) / dim_b def get_K(alpha, beta): """log space computation""" - return np.exp(-(M - alpha.reshape((na, 1)) - - beta.reshape((1, nb))) / reg) + return np.exp(-(M - alpha.reshape((dim_a, 1)) + - beta.reshape((1, dim_b))) / reg) def get_Gamma(alpha, beta, u, v): """log space gamma computation""" - return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / - reg + np.log(u.reshape((na, 1))) + np.log(v.reshape((1, nb)))) + return np.exp(-(M - alpha.reshape((dim_a, 1)) - beta.reshape((1, dim_b))) + / reg + np.log(u.reshape((dim_a, 1))) + np.log(v.reshape((1, dim_b)))) # print(np.min(K)) @@ -709,26 +712,29 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, # remove numerical problems and store them in K if np.abs(u).max() > tau or np.abs(v).max() > tau: - if nbb: + if n_hists: alpha, beta = alpha + reg * \ np.max(np.log(u), 1), beta + reg * np.max(np.log(v)) else: alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v) - if nbb: - u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb + if n_hists: + u, v = np.ones((dim_a, n_hists)) / dim_a, np.ones((dim_b, n_hists)) / dim_b else: - u, v = np.ones(na) / na, np.ones(nb) / nb + u, v = np.ones(dim_a) / dim_a, np.ones(dim_b) / dim_b K = get_K(alpha, beta) if cpt % print_period == 0: # we can speed up the process by checking for the error only all # the 10th iterations - if nbb: - err = np.sum((u - uprev)**2) / np.sum((u)**2) + \ - np.sum((v - vprev)**2) / np.sum((v)**2) + if n_hists: + err_u = abs(u - uprev).max() + err_u /= max(abs(u).max(), abs(uprev).max(), 1.) + err_v = abs(v - vprev).max() + err_v /= max(abs(v).max(), abs(vprev).max(), 1.) 
+ err = 0.5 * (err_u + err_v) else: transp = get_Gamma(alpha, beta, u, v) - err = np.linalg.norm((np.sum(transp, axis=0) - b))**2 + err = np.linalg.norm((np.sum(transp, axis=0) - b)) if log: log['err'].append(err) @@ -754,34 +760,40 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, cpt = cpt + 1 - # print('err=',err,' cpt=',cpt) if log: - log['logu'] = alpha / reg + np.log(u) - log['logv'] = beta / reg + np.log(v) + if n_hists: + alpha = alpha[:, None] + beta = beta[:, None] + logu = alpha / reg + np.log(u) + logv = beta / reg + np.log(v) + log['logu'] = logu + log['logv'] = logv log['alpha'] = alpha + reg * np.log(u) log['beta'] = beta + reg * np.log(v) log['warmstart'] = (log['alpha'], log['beta']) - if nbb: - res = np.zeros((nbb)) - for i in range(nbb): + if n_hists: + res = np.zeros((n_hists)) + for i in range(n_hists): res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) return res, log else: return get_Gamma(alpha, beta, u, v), log else: - if nbb: - res = np.zeros((nbb)) - for i in range(nbb): + if n_hists: + res = np.zeros((n_hists)) + for i in range(n_hists): res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) return res else: return get_Gamma(alpha, beta, u, v) -def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInnerItermax=100, - tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=10, log=False, **kwargs): - """ +def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, + numInnerItermax=100, tau=1e3, stopThr=1e-9, + warmstart=None, verbose=False, print_period=10, + log=False, **kwargs): + r""" Solve the entropic regularization optimal transport problem with log stabilization and epsilon scaling. @@ -797,9 +809,10 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne \gamma\geq 0 where : - - M is the (ns,nt) metric cost matrix + - M is the (dim_a, dim_b) metric cost matrix - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - a and b are source and target weights (sum to 1) + - a and b are source and target weights (histograms, both sum to 1) + The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_ but with the log stabilization @@ -808,19 +821,17 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (dim_a,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (dim_b,) samples in the target domain - M : np.ndarray (ns,nt) + M : ndarray, shape (dim_a, dim_b) loss matrix reg : float Regularization term >0 tau : float thershold for max value in u or v for log scaling - tau : float - thershold for max value in u or v for log scaling - warmstart : tible of vectors + warmstart : tuple of vectors if given then sarting values for alpha an beta log scalings numItermax : int, optional Max number of iterations @@ -835,10 +846,9 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne log : bool, optional record log if True - Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (dim_a, dim_b) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters @@ -847,12 +857,12 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> 
M=[[0.,1.],[1.,0.]] - >>> ot.bregman.sinkhorn_epsilon_scaling(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.bregman.sinkhorn_epsilon_scaling(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -879,8 +889,8 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] # init data - na = len(a) - nb = len(b) + dim_a = len(a) + dim_b = len(b) # nrelative umerical precision with 64 bits numItermin = 35 @@ -893,14 +903,14 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne # we assume that no distances are null except those of the diagonal of # distances if warmstart is None: - alpha, beta = np.zeros(na), np.zeros(nb) + alpha, beta = np.zeros(dim_a), np.zeros(dim_b) else: alpha, beta = warmstart def get_K(alpha, beta): """log space computation""" - return np.exp(-(M - alpha.reshape((na, 1)) - - beta.reshape((1, nb))) / reg) + return np.exp(-(M - alpha.reshape((dim_a, 1)) + - beta.reshape((1, dim_b))) / reg) # print(np.min(K)) def get_reg(n): # exponential decreasing @@ -913,8 +923,10 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne regi = get_reg(cpt) - G, logi = sinkhorn_stabilized(a, b, M, regi, numItermax=numInnerItermax, stopThr=1e-9, warmstart=( - alpha, beta), verbose=False, print_period=20, tau=tau, log=True) + G, logi = sinkhorn_stabilized(a, b, M, regi, + numItermax=numInnerItermax, stopThr=1e-9, + warmstart=(alpha, beta), verbose=False, + print_period=20, tau=tau, log=True) alpha = logi['alpha'] beta = logi['beta'] @@ -972,9 +984,9 @@ def projC(gamma, q): return np.multiply(gamma, q / np.maximum(np.sum(gamma, axis=0), 1e-10)) -def barycenter(A, M, reg, weights=None, numItermax=1000, - stopThr=1e-4, verbose=False, log=False): - """Compute the entropic regularized wasserstein barycenter of distributions A +def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000, + stopThr=1e-4, verbose=False, log=False, **kwargs): + r"""Compute the entropic regularized wasserstein barycenter of distributions A The function solves the following optimization problem: @@ -991,13 +1003,15 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, Parameters ---------- - A : np.ndarray (d,n) - n training distributions a_i of size d - M : np.ndarray (d,d) - loss matrix for OT + A : ndarray, shape (dim, n_hists) + n_hists training distributions a_i of size dim + M : ndarray, shape (dim, dim) + loss matrix for OT reg : float - Regularization term >0 - weights : np.ndarray (n,) + Regularization term > 0 + method : str (optional) + method used for the solver either 'sinkhorn' or 'sinkhorn_stabilized' + weights : ndarray, shape (n_hists,) Weights of each histogram a_i on the simplex (barycentric coodinates) numItermax : int, optional Max number of iterations @@ -1011,7 +1025,7 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, Returns ------- - a : (d,) ndarray + a : (dim,) ndarray Wasserstein barycenter log : dict log dictionary return only if log==True in parameters @@ -1022,7 +1036,71 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. 
+ """ + if method.lower() == 'sinkhorn': + return barycenter_sinkhorn(A, M, reg, weights=weights, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, log=log, + **kwargs) + elif method.lower() == 'sinkhorn_stabilized': + return barycenter_stabilized(A, M, reg, weights=weights, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) + else: + raise ValueError("Unknown method '%s'." % method) + + +def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000, + stopThr=1e-4, verbose=False, log=False): + r"""Compute the entropic regularized wasserstein barycenter of distributions A + + The function solves the following optimization problem: + + .. math:: + \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + + where : + + - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance (see ot.bregman.sinkhorn) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT + + The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [3]_ + + Parameters + ---------- + A : ndarray, shape (dim, n_hists) + n_hists training distributions a_i of size dim + M : ndarray, shape (dim, dim) + loss matrix for OT + reg : float + Regularization term > 0 + weights : ndarray, shape (n_hists,) + Weights of each histogram a_i on the simplex (barycentric coodinates) + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + a : (dim,) ndarray + Wasserstein barycenter + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. """ @@ -1068,8 +1146,139 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, return geometricBar(weights, UKv) -def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1e-9, stabThr=1e-30, verbose=False, log=False): - """Compute the entropic regularized wasserstein barycenter of distributions A +def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000, + stopThr=1e-4, verbose=False, log=False): + r"""Compute the entropic regularized wasserstein barycenter of distributions A + with stabilization. + + The function solves the following optimization problem: + + .. 
math:: + \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + + where : + + - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance (see ot.bregman.sinkhorn) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT + + The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [3]_ + + Parameters + ---------- + A : ndarray, shape (dim, n_hists) + n_hists training distributions a_i of size dim + M : ndarray, shape (dim, dim) + loss matrix for OT + reg : float + Regularization term > 0 + tau : float + thershold for max value in u or v for log scaling + weights : ndarray, shape (n_hists,) + Weights of each histogram a_i on the simplex (barycentric coodinates) + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + a : (dim,) ndarray + Wasserstein barycenter + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. (2015). Iterative Bregman projections for regularized transportation problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. + + """ + + dim, n_hists = A.shape + if weights is None: + weights = np.ones(n_hists) / n_hists + else: + assert(len(weights) == A.shape[1]) + + if log: + log = {'err': []} + + u = np.ones((dim, n_hists)) / dim + v = np.ones((dim, n_hists)) / dim + + # print(reg) + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + cpt = 0 + err = 1. + alpha = np.zeros(dim) + beta = np.zeros(dim) + q = np.ones(dim) / dim + while (err > stopThr and cpt < numItermax): + qprev = q + Kv = K.dot(v) + u = A / (Kv + 1e-16) + Ktu = K.T.dot(u) + q = geometricBar(weights, Ktu) + Q = q[:, None] + v = Q / (Ktu + 1e-16) + absorbing = False + if (u > tau).any() or (v > tau).any(): + absorbing = True + alpha = alpha + reg * np.log(np.max(u, 1)) + beta = beta + reg * np.log(np.max(v, 1)) + K = np.exp((alpha[:, None] + beta[None, :] - + M) / reg) + v = np.ones_like(v) + Kv = K.dot(v) + if (np.any(Ktu == 0.) + or np.any(np.isnan(u)) or np.any(np.isnan(v)) + or np.any(np.isinf(u)) or np.any(np.isinf(v))): + # we have reached the machine precision + # come back to previous solution and quit loop + warnings.warn('Numerical errors at iteration %s' % cpt) + q = qprev + break + if (cpt % 10 == 0 and not absorbing) or cpt == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = abs(u * Kv - A).max() + if log: + log['err'].append(err) + if verbose: + if cpt % 50 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt += 1 + if err > stopThr: + warnings.warn("Stabilized Unbalanced Sinkhorn did not converge." 
+ + "Try a larger entropy `reg`" + + "Or a larger absorption threshold `tau`.") + if log: + log['niter'] = cpt + log['logu'] = np.log(u + 1e-16) + log['logv'] = np.log(v + 1e-16) + return q, log + else: + return q + + +def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, + stopThr=1e-9, stabThr=1e-30, verbose=False, + log=False): + r"""Compute the entropic regularized wasserstein barycenter of distributions A where A is a collection of 2D images. The function solves the following optimization problem: @@ -1087,16 +1296,16 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1 Parameters ---------- - A : np.ndarray (n,w,h) - n distributions (2D images) of size w x h + A : ndarray, shape (n_hists, width, height) + n distributions (2D images) of size width x height reg : float Regularization term >0 - weights : np.ndarray (n,) + weights : ndarray, shape (n_hists,) Weights of each image on the simplex (barycentric coodinates) numItermax : int, optional Max number of iterations stopThr : float, optional - Stop threshol on error (>0) + Stop threshol on error (> 0) stabThr : float, optional Stabilization threshold to avoid numerical precision issue verbose : bool, optional @@ -1104,15 +1313,13 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1 log : bool, optional record log if True - Returns ------- - a : (w,h) ndarray + a : ndarray, shape (width, height) 2D Wasserstein barycenter log : dict log dictionary return only if log==True in parameters - References ---------- @@ -1180,7 +1387,7 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1 def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, stopThr=1e-3, verbose=False, log=False): - """ + r""" Compute the unmixing of an observation with a given dictionary using Wasserstein distance The function solve the following optimization problem: @@ -1192,9 +1399,12 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, where : - :math:`W_{M,reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance with M loss matrix (see ot.bregman.sinkhorn) - - :math:`\mathbf{a}` is an observed distribution, :math:`\mathbf{h}_0` is aprior on unmixing - - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix for OT data fitting - - reg0 and :math:`\mathbf{M0}` are respectively the regularization term and the cost matrix for regularization + - :math: `\mathbf{D}` is a dictionary of `n_atoms` atoms of dimension `dim_a`, its expected shape is `(dim_a, n_atoms)` + - :math:`\mathbf{h}` is the estimated unmixing of dimension `n_atoms` + - :math:`\mathbf{a}` is an observed distribution of dimension `dim_a` + - :math:`\mathbf{h}_0` is a prior on `h` of dimension `dim_prior` + - reg and :math:`\mathbf{M}` are respectively the regularization term and the cost matrix (dim_a, dim_a) for OT data fitting + - reg0 and :math:`\mathbf{M0}` are respectively the regularization term and the cost matrix (dim_prior, n_atoms) regularization - :math:`\\alpha`weight data fitting and regularization The optimization problem is solved suing the algorithm described in [4] @@ -1202,16 +1412,16 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, Parameters ---------- - a : np.ndarray (d) - observed distribution - D : np.ndarray (d,n) + a : ndarray, shape (dim_a) + observed distribution (histogram, sums to 1) + D : ndarray, shape (dim_a, n_atoms) dictionary matrix - M : np.ndarray (d,d) + M : ndarray, shape (dim_a, 
dim_a) loss matrix - M0 : np.ndarray (n,n) + M0 : ndarray, shape (n_atoms, dim_prior) loss matrix - h0 : np.ndarray (n,) - prior on h + h0 : ndarray, shape (n_atoms,) + prior on the estimated unmixing h reg : float Regularization term >0 (Wasserstein data fitting) reg0 : float @@ -1230,7 +1440,7 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, Returns ------- - a : (d,) ndarray + h : ndarray, shape (n_atoms,) Wasserstein barycenter log : dict log dictionary return only if log==True in parameters @@ -1284,3 +1494,635 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, return np.sum(K0, axis=1), log else: return np.sum(K0, axis=1) + + +def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', + numIterMax=10000, stopThr=1e-9, verbose=False, + log=False, **kwargs): + r''' + Solve the entropic regularization optimal transport problem and return the + OT matrix from empirical data + + The function solves the following optimization problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + s.t. \gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + where : + + - :math:`M` is the (n_samples_a, n_samples_b) metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`a` and :math:`b` are source and target weights (sum to 1) + + + Parameters + ---------- + X_s : ndarray, shape (n_samples_a, dim) + samples in the source domain + X_t : ndarray, shape (n_samples_b, dim) + samples in the target domain + reg : float + Regularization term >0 + a : ndarray, shape (n_samples_a,) + samples weights in the source domain + b : ndarray, shape (n_samples_b,) + samples weights in the target domain + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + gamma : ndarray, shape (n_samples_a, n_samples_b) + Regularized optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + Examples + -------- + + >>> n_samples_a = 2 + >>> n_samples_b = 2 + >>> reg = 0.1 + >>> X_s = np.reshape(np.arange(n_samples_a), (n_samples_a, 1)) + >>> X_t = np.reshape(np.arange(0, n_samples_b), (n_samples_b, 1)) + >>> empirical_sinkhorn(X_s, X_t, reg, verbose=False) # doctest: +NORMALIZE_WHITESPACE + array([[4.99977301e-01, 2.26989344e-05], + [2.26989344e-05, 4.99977301e-01]]) + + + References + ---------- + + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + + .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519. + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. 
+ ''' + + if a is None: + a = unif(np.shape(X_s)[0]) + if b is None: + b = unif(np.shape(X_t)[0]) + + M = dist(X_s, X_t, metric=metric) + + if log: + pi, log = sinkhorn(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=True, **kwargs) + return pi, log + else: + pi = sinkhorn(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=False, **kwargs) + return pi + + +def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): + r''' + Solve the entropic regularization optimal transport problem from empirical + data and return the OT loss + + + The function solves the following optimization problem: + + .. math:: + W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + s.t. \gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + where : + + - :math:`M` is the (n_samples_a, n_samples_b) metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`a` and :math:`b` are source and target weights (sum to 1) + + + Parameters + ---------- + X_s : ndarray, shape (n_samples_a, dim) + samples in the source domain + X_t : ndarray, shape (n_samples_b, dim) + samples in the target domain + reg : float + Regularization term >0 + a : ndarray, shape (n_samples_a,) + samples weights in the source domain + b : ndarray, shape (n_samples_b,) + samples weights in the target domain + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + gamma : ndarray, shape (n_samples_a, n_samples_b) + Regularized optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + Examples + -------- + + >>> n_samples_a = 2 + >>> n_samples_b = 2 + >>> reg = 0.1 + >>> X_s = np.reshape(np.arange(n_samples_a), (n_samples_a, 1)) + >>> X_t = np.reshape(np.arange(0, n_samples_b), (n_samples_b, 1)) + >>> empirical_sinkhorn2(X_s, X_t, reg, verbose=False) + array([4.53978687e-05]) + + + References + ---------- + + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + + .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519. + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. + ''' + + if a is None: + a = unif(np.shape(X_s)[0]) + if b is None: + b = unif(np.shape(X_t)[0]) + + M = dist(X_s, X_t, metric=metric) + + if log: + sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_loss, log + else: + sinkhorn_loss = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_loss + + +def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): + r''' + Compute the sinkhorn divergence loss from empirical data + + The function solves the following optimization problems and return the + sinkhorn divergence :math:`S`: + + .. 
math:: + + W &= \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + W_a &= \min_{\gamma_a} <\gamma_a,M_a>_F + reg\cdot\Omega(\gamma_a) + + W_b &= \min_{\gamma_b} <\gamma_b,M_b>_F + reg\cdot\Omega(\gamma_b) + + S &= W - 1/2 * (W_a + W_b) + + .. math:: + s.t. \gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + + \gamma_a 1 = a + + \gamma_a^T 1= a + + \gamma_a\geq 0 + + \gamma_b 1 = b + + \gamma_b^T 1= b + + \gamma_b\geq 0 + where : + + - :math:`M` (resp. :math:`M_a, M_b`) is the (n_samples_a, n_samples_b) metric cost matrix (resp (n_samples_a, n_samples_a) and (n_samples_b, n_samples_b)) + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`a` and :math:`b` are source and target weights (sum to 1) + + + Parameters + ---------- + X_s : ndarray, shape (n_samples_a, dim) + samples in the source domain + X_t : ndarray, shape (n_samples_b, dim) + samples in the target domain + reg : float + Regularization term >0 + a : ndarray, shape (n_samples_a,) + samples weights in the source domain + b : ndarray, shape (n_samples_b,) + samples weights in the target domain + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + Returns + ------- + gamma : ndarray, shape (n_samples_a, n_samples_b) + Regularized optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + Examples + -------- + >>> n_samples_a = 2 + >>> n_samples_b = 4 + >>> reg = 0.1 + >>> X_s = np.reshape(np.arange(n_samples_a), (n_samples_a, 1)) + >>> X_t = np.reshape(np.arange(0, n_samples_b), (n_samples_b, 1)) + >>> empirical_sinkhorn_divergence(X_s, X_t, reg) # doctest: +ELLIPSIS + array([1.499...]) + + + References + ---------- + .. 
[23] Aude Genevay, Gabriel Peyré, Marco Cuturi, Learning Generative Models with Sinkhorn Divergences, Proceedings of the Twenty-First International Conference on Artficial Intelligence and Statistics, (AISTATS) 21, 2018 + ''' + if log: + sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) + + log = {} + log['sinkhorn_loss_ab'] = sinkhorn_loss_ab + log['sinkhorn_loss_a'] = sinkhorn_loss_a + log['sinkhorn_loss_b'] = sinkhorn_loss_b + log['log_sinkhorn_ab'] = log_ab + log['log_sinkhorn_a'] = log_a + log['log_sinkhorn_b'] = log_b + + return max(0, sinkhorn_div), log + + else: + sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) + return max(0, sinkhorn_div) + + +def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, restricted=True, + maxiter=10000, maxfun=10000, pgtol=1e-09, verbose=False, log=False): + r"""" + Screening Sinkhorn Algorithm for Regularized Optimal Transport + + The function solves an approximate dual of Sinkhorn divergence [2] which is written as the following optimization problem: + + ..math:: + (u, v) = \argmin_{u, v} 1_{ns}^T B(u,v) 1_{nt} - <\kappa u, a> - <v/\kappa, b> + + where B(u,v) = \diag(e^u) K \diag(e^v), with K = e^{-M/reg} and + + s.t. 
e^{u_i} \geq \epsilon / \kappa, for all i \in {1, ..., ns} + + e^{v_j} \geq \epsilon \kappa, for all j \in {1, ..., nt} + + The parameters \kappa and \epsilon are determined w.r.t the couple number budget of points (ns_budget, nt_budget), see Equation (5) in [26] + + + Parameters + ---------- + a : `numpy.ndarray`, shape=(ns,) + samples weights in the source domain + + b : `numpy.ndarray`, shape=(nt,) + samples weights in the target domain + + M : `numpy.ndarray`, shape=(ns, nt) + Cost matrix + + reg : `float` + Level of the entropy regularisation + + ns_budget : `int`, deafult=None + Number budget of points to be keeped in the source domain + If it is None then 50% of the source sample points will be keeped + + nt_budget : `int`, deafult=None + Number budget of points to be keeped in the target domain + If it is None then 50% of the target sample points will be keeped + + uniform : `bool`, default=False + If `True`, the source and target distribution are supposed to be uniform, i.e., a_i = 1 / ns and b_j = 1 / nt + + restricted : `bool`, default=True + If `True`, a warm-start initialization for the L-BFGS-B solver + using a restricted Sinkhorn algorithm with at most 5 iterations + + maxiter : `int`, default=10000 + Maximum number of iterations in LBFGS solver + + maxfun : `int`, default=10000 + Maximum number of function evaluations in LBFGS solver + + pgtol : `float`, default=1e-09 + Final objective function accuracy in LBFGS solver + + verbose : `bool`, default=False + If `True`, dispaly informations about the cardinals of the active sets and the paramerters kappa + and epsilon + + Dependency + ---------- + To gain more efficiency, screenkhorn needs to call the "Bottleneck" package (https://pypi.org/project/Bottleneck/) + in the screening pre-processing step. If Bottleneck isn't installed, the following error message appears: + "Bottleneck module doesn't exist. Install it from https://pypi.org/project/Bottleneck/" + + + Returns + ------- + gamma : `numpy.ndarray`, shape=(ns, nt) + Screened optimal transportation matrix for the given parameters + + log : `dict`, default=False + Log dictionary return only if log==True in parameters + + + References + ----------- + .. [26] Alaya M. Z., Bérar M., Gasso G., Rakotomamonjy A. (2019). Screening Sinkhorn Algorithm for Regularized Optimal Transport (NIPS) 33, 2019 + + """ + # check if bottleneck module exists + try: + import bottleneck + except ImportError: + warnings.warn("Bottleneck module is not installed. 
Install it from https://pypi.org/project/Bottleneck/ for better performance.") + bottleneck = np + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + ns, nt = M.shape + + # by default, we keep only 50% of the sample data points + if ns_budget is None: + ns_budget = int(np.floor(0.5 * ns)) + if nt_budget is None: + nt_budget = int(np.floor(0.5 * nt)) + + # calculate the Gibbs kernel + K = np.empty_like(M) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + def projection(u, epsilon): + u[u <= epsilon] = epsilon + return u + + # ----------------------------------------------------------------------------------------------------------------# + # Step 1: Screening pre-processing # + # ----------------------------------------------------------------------------------------------------------------# + + if ns_budget == ns and nt_budget == nt: + # full number of budget points (ns, nt) = (ns_budget, nt_budget) + Isel = np.ones(ns, dtype=bool) + Jsel = np.ones(nt, dtype=bool) + epsilon = 0.0 + kappa = 1.0 + + cst_u = 0. + cst_v = 0. + + bounds_u = [(0.0, np.inf)] * ns + bounds_v = [(0.0, np.inf)] * nt + + a_I = a + b_J = b + K_IJ = K + K_IJc = [] + K_IcJ = [] + + vec_eps_IJc = np.zeros(nt) + vec_eps_IcJ = np.zeros(ns) + + else: + # sum of rows and columns of K + K_sum_cols = K.sum(axis=1) + K_sum_rows = K.sum(axis=0) + + if uniform: + if ns / ns_budget < 4: + aK_sort = np.sort(K_sum_cols) + epsilon_u_square = a[0] / aK_sort[ns_budget - 1] + else: + aK_sort = bottleneck.partition(K_sum_cols, ns_budget - 1)[ns_budget - 1] + epsilon_u_square = a[0] / aK_sort + + if nt / nt_budget < 4: + bK_sort = np.sort(K_sum_rows) + epsilon_v_square = b[0] / bK_sort[nt_budget - 1] + else: + bK_sort = bottleneck.partition(K_sum_rows, nt_budget - 1)[nt_budget - 1] + epsilon_v_square = b[0] / bK_sort + else: + aK = a / K_sum_cols + bK = b / K_sum_rows + + aK_sort = np.sort(aK)[::-1] + epsilon_u_square = aK_sort[ns_budget - 1] + + bK_sort = np.sort(bK)[::-1] + epsilon_v_square = bK_sort[nt_budget - 1] + + # active sets I and J (see Lemma 1 in [26]) + Isel = a >= epsilon_u_square * K_sum_cols + Jsel = b >= epsilon_v_square * K_sum_rows + + if sum(Isel) != ns_budget: + if uniform: + aK = a / K_sum_cols + aK_sort = np.sort(aK)[::-1] + epsilon_u_square = aK_sort[ns_budget - 1:ns_budget + 1].mean() + Isel = a >= epsilon_u_square * K_sum_cols + ns_budget = sum(Isel) + + if sum(Jsel) != nt_budget: + if uniform: + bK = b / K_sum_rows + bK_sort = np.sort(bK)[::-1] + epsilon_v_square = bK_sort[nt_budget - 1:nt_budget + 1].mean() + Jsel = b >= epsilon_v_square * K_sum_rows + nt_budget = sum(Jsel) + + epsilon = (epsilon_u_square * epsilon_v_square) ** (1 / 4) + kappa = (epsilon_v_square / epsilon_u_square) ** (1 / 2) + + if verbose: + print("epsilon = %s\n" % epsilon) + print("kappa = %s\n" % kappa) + print('Cardinality of selected points: |Isel| = %s \t |Jsel| = %s \n' % (sum(Isel), sum(Jsel))) + + # Ic, Jc: complementary of the active sets I and J + Ic = ~Isel + Jc = ~Jsel + + K_IJ = K[np.ix_(Isel, Jsel)] + K_IcJ = K[np.ix_(Ic, Jsel)] + K_IJc = K[np.ix_(Isel, Jc)] + + K_min = K_IJ.min() + if K_min == 0: + K_min = np.finfo(float).tiny + + # a_I, b_J, a_Ic, b_Jc + a_I = a[Isel] + b_J = b[Jsel] + if not uniform: + a_I_min = a_I.min() + a_I_max = a_I.max() + b_J_max = b_J.max() + b_J_min = b_J.min() + else: + a_I_min = a_I[0] + a_I_max = a_I[0] + b_J_max = b_J[0] + b_J_min = b_J[0] + + # box constraints in L-BFGS-B (see Proposition 1 in [26]) + bounds_u = [(max(a_I_min / 
((nt - nt_budget) * epsilon + nt_budget * (b_J_max / ( + ns * epsilon * kappa * K_min))), epsilon / kappa), a_I_max / (nt * epsilon * K_min))] * ns_budget + + bounds_v = [(max(b_J_min / ((ns - ns_budget) * epsilon + ns_budget * (kappa * a_I_max / (nt * epsilon * K_min))), + epsilon * kappa), b_J_max / (ns * epsilon * K_min))] * nt_budget + + # pre-calculated constants for the objective + vec_eps_IJc = epsilon * kappa * (K_IJc * np.ones(nt - nt_budget).reshape((1, -1))).sum(axis=1) + vec_eps_IcJ = (epsilon / kappa) * (np.ones(ns - ns_budget).reshape((-1, 1)) * K_IcJ).sum(axis=0) + + # initialisation + u0 = np.full(ns_budget, (1. / ns_budget) + epsilon / kappa) + v0 = np.full(nt_budget, (1. / nt_budget) + epsilon * kappa) + + # pre-calculed constants for Restricted Sinkhorn (see Algorithm 1 in supplementary of [26]) + if restricted: + if ns_budget != ns or nt_budget != nt: + cst_u = kappa * epsilon * K_IJc.sum(axis=1) + cst_v = epsilon * K_IcJ.sum(axis=0) / kappa + + cpt = 1 + while cpt < 5: # 5 iterations + K_IJ_v = np.dot(K_IJ.T, u0) + cst_v + v0 = b_J / (kappa * K_IJ_v) + KIJ_u = np.dot(K_IJ, v0) + cst_u + u0 = (kappa * a_I) / KIJ_u + cpt += 1 + + u0 = projection(u0, epsilon / kappa) + v0 = projection(v0, epsilon * kappa) + + else: + u0 = u0 + v0 = v0 + + def restricted_sinkhorn(usc, vsc, max_iter=5): + """ + Restricted Sinkhorn Algorithm as a warm-start initialized point for L-BFGS-B (see Algorithm 1 in supplementary of [26]) + """ + cpt = 1 + while cpt < max_iter: + K_IJ_v = np.dot(K_IJ.T, usc) + cst_v + vsc = b_J / (kappa * K_IJ_v) + KIJ_u = np.dot(K_IJ, vsc) + cst_u + usc = (kappa * a_I) / KIJ_u + cpt += 1 + + usc = projection(usc, epsilon / kappa) + vsc = projection(vsc, epsilon * kappa) + + return usc, vsc + + def screened_obj(usc, vsc): + part_IJ = np.dot(np.dot(usc, K_IJ), vsc) - kappa * np.dot(a_I, np.log(usc)) - (1. / kappa) * np.dot(b_J, np.log(vsc)) + part_IJc = np.dot(usc, vec_eps_IJc) + part_IcJ = np.dot(vec_eps_IcJ, vsc) + psi_epsilon = part_IJ + part_IJc + part_IcJ + return psi_epsilon + + def screened_grad(usc, vsc): + # gradients of Psi_(kappa,epsilon) w.r.t u and v + grad_u = np.dot(K_IJ, vsc) + vec_eps_IJc - kappa * a_I / usc + grad_v = np.dot(K_IJ.T, usc) + vec_eps_IcJ - (1. 
/ kappa) * b_J / vsc + return grad_u, grad_v + + def bfgspost(theta): + u = theta[:ns_budget] + v = theta[ns_budget:] + # objective + f = screened_obj(u, v) + # gradient + g_u, g_v = screened_grad(u, v) + g = np.hstack([g_u, g_v]) + return f, g + + #----------------------------------------------------------------------------------------------------------------# + # Step 2: L-BFGS-B solver # + #----------------------------------------------------------------------------------------------------------------# + + u0, v0 = restricted_sinkhorn(u0, v0) + theta0 = np.hstack([u0, v0]) + + bounds = bounds_u + bounds_v # constraint bounds + + def obj(theta): + return bfgspost(theta) + + theta, _, _ = fmin_l_bfgs_b(func=obj, + x0=theta0, + bounds=bounds, + maxfun=maxfun, + pgtol=pgtol, + maxiter=maxiter) + + usc = theta[:ns_budget] + vsc = theta[ns_budget:] + + usc_full = np.full(ns, epsilon / kappa) + vsc_full = np.full(nt, epsilon * kappa) + usc_full[Isel] = usc + vsc_full[Jsel] = vsc + + if log: + log = {} + log['u'] = usc_full + log['v'] = vsc_full + log['Isel'] = Isel + log['Jsel'] = Jsel + + gamma = usc_full[:, None] * K * vsc_full[None, :] + gamma = gamma / gamma.sum() + + if log: + return gamma, log + else: + return gamma @@ -6,6 +6,7 @@ Domain adaptation with optimal transport # Author: Remi Flamary <remi.flamary@unice.fr> # Nicolas Courty <ncourty@irisa.fr> # Michael Perrot <michael.perrot@univ-st-etienne.fr> +# Nathalie Gayraud <nat.gayraud@gmail.com> # # License: MIT License @@ -16,6 +17,7 @@ from .bregman import sinkhorn from .lp import emd from .utils import unif, dist, kernel, cost_normalization from .utils import check_params, BaseEstimator +from .unbalanced import sinkhorn_unbalanced from .optim import cg from .optim import gcg @@ -41,15 +43,15 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, where : - M is the (ns,nt) metric cost matrix - - :math:`\Omega_e` is the entropic regularization term - :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - :math:`\Omega_g` is the group lasso regulaization term + - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e + (\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\Omega_g` is the group lasso regularization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1` where :math:`\mathcal{I}_c` are the index of samples from class c in the source domain. - a and b are source and target weights (sum to 1) - The algorithm used for solving the problem is the generalised conditional + The algorithm used for solving the problem is the generalized conditional gradient as proposed in [5]_ [7]_ @@ -473,22 +475,24 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', Weight for the linear OT loss (>0) eta : float, optional Regularization term for the linear mapping L (>0) - bias : bool,optional - Estimate linear mapping with constant bias kerneltype : str,optional kernel used by calling function ot.utils.kernel (gaussian by default) sigma : float, optional Gaussian kernel bandwidth. 
+ bias : bool,optional + Estimate linear mapping with constant bias + verbose : bool, optional + Print information along iterations + verbose2 : bool, optional + Print information along iterations numItermax : int, optional Max number of BCD iterations - stopThr : float, optional - Stop threshold on relative loss decrease (>0) numInnerItermax : int, optional Max number of iterations (inner CG solver) stopInnerThr : float, optional Stop threshold on error (inner CG solver) (>0) - verbose : bool, optional - Print information along iterations + stopThr : float, optional + Stop threshold on relative loss decrease (>0) log : bool, optional record log if True @@ -643,7 +647,8 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, The function estimates the optimal linear operator that aligns the two empirical distributions. This is equivalent to estimating the closed form mapping between two Gaussian distributions :math:`N(\mu_s,\Sigma_s)` - and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in remark 2.29 in [15]. + and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in remark + 2.29 in [15]. The linear operator from source to target :math:`M` @@ -1184,25 +1189,25 @@ class SinkhornTransport(BaseTransport): algorithm if no it has not converged tol : float, optional (default=10e-9) The precision required to stop the optimization algorithm. - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. + verbose : bool, optional (default=False) + Controls the verbosity of the optimization algorithm + log : int, optional (default=False) + Controls the logs of the optimization algorithm metric : string, optional (default="sqeuclidean") The ground metric for the Wasserstein problem norm : string, optional (default=None) If given, normalize the ground metric to avoid numerical errors that can occur with large metric values. - distribution : string, optional (default="uniform") + distribution_estimation : callable, optional (defaults to the uniform) The kind of distribution estimation to employ - verbose : int, optional (default=0) - Controls the verbosity of the optimization algorithm - log : int, optional (default=0) - Controls the logs of the optimization algorithm + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (defaul=np.infty) Controls the semi supervised mode. Transport between labeled source - and target samples of different classes will exhibit an infinite cost + and target samples of different classes will exhibit an cost defined + by this variable Attributes ---------- @@ -1287,22 +1292,19 @@ class EMDTransport(BaseTransport): Parameters ---------- - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. metric : string, optional (default="sqeuclidean") The ground metric for the Wasserstein problem norm : string, optional (default=None) If given, normalize the ground metric to avoid numerical errors that can occur with large metric values. 
- distribution : string, optional (default="uniform") - The kind of distribution estimation to employ - verbose : int, optional (default=0) - Controls the verbosity of the optimization algorithm - log : int, optional (default=0) + log : int, optional (default=False) Controls the logs of the optimization algorithm + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1387,28 +1389,32 @@ class SinkhornLpl1Transport(BaseTransport): Entropic regularization parameter reg_cl : float, optional (default=0.1) Class regularization parameter - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. - metric : string, optional (default="sqeuclidean") - The ground metric for the Wasserstein problem - norm : string, optional (default=None) - If given, normalize the ground metric to avoid numerical errors that - can occur with large metric values. - distribution : string, optional (default="uniform") - The kind of distribution estimation to employ max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization algorithm if no it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop - verbose : int, optional (default=0) + log : bool, optional (default=False) + Controls the logs of the optimization algorithm + tol : float, optional (default=10e-9) + Stop threshold on error (inner sinkhorn solver) (>0) + verbose : bool, optional (default=False) Controls the verbosity of the optimization algorithm + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (defaul=np.infty) Controls the semi supervised mode. Transport between labeled source - and target samples of different classes will exhibit an infinite cost + and target samples of different classes will exhibit a cost defined by + limit_max. Attributes ---------- @@ -1504,27 +1510,28 @@ class SinkhornL1l2Transport(BaseTransport): Entropic regularization parameter reg_cl : float, optional (default=0.1) Class regularization parameter - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. 
- metric : string, optional (default="sqeuclidean") - The ground metric for the Wasserstein problem - norm : string, optional (default=None) - If given, normalize the ground metric to avoid numerical errors that - can occur with large metric values. - distribution : string, optional (default="uniform") - The kind of distribution estimation to employ max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization algorithm if no it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop - verbose : int, optional (default=0) + tol : float, optional (default=10e-9) + Stop threshold on error (inner sinkhorn solver) (>0) + verbose : bool, optional (default=False) Controls the verbosity of the optimization algorithm - log : int, optional (default=0) + log : bool, optional (default=False) Controls the logs of the optimization algorithm + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1646,10 +1653,12 @@ class MappingTransport(BaseEstimator): Max number of iterations (inner CG solver) inner_tol : float, optional (default=1e-6) Stop threshold on error (inner CG solver) (>0) - verbose : bool, optional (default=False) - Print information along iterations log : bool, optional (default=False) record log if True + verbose : bool, optional (default=False) + Print information along iterations + verbose2 : bool, optional (default=False) + Print information along iterations Attributes ---------- @@ -1786,3 +1795,122 @@ class MappingTransport(BaseEstimator): transp_Xs = K.dot(self.mapping_) return transp_Xs + + +class UnbalancedSinkhornTransport(BaseTransport): + + """Domain Adapatation unbalanced OT method based on sinkhorn algorithm + + Parameters + ---------- + reg_e : float, optional (default=1) + Entropic regularization parameter + reg_m : float, optional (default=0.1) + Mass regularization parameter + method : str + method used for the solver either 'sinkhorn', 'sinkhorn_stabilized' or + 'sinkhorn_epsilon_scaling', see those function for specific parameters + max_iter : int, float, optional (default=10) + The minimum number of iteration before stopping the optimization + algorithm if no it has not converged + tol : float, optional (default=10e-9) + Stop threshold on error (inner sinkhorn solver) (>0) + verbose : bool, optional (default=False) + Controls the verbosity of the optimization algorithm + log : bool, optional (default=False) + Controls the logs of the optimization algorithm + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. 
+ distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. + limit_max: float, optional (default=10) + Controls the semi supervised mode. Transport between labeled source + and target samples of different classes will exhibit an infinite cost + (10 times the maximum value of the cost matrix) + + Attributes + ---------- + coupling_ : array-like, shape (n_source_samples, n_target_samples) + The optimal coupling + log_ : dictionary + The dictionary of log, empty dic if parameter log is not True + + References + ---------- + + .. [1] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprint + arXiv:1607.05816. + + """ + + def __init__(self, reg_e=1., reg_m=0.1, method='sinkhorn', + max_iter=10, tol=1e-9, verbose=False, log=False, + metric="sqeuclidean", norm=None, + distribution_estimation=distribution_estimation_uniform, + out_of_sample_map='ferradans', limit_max=10): + + self.reg_e = reg_e + self.reg_m = reg_m + self.method = method + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.log = log + self.metric = metric + self.norm = norm + self.distribution_estimation = distribution_estimation + self.out_of_sample_map = out_of_sample_map + self.limit_max = limit_max + + def fit(self, Xs, ys=None, Xt=None, yt=None): + """Build a coupling matrix from source and target sets of samples + (Xs, ys) and (Xt, yt) + + Parameters + ---------- + Xs : array-like, shape (n_source_samples, n_features) + The training input samples. + ys : array-like, shape (n_source_samples,) + The class labels + Xt : array-like, shape (n_target_samples, n_features) + The training input samples. + yt : array-like, shape (n_target_samples,) + The class labels. If some target samples are unlabeled, fill the + yt's elements with -1. + + Warning: Note that, due to this convention -1 cannot be used as a + class label + + Returns + ------- + self : object + Returns self. 
+ """ + + # check the necessary inputs parameters are here + if check_params(Xs=Xs, Xt=Xt): + + super(UnbalancedSinkhornTransport, self).fit(Xs, ys, Xt, yt) + + returned_ = sinkhorn_unbalanced( + a=self.mu_s, b=self.mu_t, M=self.cost_, + reg=self.reg_e, reg_m=self.reg_m, method=self.method, + numItermax=self.max_iter, stopThr=self.tol, + verbose=self.verbose, log=self.log) + + # deal with the value of log + if self.log: + self.coupling_, self.log_ = returned_ + else: + self.coupling_ = returned_ + self.log_ = dict() + + return self diff --git a/ot/datasets.py b/ot/datasets.py index e76e75d..ba0cfd9 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -17,7 +17,6 @@ def make_1D_gauss(n, m, s): Parameters ---------- - n : int number of bins in the histogram m : float @@ -25,12 +24,10 @@ def make_1D_gauss(n, m, s): s : float standard deviaton of the gaussian distribution - Returns ------- - h : np.array (n,) - 1D histogram for a gaussian distribution - + h : ndarray (n,) + 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) h = np.exp(-(x - m)**2 / (2 * s**2)) @@ -44,16 +41,15 @@ def get_1D_gauss(n, m, sigma): def make_2D_samples_gauss(n, m, sigma, random_state=None): - """return n samples drawn from 2D gaussian N(m,sigma) + """Return n samples drawn from 2D gaussian N(m,sigma) Parameters ---------- - n : int number of samples to make - m : np.array (2,) + m : ndarray, shape (2,) mean value of the gaussian distribution - sigma : np.array (2,2) + sigma : ndarray, shape (2, 2) covariance matrix of the gaussian distribution random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; @@ -63,9 +59,8 @@ def make_2D_samples_gauss(n, m, sigma, random_state=None): Returns ------- - X : np.array (n,2) - n samples drawn from N(m,sigma) - + X : ndarray, shape (n, 2) + n samples drawn from N(m, sigma). """ generator = check_random_state(random_state) @@ -86,11 +81,10 @@ def get_2D_samples_gauss(n, m, sigma, random_state=None): def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): - """ dataset generation for classification problems + """Dataset generation for classification problems Parameters ---------- - dataset : str type of classification problem (see code) n : int @@ -105,13 +99,11 @@ def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): Returns ------- - X : np.array (n,d) - n observation of size d - y : np.array (n,) - labels of the samples - + X : ndarray, shape (n, d) + n observation of size d + y : ndarray, shape (n,) + labels of the samples. """ - generator = check_random_state(random_state) if dataset.lower() == '3gauss': @@ -1,6 +1,12 @@ # -*- coding: utf-8 -*- """ Dimension reduction with optimal transport + + +.. warning:: + Note that by default the module is not import in :mod:`ot`. In order to + use it you need to explicitely import :mod:`ot.dr` + """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -43,30 +49,25 @@ def split_classes(X, y): def fda(X, y, p=2, reg=1e-16): - """ - Fisher Discriminant Analysis - + """Fisher Discriminant Analysis Parameters ---------- - X : numpy.ndarray (n,d) - Training samples - y : np.ndarray (n,) - labels for training samples + X : ndarray, shape (n, d) + Training samples. + y : ndarray, shape (n,) + Labels for training samples. p : int, optional - size of dimensionnality reduction + Size of dimensionnality reduction. 
reg : float, optional Regularization term >0 (ridge regularization) - Returns ------- - P : (d x p) ndarray + P : ndarray, shape (d, p) Optimal transportation matrix for the given parameters - proj : fun + proj : callable projection function including mean centering - - """ mx = np.mean(X) @@ -124,37 +125,33 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None): Parameters ---------- - X : numpy.ndarray (n,d) - Training samples - y : np.ndarray (n,) - labels for training samples + X : ndarray, shape (n, d) + Training samples. + y : ndarray, shape (n,) + Labels for training samples. p : int, optional - size of dimensionnality reduction + Size of dimensionnality reduction. reg : float, optional Regularization term >0 (entropic regularization) - solver : str, optional - None for steepest decsent or 'TrustRegions' for trust regions algorithm - else shoudl be a pymanopt.solvers - P0 : numpy.ndarray (d,p) - Initial starting point for projection + solver : None | str, optional + None for steepest descent or 'TrustRegions' for trust regions algorithm + else should be a pymanopt.solvers + P0 : ndarray, shape (d, p) + Initial starting point for projection. verbose : int, optional - Print information along iterations - - + Print information along iterations. Returns ------- - P : (d x p) ndarray + P : ndarray, shape (d, p) Optimal transportation matrix for the given parameters - proj : fun - projection function including mean centering - + proj : callable + Projection function including mean centering. References ---------- - - .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). Wasserstein Discriminant Analysis. arXiv preprint arXiv:1608.08063. - + .. [11] Flamary, R., Cuturi, M., Courty, N., & Rakotomamonjy, A. (2016). + Wasserstein Discriminant Analysis. arXiv preprint arXiv:1608.08063. 
""" # noqa mx = np.mean(X) diff --git a/ot/externals/funcsigs.py b/ot/externals/funcsigs.py index c73fdc9..106bde7 100644 --- a/ot/externals/funcsigs.py +++ b/ot/externals/funcsigs.py @@ -126,8 +126,8 @@ def signature(obj): new_params[arg_name] = param.replace(default=arg_value, _partial_kwarg=True) - elif (param.kind not in (_VAR_KEYWORD, _VAR_POSITIONAL) and - not param._partial_kwarg): + elif (param.kind not in (_VAR_KEYWORD, _VAR_POSITIONAL) + and not param._partial_kwarg): new_params.pop(arg_name) return sig.replace(parameters=new_params.values()) @@ -333,11 +333,11 @@ class Parameter(object): raise TypeError(msg) def __eq__(self, other): - return (issubclass(other.__class__, Parameter) and - self._name == other._name and - self._kind == other._kind and - self._default == other._default and - self._annotation == other._annotation) + return (issubclass(other.__class__, Parameter) + and self._name == other._name + and self._kind == other._kind + and self._default == other._default + and self._annotation == other._annotation) def __ne__(self, other): return not self.__eq__(other) @@ -372,8 +372,8 @@ class BoundArguments(object): def args(self): args = [] for param_name, param in self._signature.parameters.items(): - if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) or - param._partial_kwarg): + if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) + or param._partial_kwarg): # Keyword arguments mapped by 'functools.partial' # (Parameter._partial_kwarg is True) are mapped # in 'BoundArguments.kwargs', along with VAR_KEYWORD & @@ -402,8 +402,8 @@ class BoundArguments(object): kwargs_started = False for param_name, param in self._signature.parameters.items(): if not kwargs_started: - if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) or - param._partial_kwarg): + if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) + or param._partial_kwarg): kwargs_started = True else: if param_name not in self.arguments: @@ -432,9 +432,9 @@ class BoundArguments(object): raise TypeError(msg) def __eq__(self, other): - return (issubclass(other.__class__, BoundArguments) and - self.signature == other.signature and - self.arguments == other.arguments) + return (issubclass(other.__class__, BoundArguments) + and self.signature == other.signature + and self.arguments == other.arguments) def __ne__(self, other): return not self.__eq__(other) @@ -612,9 +612,9 @@ class Signature(object): raise TypeError(msg) def __eq__(self, other): - if (not issubclass(type(other), Signature) or - self.return_annotation != other.return_annotation or - len(self.parameters) != len(other.parameters)): + if (not issubclass(type(other), Signature) + or self.return_annotation != other.return_annotation + or len(self.parameters) != len(other.parameters)): return False other_positions = dict((param, idx) @@ -635,8 +635,8 @@ class Signature(object): except KeyError: return False else: - if (idx != other_idx or - param != other.parameters[param_name]): + if (idx != other_idx + or param != other.parameters[param_name]): return False return True @@ -688,8 +688,8 @@ class Signature(object): raise TypeError(msg) parameters_ex = (param,) break - elif (param.kind == _VAR_KEYWORD or - param.default is not _empty): + elif (param.kind == _VAR_KEYWORD + or param.default is not _empty): # That's fine too - we have a default value for this # parameter. 
So, lets start parsing `kwargs`, starting # with the current parameter @@ -755,8 +755,8 @@ class Signature(object): # if it has a default value, or it is an '*args'-like # parameter, left alone by the processing of positional # arguments. - if (not partial and param.kind != _VAR_POSITIONAL and - param.default is _empty): + if (not partial and param.kind != _VAR_POSITIONAL + and param.default is _empty): raise TypeError('{arg!r} parameter lacking default value'. format(arg=param_name)) diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py index deda6b1..1ab95bb 100644 --- a/ot/gpu/__init__.py +++ b/ot/gpu/__init__.py @@ -5,11 +5,15 @@ This module provides GPU implementation for several OT solvers and utility functions. The GPU backend in handled by `cupy <https://cupy.chainer.org/>`_. +.. warning:: + Note that by default the module is not import in :mod:`ot`. In order to + use it you need to explicitely import :mod:`ot.gpu` . + By default, the functions in this module accept and return numpy arrays in order to proide drop-in replacement for the other POT function but the transfer between CPU en GPU comes with a significant overhead. -In order to get the best erformances, we recommend to give only cupy +In order to get the best performances, we recommend to give only cupy arrays to the functions and desactivate the conversion to numpy of the result of the function with parameter ``to_numpy=False``. diff --git a/ot/gpu/bregman.py b/ot/gpu/bregman.py index 978b307..2e2df83 100644 --- a/ot/gpu/bregman.py +++ b/ot/gpu/bregman.py @@ -70,17 +70,6 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, log : dict log dictionary return only if log==True in parameters - Examples - -------- - - >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) - References ---------- diff --git a/ot/gromov.py b/ot/gromov.py index 0278e99..9869341 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -1,26 +1,25 @@ -
# -*- coding: utf-8 -*-
"""
Gromov-Wasserstein transport method
-
-
"""
# Author: Erwan Vautier <erwan.vautier@gmail.com>
# Nicolas Courty <ncourty@irisa.fr>
# Rémi Flamary <remi.flamary@unice.fr>
+# Titouan Vayer <titouan.vayer@irisa.fr>
#
# License: MIT License
import numpy as np
+
from .bregman import sinkhorn
-from .utils import dist
+from .utils import dist, UndefinedParameter
from .optim import cg
-def init_matrix(C1, C2, T, p, q, loss_fun='square_loss'):
- """ Return loss matrices and tensors for Gromov-Wasserstein fast computation
+def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
+ """Return loss matrices and tensors for Gromov-Wasserstein fast computation
Returns the value of \mathcal{L}(C1,C2) \otimes T with the selected loss
function as the loss function of Gromov-Wasserstein discrepancy.
@@ -32,14 +31,14 @@ def init_matrix(C1, C2, T, p, q, loss_fun='square_loss'):
* C2 : Metric cost matrix in the target space
* T : A coupling between those two spaces
- The square-loss function L(a,b)=(1/2)*|a-b|^2 is read as :
+ The square-loss function L(a,b)=|a-b|^2 is read as :
L(a,b) = f1(a)+f2(b)-h1(a)*h2(b) with :
- * f1(a)=(a^2)/2
- * f2(b)=(b^2)/2
+ * f1(a)=(a^2)
+ * f2(b)=(b^2)
* h1(a)=a
- * h2(b)=b
+ * h2(b)=2*b
- The kl-loss function L(a,b)=(1/2)*|a-b|^2 is read as :
+ The kl-loss function L(a,b)=a*log(a/b)-a+b is read as :
L(a,b) = f1(a)+f2(b)-h1(a)*h2(b) with :
* f1(a)=a*log(a)-a
* f2(b)=b
@@ -49,44 +48,42 @@ def init_matrix(C1, C2, T, p, q, loss_fun='square_loss'):
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
+ Metric cost matrix in the target space
T : ndarray, shape (ns, nt)
- Coupling between source and target spaces
+ Coupling between source and target spaces
p : ndarray, shape (ns,)
-
Returns
-------
-
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ h2(C) matrix in Eq. (6)
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
if loss_fun == 'square_loss':
def f1(a):
- return (a**2) / 2
+ return (a**2)
def f2(b):
- return (b**2) / 2
+ return (b**2)
def h1(a):
return a
def h2(b):
- return b
+ return 2 * b
elif loss_fun == 'kl_loss':
def f1(a):
return a * np.log(a + 1e-15) - a
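
The decomposition above can be sanity-checked numerically. A minimal sketch in plain numpy (the scalars a and b are arbitrary; h2(b) = log(b) for the kl branch is taken from the function body):

    import numpy as np

    a, b = 0.7, 0.3
    # square loss: |a - b|^2 == f1(a) + f2(b) - h1(a) * h2(b)
    assert np.isclose((a - b) ** 2, a ** 2 + b ** 2 - a * (2 * b))
    # kl loss: a * log(a / b) - a + b == f1(a) + f2(b) - h1(a) * h2(b)
    assert np.isclose(a * np.log(a / b) - a + b,
                      (a * np.log(a) - a) + b - a * np.log(b))
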
@@ -112,31 +109,29 @@ def init_matrix(C1, C2, T, p, q, loss_fun='square_loss'):
def tensor_product(constC, hC1, hC2, T):
- """ Return the tensor for Gromov-Wasserstein fast computation
+ """Return the tensor for Gromov-Wasserstein fast computation
The tensor is computed as described in Proposition 1 Eq. (6) in [12].
Parameters
----------
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
-
+ h2(C) matrix in Eq. (6)
Returns
-------
-
tens : ndarray, shape (ns, nt)
- \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
+ \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
A = -np.dot(hC1, T).dot(hC2.T)
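
Proposition 1 in [12] is what makes this function cheap: the naive tensor-matrix product \sum_{k,l} L(C1_{i,k},C2_{j,l}) T_{k,l} costs O(ns^2 nt^2), while constC - hC1.T.hC2^T needs only matrix products. A minimal numerical check, assuming init_matrix and tensor_product are importable from ot.gromov as defined in this diff:

    import numpy as np
    from ot.gromov import init_matrix, tensor_product

    ns, nt = 5, 4
    p, q = np.ones(ns) / ns, np.ones(nt) / nt
    C1 = np.random.rand(ns, ns); C1 = (C1 + C1.T) / 2
    C2 = np.random.rand(nt, nt); C2 = (C2 + C2.T) / 2
    T = np.outer(p, q)  # constC assumes p, q are the marginals of T

    constC, hC1, hC2 = init_matrix(C1, C2, p, q, 'square_loss')
    tens = tensor_product(constC, hC1, hC2, T)

    # naive O(ns^2 * nt^2) evaluation of the same tensor-matrix product
    naive = np.zeros((ns, nt))
    for i in range(ns):
        for j in range(nt):
            naive[i, j] = (((C1[i][:, None] - C2[j][None, :]) ** 2) * T).sum()
    assert np.allclose(tens, naive)
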
@@ -146,32 +141,31 @@ def tensor_product(constC, hC1, hC2, T):
def gwloss(constC, hC1, hC2, T):
- """ Return the Loss for Gromov-Wasserstein
+ """Return the Loss for Gromov-Wasserstein
The loss is computed as described in Proposition 1 Eq. (6) in [12].
Parameters
----------
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ h2(C) matrix in Eq. (6)
T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ Current value of transport matrix T
Returns
-------
-
loss : float
- Gromov Wasserstein loss
+ Gromov Wasserstein loss
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
@@ -181,32 +175,31 @@ def gwloss(constC, hC1, hC2, T):
def gwggrad(constC, hC1, hC2, T):
- """ Return the gradient for Gromov-Wasserstein
+ """Return the gradient for Gromov-Wasserstein
The gradient is computed as described in Proposition 2 in [12].
Parameters
----------
constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
+ Constant C matrix in Eq. (6)
hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
+ h1(C1) matrix in Eq. (6)
hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ h2(C) matrix in Eq. (6)
T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ Current value of transport matrix T
Returns
-------
-
grad : ndarray, shape (ns, nt)
Gromov Wasserstein gradient
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
return 2 * tensor_product(constC, hC1, hC2,
@@ -220,19 +213,19 @@ def update_square_loss(p, lambdas, T, Cs):
Parameters
----------
- p : ndarray, shape (N,)
- masses in the targeted barycenter
+ p : ndarray, shape (N,)
+ Masses in the targeted barycenter.
lambdas : list of float
- list of the S spaces' weights
- T : list of S np.ndarray(ns,N)
- the S Ts couplings calculated at each iteration
+ List of the S spaces' weights.
+ T : list of S np.ndarray of shape (ns,N)
+ The S Ts couplings calculated at each iteration.
Cs : list of S ndarray, shape(ns,ns)
- Metric cost matrices
+ Metric cost matrices.
Returns
----------
- C : ndarray, shape (nt,nt)
- updated C matrix
+ C : ndarray, shape (N, N)
+ Updated C matrix.
"""
tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s])
for s in range(len(T))])
@@ -249,12 +242,12 @@ def update_kl_loss(p, lambdas, T, Cs):
Parameters
----------
p : ndarray, shape (N,)
- weights in the targeted barycenter
+ Weights in the targeted barycenter.
lambdas : list of the S spaces' weights
- T : list of S np.ndarray(ns,N)
- the S Ts couplings calculated at each iteration
+ T : list of S np.ndarray of shape (ns,N)
+ The S Ts couplings calculated at each iteration.
Cs : list of S ndarray, shape(ns,ns)
- Metric cost matrices
+ Metric cost matrices.
Returns
----------
@@ -268,34 +261,33 @@ def update_kl_loss(p, lambdas, T, Cs):
return np.exp(np.divide(tmpsum, ppt))
-def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, **kwargs):
+def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
"""
Returns the gromov-wasserstein transport between (C1,p) and (C2,q)
The function solves the following optimization problem:
.. math::
- \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
- distribution in the source space
- q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string
+ Metric cost matrix in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space
+ q : ndarray, shape (nt,)
+ Distribution in the target space
+ loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
max_iter : int, optional
@@ -306,16 +298,19 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, **kwargs):
Print information along iterations
log : bool, optional
record log if True
+ armijo : bool, optional
+ If True the step of the line-search is found via an Armijo line-search. Else the closed form is used.
+ If there are convergence issues use False.
**kwargs : dict
- parameters can be directly pased to the ot.optim.cg solver
+ parameters can be directly passed to the ot.optim.cg solver
Returns
-------
T : ndarray, shape (ns, nt)
- coupling between the two spaces that minimizes :
+ Coupling between the two spaces that minimizes:
\sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
log : dict
- convergence information and loss
+ Convergence information and loss.
References
----------
@@ -329,9 +324,7 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, **kwargs):
"""
- T = np.eye(len(p), len(q))
-
- constC, hC1, hC2 = init_matrix(C1, C2, T, p, q, loss_fun)
+ constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
G0 = p[:, None] * q[None, :]
@@ -342,43 +335,41 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, **kwargs):
return gwggrad(constC, hC1, hC2, G)
if log:
- res, log = cg(p, q, 0, 1, f, df, G0, log=True, **kwargs)
+ res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
log['gw_dist'] = gwloss(constC, hC1, hC2, res)
return res, log
else:
- return cg(p, q, 0, 1, f, df, G0, **kwargs)
+ return cg(p, q, 0, 1, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
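
A usage sketch for the solver above (sizes and point clouds are arbitrary; ot.dist and ot.unif are the toolbox utilities):

    import numpy as np
    import ot

    xs, xt = np.random.randn(20, 2), np.random.randn(30, 3)  # dimensions may differ
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1 /= C1.max(); C2 /= C2.max()
    p, q = ot.unif(20), ot.unif(30)

    # closed-form line-search by default (armijo=False)
    T = ot.gromov.gromov_wasserstein(C1, C2, p, q, 'square_loss')
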
-def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, **kwargs):
+def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
"""
Returns the gromov-wasserstein discrepancy between (C1,p) and (C2,q)
The function solves the following optimization problem:
.. math::
- \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
- distribution in the source space
+ Metric cost matrix in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space.
q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string
+ Distribution in the target space.
+ loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
-
max_iter : int, optional
Max number of iterations
tol : float, optional
@@ -387,6 +378,9 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, **kwargs):
Print information along iterations
log : bool, optional
record log if True
+ armijo : bool, optional
+ If True the step of the line-search is found via an Armijo line-search. Else the closed form is used.
+ If there are convergence issues use False.
Returns
-------
@@ -407,9 +401,88 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, **kwargs):
"""
- T = np.eye(len(p), len(q))
+ constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+
+ G0 = p[:, None] * q[None, :]
+
+ def f(G):
+ return gwloss(constC, hC1, hC2, G)
+
+ def df(G):
+ return gwggrad(constC, hC1, hC2, G)
+ res, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+ log_gw['gw_dist'] = gwloss(constC, hC1, hC2, res)
+ log_gw['T'] = res
+ if log:
+ return log_gw['gw_dist'], log_gw
+ else:
+ return log_gw['gw_dist']
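
The 2-suffixed variant returns the discrepancy value itself; continuing the sketch above:

    gw_dist, log = ot.gromov.gromov_wasserstein2(C1, C2, p, q, 'square_loss', log=True)
    # log['T'] holds the coupling behind the returned value
    assert np.isclose(gw_dist, log['gw_dist'])
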
+
+
+def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
+ """
+ Computes the FGW transport between two graphs, see [24]
+
+ .. math::
+ \gamma = arg\min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
+ L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+
+ s.t. \gamma 1 = p
+ \gamma^T 1= q
+ \gamma\geq 0
+
+ where :
+ - M is the (ns,nt) metric cost matrix
+ - :math:`f` is the regularization term (and df is its gradient)
+ - p and q are source and target weights (sum to 1)
+ - L is a loss function to account for the misfit between the similarity matrices
- constC, hC1, hC2 = init_matrix(C1, C2, T, p, q, loss_fun)
+ The algorithm used for solving the problem is conditional gradient as discussed in [24]_
+
+ Parameters
+ ----------
+ M : ndarray, shape (ns, nt)
+ Metric cost matrix between features across domains
+ C1 : ndarray, shape (ns, ns)
+ Metric cost matrix representative of the structure in the source space
+ C2 : ndarray, shape (nt, nt)
+ Metric cost matrix representative of the structure in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space
+ q : ndarray, shape (nt,)
+ Distribution in the target space
+ loss_fun : str, optional
+ Loss function used for the solver
+ max_iter : int, optional
+ Max number of iterations
+ tol : float, optional
+ Stop threshold on error (>0)
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ record log if True
+ armijo : bool, optional
+ If True the step of the line-search is found via an Armijo line-search. Else the closed form is used.
+ If there are convergence issues use False.
+ **kwargs : dict
+ parameters can be directly passed to the ot.optim.cg solver
+
+ Returns
+ -------
+ gamma : ndarray, shape (ns, nt)
+ Optimal transportation matrix for the given parameters.
+ log : dict
+ Log dictionary return only if log==True in parameters.
+
+ References
+ ----------
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ and Courty Nicolas "Optimal Transport for structured data with
+ application on graphs", International Conference on Machine Learning
+ (ICML). 2019.
+ """
+
+ constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
G0 = p[:, None] * q[None, :]
@@ -418,13 +491,95 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, **kwargs):
def df(G):
return gwggrad(constC, hC1, hC2, G)
- res, log = cg(p, q, 0, 1, f, df, G0, log=True, **kwargs)
- log['gw_dist'] = gwloss(constC, hC1, hC2, res)
- log['T'] = res
+
if log:
- return log['gw_dist'], log
+ res, log = cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+ log['fgw_dist'] = log['loss'][::-1][0]
+ return res, log
else:
- return log['gw_dist']
+ return cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
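
A usage sketch for the FGW solver: compared to plain GW it also takes a feature cost matrix M between the two sets of node features (random features here, purely for illustration):

    import numpy as np
    import ot

    ns, nt = 20, 30
    xs, xt = np.random.randn(ns, 2), np.random.randn(nt, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1 /= C1.max(); C2 /= C2.max()
    p, q = ot.unif(ns), ot.unif(nt)

    Fs, Ft = np.random.randn(ns, 5), np.random.randn(nt, 5)  # features in a shared space
    M = ot.dist(Fs, Ft)

    T, log = ot.gromov.fused_gromov_wasserstein(M, C1, C2, p, q, 'square_loss',
                                                alpha=0.5, log=True)
    print(log['fgw_dist'])
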
+
+
+def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
+ """
+ Computes the FGW distance between two graphs, see [24]
+
+ .. math::
+ \min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
+ L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+
+
+ s.t. \gamma 1 = p
+ \gamma^T 1= q
+ \gamma\geq 0
+
+ where :
+ - M is the (ns,nt) metric cost matrix
+ - :math:`f` is the regularization term (and df is its gradient)
+ - p and q are source and target weights (sum to 1)
+ - L is a loss function to account for the misfit between the similarity matrices
+ The algorithm used for solving the problem is conditional gradient as discussed in [24]_
+
+ Parameters
+ ----------
+ M : ndarray, shape (ns, nt)
+ Metric cost matrix between features across domains
+ C1 : ndarray, shape (ns, ns)
+ Metric cost matrix representative of the structure in the source space.
+ C2 : ndarray, shape (nt, nt)
+ Metric cost matrix representative of the structure in the target space.
+ p : ndarray, shape (ns,)
+ Distribution in the source space.
+ q : ndarray, shape (nt,)
+ Distribution in the target space.
+ loss_fun : str, optional
+ Loss function used for the solver.
+ max_iter : int, optional
+ Max number of iterations
+ tol : float, optional
+ Stop threshold on error (>0)
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ Record log if True.
+ armijo : bool, optional
+ If True the step of the line-search is found via an Armijo line-search.
+ Else the closed form is used. If there are convergence issues use False.
+ **kwargs : dict
+ Parameters can be directly passed to the ot.optim.cg solver.
+
+ Returns
+ -------
+ gamma : ndarray, shape (ns, nt)
+ Optimal transportation matrix for the given parameters.
+ log : dict
+ Log dictionary return only if log==True in parameters.
+
+ References
+ ----------
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ and Courty Nicolas
+ "Optimal Transport for structured data with application on graphs"
+ International Conference on Machine Learning (ICML). 2019.
+ """
+
+ constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+
+ G0 = p[:, None] * q[None, :]
+
+ def f(G):
+ return gwloss(constC, hC1, hC2, G)
+
+ def df(G):
+ return gwggrad(constC, hC1, hC2, G)
+
+ res, log_fgw = cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+ log_fgw['fgw_dist'] = log_fgw['loss'][::-1][0]
+ log_fgw['T'] = res
+ if log:
+ return log_fgw['fgw_dist'], log_fgw
+ else:
+ return log_fgw['fgw_dist']
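
Usage mirrors gromov_wasserstein2; a sketch reusing M, C1, C2, p, q from the previous example (and assuming the log-variable fix above, since the original body shadowed the log flag with the returned dict):

    fgw_dist, log = ot.gromov.fused_gromov_wasserstein2(M, C1, C2, p, q,
                                                        'square_loss', alpha=0.5,
                                                        log=True)
    # log['T'] is the optimal coupling behind the returned distance
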
def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
@@ -437,56 +592,55 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
The function solves the following optimization problem:
.. math::
- \GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
- s.t. \GW 1 = p
+ s.t. T 1 = p
- \GW^T 1= q
+ T^T 1= q
- \GW\geq 0
+ T\geq 0
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+ - H : entropy
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
+ Metric cost matrix in the target space
p : ndarray, shape (ns,)
- distribution in the source space
+ Distribution in the source space
q : ndarray, shape (nt,)
- distribution in the target space
+ Distribution in the target space
loss_fun : string
- loss function used for the solver either 'square_loss' or 'kl_loss'
+ Loss function used for the solver either 'square_loss' or 'kl_loss'
epsilon : float
Regularization term >0
max_iter : int, optional
- Max number of iterations
+ Max number of iterations
tol : float, optional
Stop threshold on error (>0)
verbose : bool, optional
Print information along iterations
log : bool, optional
- record log if True
+ Record log if True.
Returns
-------
T : ndarray, shape (ns, nt)
- coupling between the two spaces that minimizes :
- \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ Optimal coupling between the two spaces
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
@@ -495,7 +649,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
T = np.outer(p, q) # Initialization
- constC, hC1, hC2 = init_matrix(C1, C2, T, p, q, loss_fun)
+ constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
cpt = 0
err = 1
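
The loop body is elided by this hunk; per [12] it alternates a linearization of the GW loss with an entropic (Sinkhorn) projection. A hedged re-statement of that iteration using the helpers defined in this module, not the verbatim body:

    import numpy as np
    from ot.bregman import sinkhorn
    from ot.gromov import init_matrix, gwggrad, gwloss

    def entropic_gw_sketch(C1, C2, p, q, epsilon, n_iter=50):
        constC, hC1, hC2 = init_matrix(C1, C2, p, q, 'square_loss')
        T = np.outer(p, q)
        for _ in range(n_iter):
            tens = gwggrad(constC, hC1, hC2, T)  # gradient of the GW loss at T
            T = sinkhorn(p, q, tens, epsilon)    # entropic projection step
        return T, gwloss(constC, hC1, hC2, T)
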
@@ -545,28 +699,28 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
The function solves the following optimization problem:
.. math::
- \GW_Dist = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
Where :
- C1 : Metric cost matrix in the source space
- C2 : Metric cost matrix in the target space
- p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- H : entropy
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+ - H : entropy
Parameters
----------
C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
+ Metric cost matrix in the source space
C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
+ Metric cost matrix in the target space
p : ndarray, shape (ns,)
- distribution in the source space
+ Distribution in the source space
q : ndarray, shape (nt,)
- distribution in the target space
- loss_fun : string
- loss function used for the solver either 'square_loss' or 'kl_loss'
+ Distribution in the target space
+ loss_fun : str
+ Loss function used for the solver either 'square_loss' or 'kl_loss'
epsilon : float
Regularization term >0
max_iter : int, optional
@@ -576,7 +730,7 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
verbose : bool, optional
Print information along iterations
log : bool, optional
- record log if True
+ Record log if True.
Returns
-------
@@ -586,11 +740,10 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
-
gw, logv = entropic_gromov_wasserstein(
C1, C2, p, q, loss_fun, epsilon, max_iter, tol, verbose, log=True)
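
A usage sketch (epsilon must be tuned to the scale of the normalized cost matrices; 5e-2 is an arbitrary choice):

    import numpy as np
    import ot

    xs, xt = np.random.randn(25, 2), np.random.randn(30, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1 /= C1.max(); C2 /= C2.max()
    p, q = ot.unif(25), ot.unif(30)

    T = ot.gromov.entropic_gromov_wasserstein(C1, C2, p, q, 'square_loss', epsilon=5e-2)
    gw = ot.gromov.entropic_gromov_wasserstein2(C1, C2, p, q, 'square_loss', epsilon=5e-2)
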
@@ -612,29 +765,31 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
The function solves the following optimization problem:
.. math::
- C = argmin_C\in R^{NxN} \sum_s \lambda_s GW(C,Cs,p,ps)
+ C = argmin_{C\in R^{NxN}} \sum_s \lambda_s GW(C,C_s,p,p_s)
Where :
- Cs : metric cost matrix
- ps : distribution
+ - :math:`C_s` : metric cost matrix
+ - :math:`p_s` : distribution
Parameters
----------
- N : Integer
- Size of the targeted barycenter
- Cs : list of S np.ndarray(ns,ns)
- Metric cost matrices
- ps : list of S np.ndarray(ns,)
- sample weights in the S spaces
- p : ndarray, shape(N,)
- weights in the targeted barycenter
+ N : int
+ Size of the targeted barycenter
+ Cs : list of S np.ndarray of shape (ns,ns)
+ Metric cost matrices
+ ps : list of S np.ndarray of shape (ns,)
+ Sample weights in the S spaces
+ p : ndarray, shape(N,)
+ Weights in the targeted barycenter
lambdas : list of float
- list of the S spaces' weights
- loss_fun : tensor-matrix multiplication function based on specific loss function
- update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
- with the S Ts couplings calculated at each iteration
+ List of the S spaces' weights.
+ loss_fun : callable
+ Tensor-matrix multiplication function based on specific loss function.
+ update : callable
+ function(p,lambdas,T,Cs) that updates C according to a specific Kernel
+ with the S Ts couplings calculated at each iteration
epsilon : float
Regularization term >0
max_iter : int, optional
@@ -642,11 +797,11 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
tol : float, optional
Stop threshold on error (>0)
verbose : bool, optional
- Print information along iterations
+ Print information along iterations.
log : bool, optional
- record log if True
- init_C : bool, ndarray, shape(N,N)
- random initial value for the C matrix provided by user
+ Record log if True.
+ init_C : bool | ndarray, shape (N, N)
+ Initial value for the C matrix provided by the user; a random value is used if not set.
Returns
-------
@@ -656,9 +811,8 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
S = len(Cs)
@@ -668,6 +822,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
# Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
+ # XXX use random state
xalea = np.random.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
@@ -679,7 +834,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
error = []
- while(err > tol and cpt < max_iter):
+ while (err > tol) and (cpt < max_iter):
Cprev = C
T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon,
@@ -723,37 +878,36 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
.. math::
C = argmin_{C\in R^{NxN}} \sum_s \lambda_s GW(C,C_s,p,p_s)
-
Where :
- Cs : metric cost matrix
- ps : distribution
+ - Cs : metric cost matrix
+ - ps : distribution
Parameters
----------
- N : Integer
- Size of the targeted barycenter
- Cs : list of S np.ndarray(ns,ns)
- Metric cost matrices
- ps : list of S np.ndarray(ns,)
- sample weights in the S spaces
- p : ndarray, shape(N,)
- weights in the targeted barycenter
+ N : int
+ Size of the targeted barycenter
+ Cs : list of S np.ndarray of shape (ns, ns)
+ Metric cost matrices
+ ps : list of S np.ndarray of shape (ns,)
+ Sample weights in the S spaces
+ p : ndarray, shape (N,)
+ Weights in the targeted barycenter
lambdas : list of float
- list of the S spaces' weights
+ List of the S spaces' weights
loss_fun : tensor-matrix multiplication function based on specific loss function
update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
with the S Ts couplings calculated at each iteration
max_iter : int, optional
Max number of iterations
tol : float, optional
- Stop threshol on error (>0)
+ Stop threshold on error (>0).
verbose : bool, optional
- Print information along iterations
+ Print information along iterations.
log : bool, optional
- record log if True
- init_C : bool, ndarray, shape(N,N)
- random initial value for the C matrix provided by user
+ Record log if True.
+ init_C : bool | ndarray, shape (N, N)
+ Initial value for the C matrix provided by the user; a random value is used if not set.
Returns
-------
@@ -763,11 +917,10 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
References
----------
.. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
"""
-
S = len(Cs)
Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
@@ -775,6 +928,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
# Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
+ # XXX : should use a random state and not use the global seed
xalea = np.random.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
@@ -815,3 +969,209 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
cpt += 1
return C
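
A usage sketch for the barycenter solver above; three random input spaces with arbitrary sizes and weights:

    import numpy as np
    import ot

    Cs, ps = [], []
    for n in (10, 12, 14):
        x = np.random.randn(n, 2)
        C = ot.dist(x, x)
        Cs.append(C / C.max())
        ps.append(ot.unif(n))

    N = 8  # size of the barycenter
    C_bar = ot.gromov.gromov_barycenters(N, Cs, ps, ot.unif(N), [1 / 3] * 3,
                                         'square_loss', max_iter=100, tol=1e-5)
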
+
+
+def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False,
+ p=None, loss_fun='square_loss', max_iter=100, tol=1e-9,
+ verbose=False, log=False, init_C=None, init_X=None):
+ """Compute the fgw barycenter as presented eq (5) in [24].
+
+ Parameters
+ ----------
+ N : int
+ Desired number of samples of the target barycenter
+ Ys: list of ndarray, each element has shape (ns,d)
+ Features of all samples
+ Cs : list of ndarray, each element has shape (ns,ns)
+ Structure matrices of all samples
+ ps : list of ndarray, each element has shape (ns,)
+ Masses of all samples.
+ lambdas : list of float
+ List of the S spaces' weights
+ alpha : float
+ Alpha parameter for the fgw distance
+ fixed_structure : bool
+ Whether to fix the structure of the barycenter during the updates
+ fixed_features : bool
+ Whether to fix the feature of the barycenter during the updates
+ init_C : ndarray, shape (N,N), optional
+ Initialization for the barycenters' structure matrix. If not set
+ a random init is used.
+ init_X : ndarray, shape (N,d), optional
+ Initialization for the barycenters' features. If not set a
+ random init is used.
+
+ Returns
+ -------
+ X : ndarray, shape (N, d)
+ Barycenters' features
+ C : ndarray, shape (N, N)
+ Barycenters' structure matrix
+ log_: dict
+ Only returned when log=True. It contains the keys:
+ T : list of (N, ns) transport matrices
+ Ms : all distance matrices between the features of the barycenter and
+ the features of the samples, i.e. dist(X, Ys), each of shape (N, ns)
+
+ References
+ ----------
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
+ and Courty Nicolas
+ "Optimal Transport for structured data with application on graphs"
+ International Conference on Machine Learning (ICML). 2019.
+ """
+ S = len(Cs)
+ d = Ys[0].shape[1] # dimension of the node features
+ if p is None:
+ p = np.ones(N) / N
+
+ Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
+ Ys = [np.asarray(Ys[s], dtype=np.float64) for s in range(S)]
+
+ lambdas = np.asarray(lambdas, dtype=np.float64)
+
+ if fixed_structure:
+ if init_C is None:
+ raise UndefinedParameter('If C is fixed it must be initialized')
+ else:
+ C = init_C
+ else:
+ if init_C is None:
+ xalea = np.random.randn(N, 2)
+ C = dist(xalea, xalea)
+ else:
+ C = init_C
+
+ if fixed_features:
+ if init_X is None:
+ raise UndefinedParameter('If X is fixed it must be initialized')
+ else:
+ X = init_X
+ else:
+ if init_X is None:
+ X = np.zeros((N, d))
+ else:
+ X = init_X
+
+ T = [np.outer(p, q) for q in ps]
+
+ Ms = [np.asarray(dist(X, Ys[s]), dtype=np.float64) for s in range(len(Ys))] # Ms is N,ns
+
+ cpt = 0
+ err_feature = 1
+ err_structure = 1
+
+ if log:
+ log_ = {}
+ log_['err_feature'] = []
+ log_['err_structure'] = []
+ log_['Ts_iter'] = []
+
+ while((err_feature > tol or err_structure > tol) and cpt < max_iter):
+ Cprev = C
+ Xprev = X
+
+ if not fixed_features:
+ Ys_temp = [y.T for y in Ys]
+ X = update_feature_matrix(lambdas, Ys_temp, T, p).T
+
+ Ms = [np.asarray(dist(X, Ys[s]), dtype=np.float64) for s in range(len(Ys))]
+
+ if not fixed_structure:
+ if loss_fun == 'square_loss':
+ T_temp = [t.T for t in T]
+ C = update_sructure_matrix(p, lambdas, T_temp, Cs)
+
+ T = [fused_gromov_wasserstein((1 - alpha) * Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
+ numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
+
+ # T is N,ns
+ err_feature = np.linalg.norm(X - Xprev.reshape(N, d))
+ err_structure = np.linalg.norm(C - Cprev)
+
+ if log:
+ log_['err_feature'].append(err_feature)
+ log_['err_structure'].append(err_structure)
+ log_['Ts_iter'].append(T)
+
+ if verbose:
+ if cpt % 200 == 0:
+ print('{:5s}|{:12s}'.format(
+ 'It.', 'Err') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, err_structure))
+ print('{:5d}|{:8e}|'.format(cpt, err_feature))
+
+ cpt += 1
+
+ if log:
+ log_['T'] = T # from target to Ys
+ log_['p'] = p
+ log_['Ms'] = Ms
+
+ if log:
+ return X, C, log_
+ else:
+ return X, C
+
+
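A minimal usage sketch of the new fgw_barycenters on toy data (shapes follow the docstring above; parameter values are illustrative only):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    sizes = [4, 5]
    Ys = [rng.randn(n, 2) for n in sizes]           # node features, d=2
    Cs = [ot.dist(rng.randn(n, 2)) for n in sizes]  # structure matrices
    ps = [ot.unif(n) for n in sizes]                # node weights
    X, C = ot.gromov.fgw_barycenters(3, Ys, Cs, ps, lambdas=[0.5, 0.5],
                                     alpha=0.5, max_iter=50, tol=1e-5)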
+def update_sructure_matrix(p, lambdas, T, Cs):
+ """Updates C according to the L2 Loss kernel with the S Ts couplings.
+
+ It is calculated at each iteration
+
+ Parameters
+ ----------
+ p : ndarray, shape (N,)
+ Masses in the targeted barycenter.
+ lambdas : list of float
+ List of the S spaces' weights.
+ T : list of S ndarray of shape (ns, N)
+ The S Ts couplings calculated at each iteration.
+ Cs : list of S ndarray, shape (ns, ns)
+ Metric cost matrices.
+
+ Returns
+ -------
+ C : ndarray, shape (N, N)
+ Updated C matrix.
+ """
+ tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s]) for s in range(len(T))])
+ ppt = np.outer(p, p)
+
+ return np.divide(tmpsum, ppt)
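The update above is the closed-form minimizer for the square loss; the same computation written out with explicit toy shapes (two couplings of shape (ns, N); values are illustrative only):

    import numpy as np

    p = np.ones(3) / 3                                # barycenter masses, N=3
    T = [np.ones((4, 3)) / 12, np.ones((5, 3)) / 15]  # couplings, shape (ns, N)
    Cs = [np.ones((4, 4)), np.ones((5, 5))]           # input structure matrices
    lambdas = [0.5, 0.5]

    tmpsum = sum(lambdas[s] * T[s].T.dot(Cs[s]).dot(T[s]) for s in range(2))
    C = tmpsum / np.outer(p, p)                       # barycenter structure, (N, N)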
+
+
+def update_feature_matrix(lambdas, Ys, Ts, p):
+ """Updates the feature with respect to the S Ts couplings.
+
+
+ See "Solving the barycenter problem with Block Coordinate Descent (BCD)"
+ in [24] calculated at each iteration
+
+ Parameters
+ ----------
+ lambdas : list of float
+ List of the S spaces' weights
+ Ys : list of S ndarray, shape (d, ns)
+ The features.
+ Ts : list of S ndarray, shape (ns, N)
+ The S Ts couplings calculated at each iteration.
+ p : ndarray, shape (N,)
+ Masses in the targeted barycenter.
+
+ Returns
+ -------
+ X : ndarray, shape (d, N)
+ Updated feature matrix.
+
+ References
+ ----------
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
+ and Courty Nicolas
+ "Optimal Transport for structured data with application on graphs"
+ International Conference on Machine Learning (ICML). 2019.
+ """
+ p = np.array(1. / p).reshape(-1,)
+
+ tmpsum = sum([lambdas[s] * np.dot(Ys[s], Ts[s].T) * p[None, :] for s in range(len(Ts))])
+
+ return tmpsum
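Before the C/C++ changes below, the Python-facing effect is worth keeping in mind: with the new dense flag, ot.emd can return the transport plan as a scipy coo_matrix instead of a dense array (a minimal sketch, grounded in the new emd signature shown further down):

    import numpy as np
    import ot

    a = np.array([0.5, 0.5])
    b = np.array([0.5, 0.5])
    M = np.array([[0., 1.], [1., 0.]])

    G_dense = ot.emd(a, b, M)                # ndarray, shape (2, 2)
    G_sparse = ot.emd(a, b, M, dense=False)  # scipy.sparse.coo_matrix
    assert np.allclose(G_dense, G_sparse.toarray())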
diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index f42e222..2adaace 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -32,4 +32,9 @@ enum ProblemType { int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter); +int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, + long *iG, long *jG, double *G, long * nG, + double* alpha, double* beta, double *cost, int maxIter); + + #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index fc7ca63..28e4af2 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -17,13 +17,13 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) { -// beware M and C anre strored in row major C style!!! - int n, m, i, cur; + // beware M and C anre strored in row major C style!!! + int n, m, i, cur; typedef FullBipartiteDigraph Digraph; - DIGRAPH_TYPEDEFS(FullBipartiteDigraph); + DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - // Get the number of non zero coordinates for r and c + // Get the number of non zero coordinates for r and c n=0; for (int i=0; i<n1; i++) { double val=*(X+i); @@ -105,3 +105,186 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, return ret; } + + +int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, + long *iG, long *jG, double *G, long * nG, + double* alpha, double* beta, double *cost, int maxIter) { + // beware M and C anre strored in row major C style!!! + + // Get the number of non zero coordinates for r and c and vectors + int n, m, i, cur; + + typedef FullBipartiteDigraph Digraph; + DIGRAPH_TYPEDEFS(FullBipartiteDigraph); + + // Get the number of non zero coordinates for r and c + n=0; + for (int i=0; i<n1; i++) { + double val=*(X+i); + if (val>0) { + n++; + }else if(val<0){ + return INFEASIBLE; + } + } + m=0; + for (int i=0; i<n2; i++) { + double val=*(Y+i); + if (val>0) { + m++; + }else if(val<0){ + return INFEASIBLE; + } + } + + // Define the graph + + std::vector<int> indI(n), indJ(m); + std::vector<double> weights1(n), weights2(m); + Digraph di(n, m); + NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, maxIter); + + // Set supply and demand, don't account for 0 values (faster) + + cur=0; + for (int i=0; i<n1; i++) { + double val=*(X+i); + if (val>0) { + weights1[ cur ] = val; + indI[cur++]=i; + } + } + + // Demand is actually negative supply... 
+ + cur=0; + for (int i=0; i<n2; i++) { + double val=*(Y+i); + if (val>0) { + weights2[ cur ] = -val; + indJ[cur++]=i; + } + } + + // Define the graph + net.supplyMap(&weights1[0], n, &weights2[0], m); + + // Set the cost of each edge + for (int i=0; i<n; i++) { + for (int j=0; j<m; j++) { + double val=*(D+indI[i]*n2+indJ[j]); + net.setCost(di.arcFromId(i*m+j), val); + } + } + + + // Solve the problem with the network simplex algorithm + + int ret=net.run(); + if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) { + *cost = 0; + Arc a; di.first(a); + cur=0; + for (; a != INVALID; di.next(a)) { + int i = di.source(a); + int j = di.target(a); + double flow = net.flow(a); + if (flow>0) + { + *cost += flow * (*(D+indI[i]*n2+indJ[j-n])); + + *(G+cur) = flow; + *(iG+cur) = indI[i]; + *(jG+cur) = indJ[j-n]; + *(alpha + indI[i]) = -net.potential(i); + *(beta + indJ[j-n]) = net.potential(j); + cur++; + } + } + *nG=cur; // nb of value +1 for numpy indexing + + } + + + return ret; +} + +int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, + long *iD, long *jD, double *D, long nD, + long *iG, long *jG, double *G, long * nG, + double* alpha, double* beta, double *cost, int maxIter) { + // beware M and C anre strored in row major C style!!! + + // Get the number of non zero coordinates for r and c and vectors + int n, m, cur; + + typedef FullBipartiteDigraph Digraph; + DIGRAPH_TYPEDEFS(FullBipartiteDigraph); + + n=n1; + m=n2; + + + // Define the graph + + + std::vector<double> weights2(m); + Digraph di(n, m); + NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, maxIter); + + // Set supply and demand, don't account for 0 values (faster) + + + // Demand is actually negative supply... + + cur=0; + for (int i=0; i<n2; i++) { + double val=*(Y+i); + if (val>0) { + weights2[ cur ] = -val; + } + } + + // Define the graph + net.supplyMap(X, n, &weights2[0], m); + + // Set the cost of each edge + for (int k=0; k<nD; k++) { + int i = iD[k]; + int j = jD[k]; + net.setCost(di.arcFromId(i*m+j), D[k]); + + } + + + // Solve the problem with the network simplex algorithm + + int ret=net.run(); + if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) { + *cost = net.totalCost(); + Arc a; di.first(a); + cur=0; + for (; a != INVALID; di.next(a)) { + int i = di.source(a); + int j = di.target(a); + double flow = net.flow(a); + if (flow>0) + { + + *(G+cur) = flow; + *(iG+cur) = i; + *(jG+cur) = j-n; + *(alpha + i) = -net.potential(i); + *(beta + j-n) = net.potential(j); + cur++; + } + } + *nG=cur; // nb of value +1 for numpy indexing + + } + + + return ret; +} + diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 02cbd8c..cdd505d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- """ Solvers for the original linear program OT problem + + + """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -8,22 +11,171 @@ Solvers for the original linear program OT problem # License: MIT License import multiprocessing - +import sys import numpy as np +from scipy.sparse import coo_matrix from .import cvx # import compiled emd -from .emd_wrap import emd_c, check_result +from .emd_wrap import emd_c, check_result, emd_1d_sorted from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx'] +__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', + 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False): - 
"""Solves the Earth Movers distance problem and returns the OT matrix +def center_ot_dual(alpha0, beta0, a=None, b=None): + r"""Center dual OT potentials w.r.t. theirs weights + + The main idea of this function is to find unique dual potentials + that ensure some kind of centering/fairness. The main idea is to find dual potentials that lead to the same final objective value for both source and targets (see below for more details). It will help having + stability when multiple calling of the OT solver with small changes. + + Basically we add another constraint to the potential that will not + change the objective value but will ensure unicity. The constraint + is the following: + + .. math:: + \alpha^T a= \beta^T b + + in addition to the OT problem constraints. + + since :math:`\sum_i a_i=\sum_j b_j` this can be solved by adding/removing + a constant from both :math:`\alpha_0` and :math:`\beta_0`. + + .. math:: + c=\frac{\beta0^T b-\alpha_0^T a}{1^Tb+1^Ta} + + \alpha=\alpha_0+c + + \beta=\beta0+c + + Parameters + ---------- + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + + Returns + ------- + alpha : (ns,) numpy.ndarray, float64 + Source centered dual potential + beta : (nt,) numpy.ndarray, float64 + Target centered dual potential + + """ + # if no weights are provided, use uniform + if a is None: + a = np.ones(alpha0.shape[0]) / alpha0.shape[0] + if b is None: + b = np.ones(beta0.shape[0]) / beta0.shape[0] + + # compute constant that balances the weighted sums of the duals + c = (b.dot(beta0) - a.dot(alpha0)) / (a.sum() + b.sum()) + + # update duals + alpha = alpha0 + c + beta = beta0 - c + + return alpha, beta + + +def estimate_dual_null_weights(alpha0, beta0, a, b, M): + r"""Estimate feasible values for 0-weighted dual potentials + + The feasible values are computed efficiently but rather coarsely. + + .. warning:: + This function is necessary because the C++ solver in emd_c + discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + + First we compute the constraints violations: + + .. math:: + V=\alpha+\beta^T-M + + Next we compute the max amount of violation per row (alpha) and + columns (beta) + + .. math:: + v^a_i=\max_j V_{i,j} + + v^b_j=\max_i V_{i,j} + + Finally we update the dual potential with 0 weights if a + constraint is violated + + .. math:: + \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0 + + \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 + + In the end the dual potentials are centered using function + :ref:`center_ot_dual`. + + Note that all those updates do not change the objective value of the + solution but provide dual potentials that do not violate the constraints. 
+ + Parameters + ---------- + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + a : (ns,) numpy.ndarray, float64 + Source distribution (uniform weights if empty list) + b : (nt,) numpy.ndarray, float64 + Target distribution (uniform weights if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) + + Returns + ------- + alpha : (ns,) numpy.ndarray, float64 + Source corrected dual potential + beta : (nt,) numpy.ndarray, float64 + Target corrected dual potential + + """ + + # binary indexing of non-zeros weights + asel = a != 0 + bsel = b != 0 + + # compute dual constraints violation + constraint_violation = alpha0[:, None] + beta0[None, :] - M + + # Compute largest violation per line and columns + aviol = np.max(constraint_violation, 1) + bviol = np.max(constraint_violation, 0) + + # update corrects violation of + alpha_up = -1 * ~asel * np.maximum(aviol, 0) + beta_up = -1 * ~bsel * np.maximum(bviol, 0) + + alpha = alpha0 + alpha_up + beta = beta0 + beta_up + + return center_ot_dual(alpha, beta, a, b) + + +def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): + r"""Solves the Earth Movers distance problem and returns the OT matrix .. math:: @@ -37,26 +189,37 @@ def emd(a, b, M, numItermax=100000, log=False): - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order numpy.array in float64 + format. + Uses the algorithm proposed in [1]_ Parameters ---------- - a : (ns,) ndarray, float64 - Source histogram (uniform weigth if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weigth if empty list) - M : (ns,nt) ndarray, float64 - loss matrix + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) numItermax : int, optional (default=100000) The maximum number of iterations before stopping the optimization algorithm if it has not converged. - log: boolean, optional (default=False) + log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. Returns ------- - gamma: (ns x nt) ndarray + gamma: (ns x nt) numpy.ndarray Optimal transportation matrix for the given parameters log: dict If input log is true, a dictionary containing the cost and dual @@ -74,8 +237,8 @@ def emd(a, b, M, numItermax=100000, log=False): >>> b=[.5,.5] >>> M=[[0.,1.],[1.,0.]] >>> ot.emd(a,b,M) - array([[ 0.5, 0. ], - [ 0. , 0.5]]) + array([[0.5, 0. ], + [0. 
, 0.5]]) References ---------- @@ -94,13 +257,37 @@ def emd(a, b, M, numItermax=100000, log=False): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use unifor distributions + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + + asel = a != 0 + bsel = b != 0 + + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + + else: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + result_code_string = check_result(result_code) if log: log = {} @@ -114,8 +301,9 @@ def emd(a, b, M, numItermax=100000, log=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, return_matrix=False): - """Solves the Earth Movers distance problem and returns the loss + numItermax=100000, log=False, dense=True, return_matrix=False, + center_dual=True): + r"""Solves the Earth Movers distance problem and returns the loss .. math:: \gamma = arg\min_\gamma <\gamma,M>_F @@ -128,16 +316,22 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order numpy.array in float64 + format. + Uses the algorithm proposed in [1]_ Parameters ---------- - a : (ns,) ndarray, float64 - Source histogram (uniform weigth if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weigth if empty list) - M : (ns,nt) ndarray, float64 - loss matrix + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) + processes : int, optional (default=nb cpu) + Nb of processes used for multiple emd computation (not used on windows) numItermax : int, optional (default=100000) The maximum number of iterations before stopping the optimization algorithm if it has not converged. @@ -146,12 +340,19 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), variables. Otherwise returns only the optimal transportation cost. return_matrix: boolean, optional (default=False) If True, returns the optimal transportation matrix in the log. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. 
Returns ------- gamma: (ns x nt) ndarray Optimal transportation matrix for the given parameters - log: dict + log: dictnp If input log is true, a dictionary containing the cost and dual variables and exit status @@ -187,27 +388,61 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use unifor distributions + # problem with pikling Forks + if sys.platform.endswith('win32'): + processes = 1 + + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + + asel = a != 0 + if log or return_matrix: def f(b): - G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) - result_code_string = check_result(resultCode) + bsel = b != 0 + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + else: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + + result_code_string = check_result(result_code) log = {} if return_matrix: log['G'] = G log['u'] = u log['v'] = v log['warning'] = result_code_string - log['result_code'] = resultCode + log['result_code'] = result_code return [cost, log] else: def f(b): - G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + bsel = b != 0 + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + else: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + + result_code_string = check_result(result_code) check_result(result_code) return cost @@ -215,9 +450,12 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return f(b) nb = b.shape[1] - res = parmap(f, [b[:, i] for i in range(nb)], processes) - return res + if processes > 1: + res = parmap(f, [b[:, i] for i in range(nb)], processes) + else: + res = list(map(f, [b[:, i].copy() for i in range(nb)])) + return res def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None): @@ -231,9 +469,9 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None Parameters ---------- - measures_locations : list of (k_i,d) np.ndarray + measures_locations : list of (k_i,d) numpy.ndarray The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list) - measures_weights : list of (k_i,) np.ndarray + measures_weights : list of (k_i,) numpy.ndarray Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure X_init : (k,d) np.ndarray @@ -246,7 +484,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None numItermax : int, optional Max number of iterations stopThr : float, optional - Stop threshol on error (>0) + Stop threshold on error (>0) verbose : bool, optional Print 
information along iterations log : bool, optional @@ -272,7 +510,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None k = X_init.shape[0] d = X_init.shape[1] if b is None: - b = np.ones((k,))/k + b = np.ones((k,)) / k if weights is None: weights = np.ones((N,)) / N @@ -283,7 +521,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None displacement_square_norm = stopThr + 1. - while ( displacement_square_norm > stopThr and iter_count < numItermax ): + while (displacement_square_norm > stopThr and iter_count < numItermax): T_sum = np.zeros((k, d)) @@ -293,7 +531,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None T_i = emd(b, measure_weights_i, M_i) T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i) - displacement_square_norm = np.sum(np.square(T_sum-X)) + displacement_square_norm = np.sum(np.square(T_sum - X)) if log: displacement_square_norms.append(displacement_square_norm) @@ -308,4 +546,288 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None log_dict['displacement_square_norms'] = displacement_square_norms return X, log_dict else: - return X
\ No newline at end of file + return X + + +def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, + log=False): + r"""Solves the Earth Movers distance problem between 1d measures and returns + the OT matrix + + + .. math:: + \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - d is the metric + - x_a and x_b are the samples + - a and b are the sample weights + + When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`. + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + log: boolean, optional (default=False) + If True, returns a dictionary containing the cost. + Otherwise returns only the optimal transportation matrix. + + Returns + ------- + gamma: (ns, nt) ndarray + Optimal transportation matrix for the given parameters + log: dict + If input log is True, a dictionary containing the cost + + + Examples + -------- + + Simple example with obvious solution. The function emd_1d accepts lists and + performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.emd_1d(x_a, x_b, a, b) + array([[0. , 0.5], + [0.5, 0. ]]) + >>> ot.emd_1d(x_a, x_b) + array([[0. , 0.5], + [0.5, 0. ]]) + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. 
+ + See Also + -------- + ot.lp.emd : EMD for multidimensional distributions + ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the + transportation matrix) + """ + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + x_a = np.asarray(x_a, dtype=np.float64) + x_b = np.asarray(x_b, dtype=np.float64) + + assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \ + "emd_1d should only be used with monodimensional data" + assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \ + "emd_1d should only be used with monodimensional data" + + # if empty array given then use uniform distributions + if a.ndim == 0 or len(a) == 0: + a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0] + if b.ndim == 0 or len(b) == 0: + b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] + + x_a_1d = x_a.reshape((-1, )) + x_b_1d = x_b.reshape((-1, )) + perm_a = np.argsort(x_a_1d) + perm_b = np.argsort(x_b_1d) + + G_sorted, indices, cost = emd_1d_sorted(a, b, + x_a_1d[perm_a], x_b_1d[perm_b], + metric=metric, p=p) + G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), + shape=(a.shape[0], b.shape[0])) + if dense: + G = G.toarray() + if log: + log = {'cost': cost} + return G, log + return G + + +def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, + log=False): + r"""Solves the Earth Movers distance problem between 1d measures and returns + the loss + + + .. math:: + \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - d is the metric + - x_a and x_b are the samples + - a and b are the sample weights + + When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`. + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Only used if log is set to True. Due to implementation details, + this function runs faster when dense is set to False. + log: boolean, optional (default=False) + If True, returns a dictionary containing the transportation matrix. + Otherwise returns only the loss. + + Returns + ------- + loss: float + Cost associated to the optimal transportation + log: dict + If input log is True, a dictionary containing the Optimal transportation + matrix for the given parameters + + + Examples + -------- + + Simple example with obvious solution. The function emd2_1d accepts lists and + performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] 
+ >>> ot.emd2_1d(x_a, x_b, a, b) + 0.5 + >>> ot.emd2_1d(x_a, x_b) + 0.5 + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd2 : EMD for multidimensional distributions + ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix + instead of the cost) + """ + # If we do not return G (log==False), then we should not to cast it to dense + # (useless overhead) + G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p, + dense=dense and log, log=True) + cost = log_emd['cost'] + if log: + log_emd = {'G': G} + return cost, log_emd + return cost + + +def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.): + r"""Solves the p-Wasserstein distance problem between 1d measures and returns + the distance + + .. math:: + \min_\gamma \left( \sum_i \sum_j \gamma_{ij} \|x_a[i] - x_b[j]\|^p \right)^{1/p} + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + + where : + + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) + p: float, optional (default=1.0) + The order of the p-Wasserstein distance to be computed + + Returns + ------- + dist: float + p-Wasserstein distance + + + Examples + -------- + + Simple example with obvious solution. The function wasserstein_1d accepts + lists and performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.wasserstein_1d(x_a, x_b, a, b) + 0.5 + >>> ot.wasserstein_1d(x_a, x_b) + 0.5 + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd_1d : EMD for 1d distributions + """ + cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=False, log=False) + return np.power(cost_emd, 1. 
/ p) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 83ee6aa..d345fd4 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -10,13 +10,19 @@ Cython linker with C solver import numpy as np cimport numpy as np +from ..utils import dist + cimport cython +cimport libc.math as math import warnings cdef extern from "EMD.h": int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) + int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, + long *iG, long *jG, double *G, long * nG, + double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED @@ -34,9 +40,11 @@ def check_result(result_code): return message + + @cython.boundscheck(False) @cython.wraparound(False) -def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter): +def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense): """ Solves the Earth Movers distance problem and returns the optimal transport matrix @@ -55,33 +63,50 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order :py.cls:`numpy.array` + + .. warning:: + The C++ solver discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + Parameters ---------- - a : (ns,) ndarray, float64 + a : (ns,) numpy.ndarray, float64 source histogram - b : (nt,) ndarray, float64 + b : (nt,) numpy.ndarray, float64 target histogram - M : (ns,nt) ndarray, float64 + M : (ns,nt) numpy.ndarray, float64 loss matrix max_iter : int The maximum number of iterations before stopping the optimization algorithm if it has not converged. 
- + dense : bool + Return a sparse transport matrix if set to False Returns ------- - gamma: (ns x nt) ndarray + gamma: (ns x nt) numpy.ndarray Optimal transportation matrix for the given parameters """ cdef int n1= M.shape[0] cdef int n2= M.shape[1] + cdef int nmax=n1+n2-1 + cdef int result_code = 0 + cdef int nG=0 cdef double cost=0 - cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([n1, n2]) cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1) cdef np.ndarray[double, ndim=1, mode="c"] beta=np.zeros(n2) + cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([0, 0]) + + cdef np.ndarray[double, ndim=1, mode="c"] Gv=np.zeros(0) + cdef np.ndarray[long, ndim=1, mode="c"] iG=np.zeros(0,dtype=np.int) + cdef np.ndarray[long, ndim=1, mode="c"] jG=np.zeros(0,dtype=np.int) if not len(a): a=np.ones((n1,))/n1 @@ -89,7 +114,112 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 - # calling the function - cdef int result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter) + if dense: + # init OT matrix + G=np.zeros([n1, n2]) + + # calling the function + result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter) + + return G, cost, alpha, beta, result_code + + + else: + + # init sparse OT matrix + Gv=np.zeros(nmax) + iG=np.zeros(nmax,dtype=np.int) + jG=np.zeros(nmax,dtype=np.int) + + + result_code = EMD_wrap_return_sparse(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <long*> iG.data, <long*> jG.data, <double*> Gv.data, <long*> &nG, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter) - return G, cost, alpha, beta, result_code + + return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code + + + +@cython.boundscheck(False) +@cython.wraparound(False) +def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, + np.ndarray[double, ndim=1, mode="c"] v_weights, + np.ndarray[double, ndim=1, mode="c"] u, + np.ndarray[double, ndim=1, mode="c"] v, + str metric='sqeuclidean', + double p=1.): + r""" + Solves the Earth Movers distance problem between sorted 1d measures and + returns the OT matrix and the associated cost + + Parameters + ---------- + u_weights : (ns,) ndarray, float64 + Source histogram + v_weights : (nt,) ndarray, float64 + Target histogram + u : (ns,) ndarray, float64 + Source dirac locations (on the real line) + v : (nt,) ndarray, float64 + Target dirac locations (on the real line) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' + + Returns + ------- + gamma: (n, ) ndarray, float64 + Values in the Optimal transportation matrix + indices: (n, 2) ndarray, int64 + Indices of the values stored in gamma for the Optimal transportation + matrix + cost + cost associated to the optimal transportation + """ + cdef double cost = 0. + cdef int n = u_weights.shape[0] + cdef int m = v_weights.shape[0] + + cdef int i = 0 + cdef double w_i = u_weights[0] + cdef int j = 0 + cdef double w_j = v_weights[0] + + cdef double m_ij = 0. 
+ + cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ), + dtype=np.float64) + cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2), + dtype=np.int) + cdef int cur_idx = 0 + while i < n and j < m: + if metric == 'sqeuclidean': + m_ij = (u[i] - v[j]) * (u[i] - v[j]) + elif metric == 'cityblock' or metric == 'euclidean': + m_ij = math.fabs(u[i] - v[j]) + elif metric == 'minkowski': + m_ij = math.pow(math.fabs(u[i] - v[j]), p) + else: + m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), + metric=metric)[0, 0] + if w_i < w_j or j == m - 1: + cost += m_ij * w_i + G[cur_idx] = w_i + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j + i += 1 + w_j -= w_i + w_i = u_weights[i] + else: + cost += m_ij * w_j + G[cur_idx] = w_j + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j + j += 1 + w_i -= w_j + w_j = v_weights[j] + cur_idx += 1 + return G[:cur_idx], indices[:cur_idx], cost diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h index 7c6a4ce..498e921 100644 --- a/ot/lp/network_simplex_simple.h +++ b/ot/lp/network_simplex_simple.h @@ -686,7 +686,7 @@ namespace lemon { /// \see resetParams(), reset() ProblemType run() { #if DEBUG_LVL>0 - std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED\n"; + std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED << "\n" ; #endif if (!init()) return INFEASIBLE; diff --git a/ot/optim.py b/ot/optim.py index f31fae2..4012e0d 100644 --- a/ot/optim.py +++ b/ot/optim.py @@ -4,6 +4,7 @@ Optimization algorithms for OT """ # Author: Remi Flamary <remi.flamary@unice.fr> +# Titouan Vayer <titouan.vayer@irisa.fr> # # License: MIT License @@ -25,14 +26,13 @@ def line_search_armijo(f, xk, pk, gfk, old_fval, Parameters ---------- - - f : function + f : callable loss function - xk : np.ndarray + xk : ndarray initial position - pk : np.ndarray + pk : ndarray descent direction - gfk : np.ndarray + gfk : ndarray gradient of f at xk old_fval : float loss value at xk @@ -72,8 +72,70 @@ def line_search_armijo(f, xk, pk, gfk, old_fval, return alpha, fc[0], phi1 -def cg(a, b, M, reg, f, df, G0=None, numItermax=200, - stopThr=1e-9, verbose=False, log=False): +def solve_linesearch(cost, G, deltaG, Mi, f_val, + armijo=True, C1=None, C2=None, reg=None, Gc=None, constC=None, M=None): + """ + Solve the linesearch in the FW iterations + Parameters + ---------- + cost : method + Cost in the FW for the linesearch + G : ndarray, shape(ns,nt) + The transport map at a given iteration of the FW + deltaG : ndarray (ns,nt) + Difference between the optimal map found by linearization in the FW algorithm and the value at a given iteration + Mi : ndarray (ns,nt) + Cost matrix of the linearized transport problem. Corresponds to the gradient of the cost + f_val : float + Value of the cost at G + armijo : bool, optional + If True the steps of the line-search is found via an armijo research. Else closed form is used. + If there is convergence issues use False. + C1 : ndarray (ns,ns), optional + Structure matrix in the source domain. Only used and necessary when armijo=False + C2 : ndarray (nt,nt), optional + Structure matrix in the target domain. Only used and necessary when armijo=False + reg : float, optional + Regularization parameter. 
Only used and necessary when armijo=False + Gc : ndarray (ns,nt) + Optimal map found by linearization in the FW algorithm. Only used and necessary when armijo=False + constC : ndarray (ns,nt) + Constant for the gromov cost. See [24]. Only used and necessary when armijo=False + M : ndarray (ns,nt), optional + Cost matrix between the features. Only used and necessary when armijo=False + Returns + ------- + alpha : float + The optimal step size of the FW + fc : int + nb of function call. Useless here + f_val : float + The value of the cost for the next iteration + References + ---------- + .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain + and Courty Nicolas + "Optimal Transport for structured data with application on graphs" + International Conference on Machine Learning (ICML). 2019. + """ + if armijo: + alpha, fc, f_val = line_search_armijo(cost, G, deltaG, Mi, f_val) + else: # requires symetric matrices + dot1 = np.dot(C1, deltaG) + dot12 = dot1.dot(C2) + a = -2 * reg * np.sum(dot12 * deltaG) + b = np.sum((M + reg * constC) * deltaG) - 2 * reg * (np.sum(dot12 * G) + np.sum(np.dot(C1, G).dot(C2) * deltaG)) + c = cost(G) + + alpha = solve_1d_linesearch_quad(a, b, c) + fc = None + f_val = cost(G + alpha * deltaG) + + return alpha, fc, f_val + + +def cg(a, b, M, reg, f, df, G0=None, numItermax=200, numItermaxEmd=100000, + stopThr=1e-9, stopThr2=1e-9, verbose=False, log=False, **kwargs): """ Solve the general regularized OT problem with conditional gradient @@ -98,24 +160,30 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarray, shape (nt,) samples in the target domain - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg : float Regularization term >0 - G0 : np.ndarray (ns,nt), optional + G0 : ndarray, shape (ns,nt), optional initial guess (default is indep joint density) numItermax : int, optional Max number of iterations + numItermaxEmd : int, optional + Max number of iterations for emd stopThr : float, optional - Stop threshol on error (>0) + Stop threshol on the relative variation (>0) + stopThr2 : float, optional + Stop threshol on the absolute variation (>0) verbose : bool, optional Print information along iterations log : bool, optional record log if True + **kwargs : dict + Parameters for linesearch Returns ------- @@ -157,9 +225,9 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, it = 0 if verbose: - print('{:5s}|{:12s}|{:8s}'.format( - 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32) - print('{:5d}|{:8e}|{:8e}'.format(it, f_val, 0)) + print('{:5s}|{:12s}|{:8s}|{:8s}'.format( + 'It.', 'Loss', 'Relative loss', 'Absolute loss') + '\n' + '-' * 48) + print('{:5d}|{:8e}|{:8e}|{:8e}'.format(it, f_val, 0, 0)) while loop: @@ -172,12 +240,12 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, Mi += Mi.min() # solve linear program - Gc = emd(a, b, Mi) + Gc = emd(a, b, Mi, numItermax=numItermaxEmd) deltaG = Gc - G # line search - alpha, fc, f_val = line_search_armijo(cost, G, deltaG, Mi, f_val) + alpha, fc, f_val = solve_linesearch(cost, G, deltaG, Mi, f_val, reg=reg, M=M, Gc=Gc, **kwargs) G = G + alpha * deltaG @@ -185,8 +253,9 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, if it >= numItermax: loop = 0 - delta_fval = (f_val - old_fval) / abs(f_val) - if abs(delta_fval) < stopThr: + abs_delta_fval = abs(f_val - old_fval) + relative_delta_fval = abs_delta_fval / abs(f_val) + if 
relative_delta_fval < stopThr or abs_delta_fval < stopThr2: loop = 0 if log: @@ -194,9 +263,9 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, if verbose: if it % 20 == 0: - print('{:5s}|{:12s}|{:8s}'.format( - 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32) - print('{:5d}|{:8e}|{:8e}'.format(it, f_val, delta_fval)) + print('{:5s}|{:12s}|{:8s}|{:8s}'.format( + 'It.', 'Loss', 'Relative loss', 'Absolute loss') + '\n' + '-' * 48) + print('{:5d}|{:8e}|{:8e}|{:8e}'.format(it, f_val, relative_delta_fval, abs_delta_fval)) if log: return G, log @@ -205,7 +274,7 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, - numInnerItermax=200, stopThr=1e-9, verbose=False, log=False): + numInnerItermax=200, stopThr=1e-9, stopThr2=1e-9, verbose=False, log=False): """ Solve the general regularized OT problem with the generalized conditional gradient @@ -231,24 +300,26 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, Parameters ---------- - a : np.ndarray (ns,) + a : ndarray, shape (ns,) samples weights in the source domain - b : np.ndarray (nt,) + b : ndarrayv (nt,) samples in the target domain - M : np.ndarray (ns,nt) + M : ndarray, shape (ns, nt) loss matrix reg1 : float Entropic Regularization term >0 reg2 : float Second Regularization term >0 - G0 : np.ndarray (ns,nt), optional + G0 : ndarray, shape (ns, nt), optional initial guess (default is indep joint density) numItermax : int, optional Max number of iterations numInnerItermax : int, optional Max number of iterations of Sinkhorn stopThr : float, optional - Stop threshol on error (>0) + Stop threshol on the relative variation (>0) + stopThr2 : float, optional + Stop threshol on the absolute variation (>0) verbose : bool, optional Print information along iterations log : bool, optional @@ -256,15 +327,13 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, Returns ------- - gamma : (ns x nt) ndarray + gamma : ndarray, shape (ns, nt) Optimal transportation matrix for the given parameters log : dict log dictionary return only if log==True in parameters - References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. 
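The non-Armijo branch of solve_linesearch above reduces to minimizing a 1-D quadratic on [0, 1]; a self-contained sketch of that subproblem, with the same logic as the solve_1d_linesearch_quad helper added below (the function name here is illustrative only):

    def quad_min_on_unit_interval(a, b, c):
        # argmin of f(x) = a*x**2 + b*x + c over [0, 1]
        if a > 0:                                    # convex: clip vertex into [0, 1]
            return min(1., max(0., -b / (2.0 * a)))
        f0, f1 = c, a + b + c                        # endpoint values f(0), f(1)
        return 1. if f0 > f1 else 0.                 # non-convex: best endpoint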
@@ -294,9 +363,9 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, it = 0 if verbose: - print('{:5s}|{:12s}|{:8s}'.format( - 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32) - print('{:5d}|{:8e}|{:8e}'.format(it, f_val, 0)) + print('{:5s}|{:12s}|{:8s}|{:8s}'.format( + 'It.', 'Loss', 'Relative loss', 'Absolute loss') + '\n' + '-' * 48) + print('{:5d}|{:8e}|{:8e}|{:8e}'.format(it, f_val, 0, 0)) while loop: @@ -322,8 +391,10 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, if it >= numItermax: loop = 0 - delta_fval = (f_val - old_fval) / abs(f_val) - if abs(delta_fval) < stopThr: + abs_delta_fval = abs(f_val - old_fval) + relative_delta_fval = abs_delta_fval / abs(f_val) + + if relative_delta_fval < stopThr or abs_delta_fval < stopThr2: loop = 0 if log: @@ -331,11 +402,41 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10, if verbose: if it % 20 == 0: - print('{:5s}|{:12s}|{:8s}'.format( - 'It.', 'Loss', 'Delta loss') + '\n' + '-' * 32) - print('{:5d}|{:8e}|{:8e}'.format(it, f_val, delta_fval)) + print('{:5s}|{:12s}|{:8s}|{:8s}'.format( + 'It.', 'Loss', 'Relative loss', 'Absolute loss') + '\n' + '-' * 48) + print('{:5d}|{:8e}|{:8e}|{:8e}'.format(it, f_val, relative_delta_fval, abs_delta_fval)) if log: return G, log else: return G + + +def solve_1d_linesearch_quad(a, b, c): + """ + For any convex or non-convex 1d quadratic function f, solve on [0,1] the following problem: + .. math:: + \argmin f(x)=a*x^{2}+b*x+c + + Parameters + ---------- + a,b,c : float + The coefficients of the quadratic function + + Returns + ------- + x : float + The optimal value which leads to the minimal cost + """ + f0 = c + df0 = b + f1 = a + f0 + df0 + + if a > 0: # convex + minimum = min(1, max(0, np.divide(-b, 2.0 * a))) + return minimum + else: # non convex + if f0 > f1: + return 1 + else: + return 0 @@ -1,5 +1,11 @@ """ Functions for plotting OT matrices + +.. warning:: + Note that by default the module is not import in :mod:`ot`. In order to + use it you need to explicitely import :mod:`ot.plot` + + """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -20,11 +26,11 @@ def plot1D_mat(a, b, M, title=''): Parameters ---------- - a : np.array, shape (na,) + a : ndarray, shape (na,) Source distribution - b : np.array, shape (nb,) + b : ndarray, shape (nb,) Target distribution - M : np.array, shape (na,nb) + M : ndarray, shape (na, nb) Matrix to plot """ na, nb = M.shape diff --git a/ot/stochastic.py b/ot/stochastic.py index 4795d88..13ed9cc 100644 --- a/ot/stochastic.py +++ b/ot/stochastic.py @@ -1,3 +1,9 @@ +""" +Stochastic solvers for regularized OT. + + +""" + # Author: Kilian Fatras <kilian.fatras@gmail.com> # # License: MIT License @@ -11,7 +17,7 @@ import numpy as np def coordinate_grad_semi_dual(b, M, reg, beta, i): - ''' + r''' Compute the coordinate gradient update for regularized discrete distributions for (i, :) The function computes the gradient of the semi dual problem: @@ -32,51 +38,49 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): Parameters ---------- - - b : np.ndarray(nt,) - target measure - M : np.ndarray(ns, nt) - cost matrix - reg : float nu - Regularization term > 0 - v : np.ndarray(nt,) - dual variable - i : number int - picked number i + b : ndarray, shape (nt,) + Target measure. + M : ndarray, shape (ns, nt) + Cost matrix. + reg : float + Regularization term > 0. + v : ndarray, shape (nt,) + Dual variable. + i : int + Picked number i. 
Returns ------- - - coordinate gradient : np.ndarray(nt,) + coordinate gradient : ndarray, shape (nt,) Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) + References ---------- - [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. - + Stochastic Optimization for Large-scale Optimal Transport, + Advances in Neural Information Processing Systems (2016), + arXiv preprint arxiv:1605.08527. ''' - r = M[i, :] - beta exp_beta = np.exp(-r / reg) * b khi = exp_beta / (np.sum(exp_beta)) @@ -84,7 +88,7 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i): def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): - ''' + r''' Compute the SAG algorithm to solve the regularized discrete measures optimal transport max problem @@ -112,42 +116,43 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): Parameters ---------- - a : np.ndarray(ns,), - source measure - b : np.ndarray(nt,), - target measure - M : np.ndarray(ns, nt), - cost matrix - reg : float number, + a : ndarray, shape (ns,), + Source measure. + b : ndarray, shape (nt,), + Target measure. + M : ndarray, shape (ns, nt), + Cost matrix. + reg : float Regularization term > 0 - numItermax : int number - number of iteration - lr : float number - learning rate + numItermax : int + Number of iteration. + lr : float + Learning rate. Returns ------- - - v : np.ndarray(nt,) - dual variable + v : ndarray, shape (nt,) + Dual variable. 
Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- @@ -176,7 +181,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None): def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): - ''' + r''' Compute the ASGD algorithm to solve the regularized semi continous measures optimal transport max problem The function solves the following optimization problem: @@ -202,50 +207,49 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): Parameters ---------- - - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - numItermax : int number - number of iteration - lr : float number - learning rate - + numItermax : int + Number of iteration. + lr : float + Learning rate. Returns ------- - - ave_v : np.ndarray(nt,) + ave_v : ndarray, shape (nt,) dual variable Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. 
+ Stochastic Optimization for Large-scale Optimal Transport, + Advances in Neural Information Processing Systems (2016), + arXiv preprint arxiv:1605.08527. ''' if lr is None: @@ -264,7 +268,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None): def c_transform_entropic(b, M, reg, beta): - ''' + r''' The goal is to recover u from the c-transform. The function computes the c_transform of a dual variable from the other @@ -285,47 +289,47 @@ def c_transform_entropic(b, M, reg, beta): Parameters ---------- - - b : np.ndarray(nt,) - target measure - M : np.ndarray(ns, nt) - cost matrix + b : ndarray, shape (nt,) + Target measure + M : ndarray, shape (ns, nt) + Cost matrix reg : float - regularization term > 0 - v : np.ndarray(nt,) - dual variable + Regularization term > 0 + v : ndarray, shape (nt,) + Dual variable. Returns ------- - - u : np.ndarray(ns,) - dual variable + u : ndarray, shape (ns,) + Dual variable. Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- [Genevay et al., 2016] : - Stochastic Optimization for Large-scale Optimal Transport, - Advances in Neural Information Processing Systems (2016), - arXiv preprint arxiv:1605.08527. + Stochastic Optimization for Large-scale Optimal Transport, + Advances in Neural Information Processing Systems (2016), + arXiv preprint arxiv:1605.08527. ''' n_source = np.shape(M)[0] @@ -340,7 +344,7 @@ def c_transform_entropic(b, M, reg, beta): def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, log=False): - ''' + r''' Compute the transportation matrix to solve the regularized discrete measures optimal transport max problem @@ -348,8 +352,11 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, .. math:: \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + s.t. 
\gamma 1 = a + \gamma^T 1= b + \gamma \geq 0 Where : @@ -364,52 +371,53 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, Parameters ---------- - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 methode : str used method (SAG or ASGD) - numItermax : int number + numItermax : int number of iteration - lr : float number + lr : float learning rate - n_source : int number + n_source : int size of the source measure - n_target : int number + n_target : int size of the target measure log : bool, optional record log if True Returns ------- - - pi : np.ndarray(ns, nt) + pi : ndarray, shape (ns, nt) transportation matrix log : dict log dictionary return only if log==True in parameters Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 300000 >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> method = "ASGD" - >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, - method, numItermax) - >>> print(asgd_pi) + >>> ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=300000) + array([[2.53942342e-02, 9.98640673e-02, 1.75945647e-02, 4.27664307e-06], + [1.21556999e-01, 1.26350515e-02, 1.30491795e-03, 7.36017394e-03], + [3.54070702e-03, 7.63581358e-02, 6.29581672e-02, 1.32812798e-07], + [2.60578198e-02, 3.35916645e-02, 8.28023223e-02, 4.05336238e-04], + [9.86808864e-03, 7.59774324e-04, 1.08702729e-02, 1.21359007e-01], + [2.17218856e-02, 9.12931802e-04, 1.87962526e-03, 1.18342700e-01], + [4.14237512e-02, 2.67487857e-02, 7.23016955e-02, 2.38291052e-03]]) References ---------- @@ -448,7 +456,7 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None, def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, batch_beta): - ''' + r''' Computes the partial gradient of the dual optimal transport problem. 
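Both semi-dual solvers above are reached through the single solve_semi_dual_entropic entry point; only the `method` string changes. A short, hedged usage sketch before the dual machinery below (parameter values are illustrative):

import numpy as np
import ot

a, b = ot.utils.unif(7), ot.utils.unif(4)
rng = np.random.RandomState(0)
M = ot.dist(rng.randn(7, 2), rng.randn(4, 2))
# Same entry point, two inner loops: SAG (stored gradients) vs ASGD (averaged iterates).
pi_sag = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="SAG", numItermax=10000)
pi_asgd = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg=1, method="ASGD", numItermax=10000)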
For each (i,j) in a batch of coordinates, the partial gradients are : @@ -475,53 +483,55 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, Parameters ---------- - - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - alpha : np.ndarray(ns,) + alpha : ndarray, shape (ns,) dual variable - beta : np.ndarray(nt,) + beta : ndarray, shape (nt,) dual variable - batch_size : int number + batch_size : int size of the batch - batch_alpha : np.ndarray(bs,) + batch_alpha : ndarray, shape (bs,) batch of index of alpha - batch_beta : np.ndarray(bs,) + batch_beta : ndarray, shape (bs,) batch of index of beta Returns ------- - - grad : np.ndarray(ns,) + grad : ndarray, shape (ns,) partial grad F Examples -------- - + >>> import ot + >>> np.random.seed(0) >>> n_source = 7 >>> n_target = 4 - >>> reg = 1 - >>> numItermax = 20000 - >>> lr = 0.1 - >>> batch_size = 3 - >>> log = True >>> a = ot.utils.unif(n_source) >>> b = ot.utils.unif(n_target) - >>> rng = np.random.RandomState(0) - >>> X_source = rng.randn(n_source, 2) - >>> Y_target = rng.randn(n_target, 2) + >>> X_source = np.random.randn(n_source, 2) + >>> Y_target = np.random.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg, - batch_size, - numItermax, lr, log) - >>> print(log['alpha'], log['beta']) - >>> print(sgd_dual_pi) + >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg=1, batch_size=3, numItermax=30000, lr=0.1, log=True) + >>> log['alpha'] + array([0.71759102, 1.57057384, 0.85576566, 0.1208211 , 0.59190466, + 1.197148 , 0.17805133]) + >>> log['beta'] + array([0.49741367, 0.57478564, 1.40075528, 2.75890102]) + >>> sgd_dual_pi + array([[2.09730063e-02, 8.38169324e-02, 7.50365455e-03, 8.72731415e-09], + [5.58432437e-03, 5.89881299e-04, 3.09558411e-05, 8.35469849e-07], + [3.26489515e-03, 7.15536035e-02, 2.99778211e-02, 3.02601593e-10], + [4.05390622e-02, 5.31085068e-02, 6.65191787e-02, 1.55812785e-06], + [7.82299812e-02, 6.12099102e-03, 4.44989098e-02, 2.37719187e-03], + [5.06266486e-02, 2.16230494e-03, 2.26215141e-03, 6.81514609e-04], + [6.06713990e-02, 3.98139808e-02, 5.46829338e-02, 8.62371424e-06]]) References ---------- @@ -530,22 +540,21 @@ def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha, International Conference on Learning Representation (2018), arXiv preprint arxiv:1711.02283. 
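The batched gradient above plugs directly into a plain gradient-ascent loop. A hedged one-step sketch using only the batch_grad_dual signature documented in this hunk (step size and batch draws are illustrative):

import numpy as np
import ot

rng = np.random.RandomState(0)
a, b = ot.utils.unif(7), ot.utils.unif(4)
M = ot.dist(rng.randn(7, 2), rng.randn(4, 2))
alpha, beta = np.zeros(7), np.zeros(4)
batch_alpha = rng.choice(7, 3, replace=False)  # random batch of source indices
batch_beta = rng.choice(4, 3, replace=False)   # random batch of target indices
grad_alpha, grad_beta = ot.stochastic.batch_grad_dual(
    a, b, M, 1, alpha, beta, 3, batch_alpha, batch_beta)
alpha += 0.1 * grad_alpha  # ascent step: the entropic dual is concave
beta += 0.1 * grad_beta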
''' - G = - (np.exp((alpha[batch_alpha, None] + beta[None, batch_beta] - M[batch_alpha, :][:, batch_beta]) / reg) * a[batch_alpha, None] * b[None, batch_beta]) grad_beta = np.zeros(np.shape(M)[1]) grad_alpha = np.zeros(np.shape(M)[0]) - grad_beta[batch_beta] = (b[batch_beta] * len(batch_alpha) / np.shape(M)[0] + - G.sum(0)) - grad_alpha[batch_alpha] = (a[batch_alpha] * len(batch_beta) / - np.shape(M)[1] + G.sum(1)) + grad_beta[batch_beta] = (b[batch_beta] * len(batch_alpha) / np.shape(M)[0] + + G.sum(0)) + grad_alpha[batch_alpha] = (a[batch_alpha] * len(batch_beta) + / np.shape(M)[1] + G.sum(1)) return grad_alpha, grad_beta def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): - ''' + r''' Compute the sgd algorithm to solve the regularized discrete measures optimal transport dual problem @@ -568,33 +577,31 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): Parameters ---------- - - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - batch_size : int number + batch_size : int size of the batch - numItermax : int number + numItermax : int number of iteration - lr : float number + lr : float learning rate Returns ------- - - alpha : np.ndarray(ns,) + alpha : ndarray, shape (ns,) dual variable - beta : np.ndarray(nt,) + beta : ndarray, shape (nt,) dual variable Examples -------- - + >>> import ot >>> n_source = 7 >>> n_target = 4 >>> reg = 1 @@ -608,18 +615,26 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): >>> X_source = rng.randn(n_source, 2) >>> Y_target = rng.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg, - batch_size, - numItermax, lr, log) - >>> print(log['alpha'], log['beta']) - >>> print(sgd_dual_pi) + >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size, numItermax, lr, log) + >>> log['alpha'] + array([0.64171798, 1.27932201, 0.78132257, 0.15638935, 0.54888354, + 1.03663469, 0.20595781]) + >>> log['beta'] + array([0.51207194, 0.58033189, 1.28922676, 2.26859736]) + >>> sgd_dual_pi + array([[1.97276541e-02, 7.81248547e-02, 6.22136048e-03, 4.95442423e-09], + [4.23494310e-03, 4.43286263e-04, 2.06927079e-05, 3.82389139e-07], + [3.07542414e-03, 6.67897769e-02, 2.48904999e-02, 1.72030247e-10], + [4.26271990e-02, 5.53375455e-02, 6.16535024e-02, 9.88812650e-07], + [7.60423265e-02, 5.89585256e-03, 3.81267087e-02, 1.39458256e-03], + [4.37557504e-02, 1.85189176e-03, 1.72335760e-03, 3.55491279e-04], + [6.33096109e-02, 4.11683954e-02, 5.02962051e-02, 5.43097516e-06]]) References ---------- - [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + International Conference on Learning Representation (2018), + arXiv preprint arxiv:1711.02283. 
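Once the dual variables have converged, the primal plan follows from the standard entropic first-order relation, which is also the kernel G used in the gradient above. A minimal sketch (the helper name is hypothetical):

import numpy as np

def plan_from_duals(alpha, beta, a, b, M, reg):
    # pi_ij = exp((alpha_i + beta_j - M_ij) / reg) * a_i * b_j
    return (np.exp((alpha[:, None] + beta[None, :] - M) / reg)
            * a[:, None] * b[None, :])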
''' n_source = np.shape(M)[0] @@ -641,7 +656,7 @@ def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr): def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, log=False): - ''' + r''' Compute the transportation matrix to solve the regularized discrete measures optimal transport dual problem @@ -664,35 +679,33 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, Parameters ---------- - - a : np.ndarray(ns,) + a : ndarray, shape (ns,) source measure - b : np.ndarray(nt,) + b : ndarray, shape (nt,) target measure - M : np.ndarray(ns, nt) + M : ndarray, shape (ns, nt) cost matrix - reg : float number + reg : float Regularization term > 0 - batch_size : int number + batch_size : int size of the batch - numItermax : int number + numItermax : int number of iteration - lr : float number + lr : float learning rate log : bool, optional record log if True Returns ------- - - pi : np.ndarray(ns, nt) + pi : ndarray, shape (ns, nt) transportation matrix log : dict log dictionary return only if log==True in parameters Examples -------- - + >>> import ot >>> n_source = 7 >>> n_target = 4 >>> reg = 1 @@ -706,18 +719,27 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1, >>> X_source = rng.randn(n_source, 2) >>> Y_target = rng.randn(n_target, 2) >>> M = ot.dist(X_source, Y_target) - >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg, - batch_size, - numItermax, lr, log) - >>> print(log['alpha'], log['beta']) - >>> print(sgd_dual_pi) + >>> sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size, numItermax, lr, log) + >>> log['alpha'] + array([0.64057733, 1.2683513 , 0.75610161, 0.16024284, 0.54926534, + 1.0514201 , 0.19958936]) + >>> log['beta'] + array([0.51372571, 0.58843489, 1.27993921, 2.24344807]) + >>> sgd_dual_pi + array([[1.97377795e-02, 7.86706853e-02, 6.15682001e-03, 4.82586997e-09], + [4.19566963e-03, 4.42016865e-04, 2.02777272e-05, 3.68823708e-07], + [3.00379244e-03, 6.56562018e-02, 2.40462171e-02, 1.63579656e-10], + [4.28626062e-02, 5.60031599e-02, 6.13193826e-02, 9.67977735e-07], + [7.61972739e-02, 5.94609051e-03, 3.77886693e-02, 1.36046648e-03], + [4.44810042e-02, 1.89476742e-03, 1.73285847e-03, 3.51826036e-04], + [6.30118293e-02, 4.12398660e-02, 4.95148998e-02, 5.26247246e-06]]) References ---------- [Seguy et al., 2018] : - International Conference on Learning Representation (2018), - arXiv preprint arxiv:1711.02283. + International Conference on Learning Representation (2018), + arXiv preprint arxiv:1711.02283. ''' opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg, batch_size, diff --git a/ot/unbalanced.py b/ot/unbalanced.py new file mode 100644 index 0000000..23f6607 --- /dev/null +++ b/ot/unbalanced.py @@ -0,0 +1,1023 @@ +# -*- coding: utf-8 -*- +""" +Regularized Unbalanced OT +""" + +# Author: Hicham Janati <hicham.janati@inria.fr> +# License: MIT License + +from __future__ import division +import warnings +import numpy as np +from scipy.special import logsumexp + +# from .utils import unif, dist + + +def sinkhorn_unbalanced(a, b, M, reg, reg_m, method='sinkhorn', numItermax=1000, + stopThr=1e-6, verbose=False, log=False, **kwargs): + r""" + Solve the unbalanced entropic regularization optimal transport problem + and return the OT plan + + The function solves the following optimization problem: + + .. math:: + W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + + s.t. 
+ \gamma\geq 0 + where : + + - M is the (dim_a, dim_b) metric cost matrix + - :math:`\Omega` is the entropic regularization + term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are source and target unbalanced distributions + - KL is the Kullback-Leibler divergence + + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) + One or multiple unnormalized histograms of dimension dim_b + If many, compute all the OT distances (a, b_i) + M : np.ndarray (dim_a, dim_b) + loss matrix + reg : float + Entropy regularization term > 0 + reg_m: float + Marginal relaxation term > 0 + method : str + method used for the solver either 'sinkhorn', 'sinkhorn_stabilized' or + 'sinkhorn_reg_scaling', see those function for specific parameters + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + if n_hists == 1: + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + else: + ot_distance : (n_hists,) ndarray + the OT distance between `a` and each of the histograms `b_i` + log : dict + log dictionary returned only if `log` is `True` + + Examples + -------- + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn_unbalanced(a, b, M, 1, 1) + array([[0.51122823, 0.18807035], + [0.18807035, 0.51122823]]) + + + References + ---------- + + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal + Transport, Advances in Neural Information Processing Systems + (NIPS) 26, 2013 + + .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for + Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519. + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprint + arXiv:1607.05816. + + .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T. : + Learning with a Wasserstein Loss, Advances in Neural Information + Processing Systems (NIPS) 2015 + + + See Also + -------- + ot.unbalanced.sinkhorn_knopp_unbalanced : Unbalanced Classic Sinkhorn [10] + ot.unbalanced.sinkhorn_stabilized_unbalanced: + Unbalanced Stabilized sinkhorn [9][10] + ot.unbalanced.sinkhorn_reg_scaling_unbalanced: + Unbalanced Sinkhorn with epslilon scaling [9][10] + + """ + + if method.lower() == 'sinkhorn': + return sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) + + elif method.lower() == 'sinkhorn_stabilized': + return sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, + numItermax=numItermax, + stopThr=stopThr, + verbose=verbose, + log=log, **kwargs) + elif method.lower() in ['sinkhorn_reg_scaling']: + warnings.warn('Method not implemented yet. Using classic Sinkhorn Knopp') + return sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) + else: + raise ValueError("Unknown method '%s'." 
% method) + + +def sinkhorn_unbalanced2(a, b, M, reg, reg_m, method='sinkhorn', + numItermax=1000, stopThr=1e-6, verbose=False, + log=False, **kwargs): + r""" + Solve the entropic regularization unbalanced optimal transport problem and + return the loss + + The function solves the following optimization problem: + + .. math:: + W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + + s.t. + \gamma\geq 0 + where : + + - M is the (dim_a, dim_b) metric cost matrix + - :math:`\Omega` is the entropic regularization term + :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are source and target unbalanced distributions + - KL is the Kullback-Leibler divergence + + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) + One or multiple unnormalized histograms of dimension dim_b + If many, compute all the OT distances (a, b_i) + M : np.ndarray (dim_a, dim_b) + loss matrix + reg : float + Entropy regularization term > 0 + reg_m: float + Marginal relaxation term > 0 + method : str + method used for the solver either 'sinkhorn', 'sinkhorn_stabilized' or + 'sinkhorn_reg_scaling', see those function for specific parameters + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + ot_distance : (n_hists,) ndarray + the OT distance between `a` and each of the histograms `b_i` + log : dict + log dictionary returned only if `log` is `True` + + Examples + -------- + + >>> import ot + >>> a=[.5, .10] + >>> b=[.5, .5] + >>> M=[[0., 1.],[1., 0.]] + >>> ot.unbalanced.sinkhorn_unbalanced2(a, b, M, 1., 1.) + array([0.31912866]) + + + + References + ---------- + + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal + Transport, Advances in Neural Information Processing Systems + (NIPS) 26, 2013 + + .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for + Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519. + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprint + arXiv:1607.05816. + + .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T. : + Learning with a Wasserstein Loss, Advances in Neural Information + Processing Systems (NIPS) 2015 + + See Also + -------- + ot.unbalanced.sinkhorn_knopp : Unbalanced Classic Sinkhorn [10] + ot.unbalanced.sinkhorn_stabilized: Unbalanced Stabilized sinkhorn [9][10] + ot.unbalanced.sinkhorn_reg_scaling: Unbalanced Sinkhorn with epslilon scaling [9][10] + + """ + b = np.asarray(b, dtype=np.float64) + if len(b.shape) < 2: + b = b[:, None] + if method.lower() == 'sinkhorn': + return sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) + + elif method.lower() == 'sinkhorn_stabilized': + return sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, + numItermax=numItermax, + stopThr=stopThr, + verbose=verbose, + log=log, **kwargs) + elif method.lower() in ['sinkhorn_reg_scaling']: + warnings.warn('Method not implemented yet. 
Using classic Sinkhorn Knopp') + return sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) + else: + raise ValueError('Unknown method %s.' % method) + + +def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000, + stopThr=1e-6, verbose=False, log=False, **kwargs): + r""" + Solve the entropic regularization unbalanced optimal transport problem and return the loss + + The function solves the following optimization problem: + + .. math:: + W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + \reg_m KL(\gamma 1, a) + \reg_m KL(\gamma^T 1, b) + + s.t. + \gamma\geq 0 + where : + + - M is the (dim_a, dim_b) metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are source and target unbalanced distributions + - KL is the Kullback-Leibler divergence + + The algorithm used for solving the problem is the generalized Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) + One or multiple unnormalized histograms of dimension dim_b + If many, compute all the OT distances (a, b_i) + M : np.ndarray (dim_a, dim_b) + loss matrix + reg : float + Entropy regularization term > 0 + reg_m: float + Marginal relaxation term > 0 + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (> 0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + if n_hists == 1: + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + else: + ot_distance : (n_hists,) ndarray + the OT distance between `a` and each of the histograms `b_i` + log : dict + log dictionary returned only if `log` is `True` + Examples + -------- + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.],[1., 0.]] + >>> ot.unbalanced.sinkhorn_knopp_unbalanced(a, b, M, 1., 1.) + array([[0.51122823, 0.18807035], + [0.18807035, 0.51122823]]) + + References + ---------- + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprint + arXiv:1607.05816. + + .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T. 
: + Learning with a Wasserstein Loss, Advances in Neural Information + Processing Systems (NIPS) 2015 + + See Also + -------- + ot.lp.emd : Unregularized OT + ot.optim.cg : General regularized OT + + """ + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + + dim_a, dim_b = M.shape + + if len(a) == 0: + a = np.ones(dim_a, dtype=np.float64) / dim_a + if len(b) == 0: + b = np.ones(dim_b, dtype=np.float64) / dim_b + + if len(b.shape) > 1: + n_hists = b.shape[1] + else: + n_hists = 0 + + if log: + log = {'err': []} + + # we assume that no distances are null except those of the diagonal of + # distances + if n_hists: + u = np.ones((dim_a, 1)) / dim_a + v = np.ones((dim_b, n_hists)) / dim_b + a = a.reshape(dim_a, 1) + else: + u = np.ones(dim_a) / dim_a + v = np.ones(dim_b) / dim_b + + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + fi = reg_m / (reg_m + reg) + + err = 1. + + for i in range(numItermax): + uprev = u + vprev = v + + Kv = K.dot(v) + u = (a / Kv) ** fi + Ktu = K.T.dot(u) + v = (b / Ktu) ** fi + + if (np.any(Ktu == 0.) + or np.any(np.isnan(u)) or np.any(np.isnan(v)) + or np.any(np.isinf(u)) or np.any(np.isinf(v))): + # we have reached the machine precision + # come back to previous solution and quit loop + warnings.warn('Numerical errors at iteration %s' % i) + u = uprev + v = vprev + break + + err_u = abs(u - uprev).max() / max(abs(u).max(), abs(uprev).max(), 1.) + err_v = abs(v - vprev).max() / max(abs(v).max(), abs(vprev).max(), 1.) + err = 0.5 * (err_u + err_v) + if log: + log['err'].append(err) + if verbose: + if i % 50 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(i, err)) + if err < stopThr: + break + + if log: + log['logu'] = np.log(u + 1e-300) + log['logv'] = np.log(v + 1e-300) + + if n_hists: # return only loss + res = np.einsum('ik,ij,jk,ij->k', u, K, v, M) + if log: + return res, log + else: + return res + + else: # return OT matrix + + if log: + return u[:, None] * K * v[None, :], log + else: + return u[:, None] * K * v[None, :] + + +def sinkhorn_stabilized_unbalanced(a, b, M, reg, reg_m, tau=1e5, numItermax=1000, + stopThr=1e-6, verbose=False, log=False, + **kwargs): + r""" + Solve the entropic regularization unbalanced optimal transport + problem and return the loss + + The function solves the following optimization problem using log-domain + stabilization as proposed in [10]: + + .. math:: + W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + reg_m KL(\gamma 1, a) + reg_m KL(\gamma^T 1, b) + + s.t. 
+ \gamma\geq 0 + where : + + - M is the (dim_a, dim_b) metric cost matrix + - :math:`\Omega` is the entropic regularization + term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are source and target unbalanced distributions + - KL is the Kullback-Leibler divergence + + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 23]_ + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists) + One or multiple unnormalized histograms of dimension dim_b + If many, compute all the OT distances (a, b_i) + M : np.ndarray (dim_a, dim_b) + loss matrix + reg : float + Entropy regularization term > 0 + reg_m: float + Marginal relaxation term > 0 + tau : float + thershold for max value in u or v for log scaling + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + if n_hists == 1: + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + else: + ot_distance : (n_hists,) ndarray + the OT distance between `a` and each of the histograms `b_i` + log : dict + log dictionary returned only if `log` is `True` + Examples + -------- + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.],[1., 0.]] + >>> ot.unbalanced.sinkhorn_stabilized_unbalanced(a, b, M, 1., 1.) + array([[0.51122823, 0.18807035], + [0.18807035, 0.51122823]]) + + References + ---------- + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. + + .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T. : + Learning with a Wasserstein Loss, Advances in Neural Information + Processing Systems (NIPS) 2015 + + See Also + -------- + ot.lp.emd : Unregularized OT + ot.optim.cg : General regularized OT + + """ + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + + dim_a, dim_b = M.shape + + if len(a) == 0: + a = np.ones(dim_a, dtype=np.float64) / dim_a + if len(b) == 0: + b = np.ones(dim_b, dtype=np.float64) / dim_b + + if len(b.shape) > 1: + n_hists = b.shape[1] + else: + n_hists = 0 + + if log: + log = {'err': []} + + # we assume that no distances are null except those of the diagonal of + # distances + if n_hists: + u = np.ones((dim_a, n_hists)) / dim_a + v = np.ones((dim_b, n_hists)) / dim_b + a = a.reshape(dim_a, 1) + else: + u = np.ones(dim_a) / dim_a + v = np.ones(dim_b) / dim_b + + # print(reg) + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + fi = reg_m / (reg_m + reg) + + cpt = 0 + err = 1. 
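+ # The loop below alternates scaled Sinkhorn updates on u and v. Whenever
+ # either scaling vector exceeds the threshold tau, its magnitude is
+ # absorbed into the log-domain potentials alpha and beta, the kernel K
+ # is recomputed and v is reset, which keeps u and v in a numerically
+ # safe range (log-domain stabilization of [10]).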
+ alpha = np.zeros(dim_a) + beta = np.zeros(dim_b) + while (err > stopThr and cpt < numItermax): + uprev = u + vprev = v + + Kv = K.dot(v) + f_alpha = np.exp(- alpha / (reg + reg_m)) + f_beta = np.exp(- beta / (reg + reg_m)) + + if n_hists: + f_alpha = f_alpha[:, None] + f_beta = f_beta[:, None] + u = ((a / (Kv + 1e-16)) ** fi) * f_alpha + Ktu = K.T.dot(u) + v = ((b / (Ktu + 1e-16)) ** fi) * f_beta + absorbing = False + if (u > tau).any() or (v > tau).any(): + absorbing = True + if n_hists: + alpha = alpha + reg * np.log(np.max(u, 1)) + beta = beta + reg * np.log(np.max(v, 1)) + else: + alpha = alpha + reg * np.log(np.max(u)) + beta = beta + reg * np.log(np.max(v)) + K = np.exp((alpha[:, None] + beta[None, :] - + M) / reg) + v = np.ones_like(v) + Kv = K.dot(v) + + if (np.any(Ktu == 0.) + or np.any(np.isnan(u)) or np.any(np.isnan(v)) + or np.any(np.isinf(u)) or np.any(np.isinf(v))): + # we have reached the machine precision + # come back to previous solution and quit loop + warnings.warn('Numerical errors at iteration %s' % cpt) + u = uprev + v = vprev + break + if (cpt % 10 == 0 and not absorbing) or cpt == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = abs(u - uprev).max() / max(abs(u).max(), abs(uprev).max(), + 1.) + if log: + log['err'].append(err) + if verbose: + if cpt % 200 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + cpt = cpt + 1 + + if err > stopThr: + warnings.warn("Stabilized Unbalanced Sinkhorn did not converge." + + "Try a larger entropy `reg` or a lower mass `reg_m`." + + "Or a larger absorption threshold `tau`.") + if n_hists: + logu = alpha[:, None] / reg + np.log(u) + logv = beta[:, None] / reg + np.log(v) + else: + logu = alpha / reg + np.log(u) + logv = beta / reg + np.log(v) + if log: + log['logu'] = logu + log['logv'] = logv + if n_hists: # return only loss + res = logsumexp(np.log(M + 1e-100)[:, :, None] + logu[:, None, :] + + logv[None, :, :] - M[:, :, None] / reg, axis=(0, 1)) + res = np.exp(res) + if log: + return res, log + else: + return res + + else: # return OT matrix + ot_matrix = np.exp(logu[:, None] + logv[None, :] - M / reg) + if log: + return ot_matrix, log + else: + return ot_matrix + + +def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3, + numItermax=1000, stopThr=1e-6, + verbose=False, log=False): + r"""Compute the entropic unbalanced wasserstein barycenter of A with stabilization. + + The function solves the following optimization problem: + + .. math:: + \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + + where : + + - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized + Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) + - :math:`\mathbf{a}_i` are training distributions in the columns of + matrix :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and + the cost matrix for OT + - reg_mis the marginal relaxation hyperparameter + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + + Parameters + ---------- + A : np.ndarray (dim, n_hists) + `n_hists` training distributions a_i of dimension dim + M : np.ndarray (dim, dim) + ground metric matrix for OT. + reg : float + Entropy regularization term > 0 + reg_m : float + Marginal relaxation term > 0 + tau : float + Stabilization threshold for log domain absorption. 
+ weights : np.ndarray (n_hists,) optional + Weight of each distribution (barycentric coodinates) + If None, uniform weights are used. + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (> 0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + a : (dim,) ndarray + Unbalanced Wasserstein barycenter + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, + G. (2015). Iterative Bregman projections for regularized transportation + problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprint + arXiv:1607.05816. + + + """ + dim, n_hists = A.shape + if weights is None: + weights = np.ones(n_hists) / n_hists + else: + assert(len(weights) == A.shape[1]) + + if log: + log = {'err': []} + + fi = reg_m / (reg_m + reg) + + u = np.ones((dim, n_hists)) / dim + v = np.ones((dim, n_hists)) / dim + + # print(reg) + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + fi = reg_m / (reg_m + reg) + + cpt = 0 + err = 1. + alpha = np.zeros(dim) + beta = np.zeros(dim) + q = np.ones(dim) / dim + for i in range(numItermax): + qprev = q.copy() + Kv = K.dot(v) + f_alpha = np.exp(- alpha / (reg + reg_m)) + f_beta = np.exp(- beta / (reg + reg_m)) + f_alpha = f_alpha[:, None] + f_beta = f_beta[:, None] + u = ((A / (Kv + 1e-16)) ** fi) * f_alpha + Ktu = K.T.dot(u) + q = (Ktu ** (1 - fi)) * f_beta + q = q.dot(weights) ** (1 / (1 - fi)) + Q = q[:, None] + v = ((Q / (Ktu + 1e-16)) ** fi) * f_beta + absorbing = False + if (u > tau).any() or (v > tau).any(): + absorbing = True + alpha = alpha + reg * np.log(np.max(u, 1)) + beta = beta + reg * np.log(np.max(v, 1)) + K = np.exp((alpha[:, None] + beta[None, :] - + M) / reg) + v = np.ones_like(v) + Kv = K.dot(v) + if (np.any(Ktu == 0.) + or np.any(np.isnan(u)) or np.any(np.isnan(v)) + or np.any(np.isinf(u)) or np.any(np.isinf(v))): + # we have reached the machine precision + # come back to previous solution and quit loop + warnings.warn('Numerical errors at iteration %s' % cpt) + q = qprev + break + if (i % 10 == 0 and not absorbing) or i == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err = abs(q - qprev).max() / max(abs(q).max(), + abs(qprev).max(), 1.) + if log: + log['err'].append(err) + if verbose: + if i % 50 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(i, err)) + if err < stopThr: + break + + if err > stopThr: + warnings.warn("Stabilized Unbalanced Sinkhorn did not converge." + + "Try a larger entropy `reg` or a lower mass `reg_m`." + + "Or a larger absorption threshold `tau`.") + if log: + log['niter'] = i + log['logu'] = np.log(u + 1e-300) + log['logv'] = np.log(v + 1e-300) + return q, log + else: + return q + + +def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, + numItermax=1000, stopThr=1e-6, + verbose=False, log=False): + r"""Compute the entropic unbalanced wasserstein barycenter of A. + + The function solves the following optimization problem with a + + .. 
math:: + \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + + where : + + - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized + Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix + :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and + the cost matrix for OT + - reg_mis the marginal relaxation hyperparameter + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + + Parameters + ---------- + A : np.ndarray (dim, n_hists) + `n_hists` training distributions a_i of dimension dim + M : np.ndarray (dim, dim) + ground metric matrix for OT. + reg : float + Entropy regularization term > 0 + reg_m: float + Marginal relaxation term > 0 + weights : np.ndarray (n_hists,) optional + Weight of each distribution (barycentric coodinates) + If None, uniform weights are used. + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (> 0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + a : (dim,) ndarray + Unbalanced Wasserstein barycenter + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. + (2015). Iterative Bregman projections for regularized transportation + problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprin + arXiv:1607.05816. + + + """ + dim, n_hists = A.shape + if weights is None: + weights = np.ones(n_hists) / n_hists + else: + assert(len(weights) == A.shape[1]) + + if log: + log = {'err': []} + + K = np.exp(- M / reg) + + fi = reg_m / (reg_m + reg) + + v = np.ones((dim, n_hists)) + u = np.ones((dim, 1)) + q = np.ones(dim) + err = 1. + + for i in range(numItermax): + uprev = u.copy() + vprev = v.copy() + qprev = q.copy() + + Kv = K.dot(v) + u = (A / Kv) ** fi + Ktu = K.T.dot(u) + q = ((Ktu ** (1 - fi)).dot(weights)) + q = q ** (1 / (1 - fi)) + Q = q[:, None] + v = (Q / Ktu) ** fi + + if (np.any(Ktu == 0.) + or np.any(np.isnan(u)) or np.any(np.isnan(v)) + or np.any(np.isinf(u)) or np.any(np.isinf(v))): + # we have reached the machine precision + # come back to previous solution and quit loop + warnings.warn('Numerical errors at iteration %s' % i) + u = uprev + v = vprev + q = qprev + break + # compute change in barycenter + err = abs(q - qprev).max() + err /= max(abs(q).max(), abs(qprev).max(), 1.) + if log: + log['err'].append(err) + # if barycenter did not change + at least 10 iterations - stop + if err < stopThr and i > 10: + break + + if verbose: + if i % 10 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(i, err)) + + if log: + log['niter'] = i + log['logu'] = np.log(u + 1e-300) + log['logv'] = np.log(v + 1e-300) + return q, log + else: + return q + + +def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, + numItermax=1000, stopThr=1e-6, + verbose=False, log=False, **kwargs): + r"""Compute the entropic unbalanced wasserstein barycenter of A. + + The function solves the following optimization problem with a + + .. 
math:: + \mathbf{a} = arg\min_\mathbf{a} \sum_i Wu_{reg}(\mathbf{a},\mathbf{a}_i) + + where : + + - :math:`Wu_{reg}(\cdot,\cdot)` is the unbalanced entropic regularized + Wasserstein distance (see ot.unbalanced.sinkhorn_unbalanced) + - :math:`\mathbf{a}_i` are training distributions in the columns of matrix + :math:`\mathbf{A}` + - reg and :math:`\mathbf{M}` are respectively the regularization term and + the cost matrix for OT + - reg_mis the marginal relaxation hyperparameter + The algorithm used for solving the problem is the generalized + Sinkhorn-Knopp matrix scaling algorithm as proposed in [10]_ + + Parameters + ---------- + A : np.ndarray (dim, n_hists) + `n_hists` training distributions a_i of dimension dim + M : np.ndarray (dim, dim) + ground metric matrix for OT. + reg : float + Entropy regularization term > 0 + reg_m: float + Marginal relaxation term > 0 + weights : np.ndarray (n_hists,) optional + Weight of each distribution (barycentric coodinates) + If None, uniform weights are used. + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (> 0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + a : (dim,) ndarray + Unbalanced Wasserstein barycenter + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. + (2015). Iterative Bregman projections for regularized transportation + problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). + Scaling algorithms for unbalanced transport problems. arXiv preprin + arXiv:1607.05816. + + """ + + if method.lower() == 'sinkhorn': + return barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, + weights=weights, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) + + elif method.lower() == 'sinkhorn_stabilized': + return barycenter_unbalanced_stabilized(A, M, reg, reg_m, + weights=weights, + numItermax=numItermax, + stopThr=stopThr, + verbose=verbose, + log=log, **kwargs) + elif method.lower() in ['sinkhorn_reg_scaling']: + warnings.warn('Method not implemented yet. Using classic Sinkhorn Knopp') + return barycenter_unbalanced(A, M, reg, reg_m, + weights=weights, + numItermax=numItermax, + stopThr=stopThr, verbose=verbose, + log=log, **kwargs) + else: + raise ValueError("Unknown method '%s'." 
% method) diff --git a/ot/utils.py b/ot/utils.py index bb21b38..b71458b 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Various function that can be usefull +Various useful functions """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -111,12 +111,12 @@ def dist(x1, x2=None, metric='sqeuclidean'): Parameters ---------- - x1 : np.array (n1,d) + x1 : ndarray, shape (n1,d) matrix with n1 samples of size d - x2 : np.array (n2,d), optional + x2 : ndarray, shape (n2,d), optional matrix with n2 samples of size d (if None then x2=x1) - metric : str, fun, optional - name of the metric to be computed (full list in the doc of scipy), If a string, + metric : str | callable, optional + Name of the metric to be computed (full list in the doc of scipy). If a string, the distance function can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', @@ -138,26 +138,21 @@ def dist0(n, method='lin_square'): - """Compute standard cost matrices of size (n,n) for OT problems + """Compute standard cost matrices of size (n, n) for OT problems Parameters ---------- n : int - size of the cost matrix + Size of the cost matrix. method : str, optional Type of loss matrix chosen from: * 'lin_square' : linear sampling between 0 and n-1, quadratic loss - Returns ------- - - M : np.array (n1,n2) - distance matrix computed with given metric - - + M : ndarray, shape (n, n) + Distance matrix computed with given metric. """ res = 0 if method == 'lin_square': @@ -169,33 +164,34 @@ def cost_normalization(C, norm=None): """ Apply normalization to the loss matrix - Parameters ---------- - C : np.array (n1, n2) + C : ndarray, shape (n1, n2) The cost matrix to normalize. norm : str - type of normalization from 'median','max','log','loglog'. Any other - value do not normalize. - + Type of normalization from 'median', 'max', 'log', 'loglog'. Any + other value does not normalize. Returns ------- - - C : np.array (n1, n2) + C : ndarray, shape (n1, n2) The input cost matrix normalized according to given norm.
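Since this patch also tightens cost_normalization (see the hunk just below, which now rejects unknown norms), a quick hedged doctest-style illustration of the 'max' branch, with values computed by hand; note that the function modifies its input in place for 'median' and 'max':

>>> import numpy as np
>>> from ot.utils import cost_normalization
>>> C = np.array([[1., 2.], [3., 4.]])
>>> cost_normalization(C.copy(), 'max')
array([[0.25, 0.5 ],
       [0.75, 1.  ]])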
- """ - if norm == "median": + if norm is None: + pass + elif norm == "median": C /= float(np.median(C)) elif norm == "max": C /= float(np.max(C)) elif norm == "log": C = np.log(1 + C) elif norm == "loglog": - C = np.log(1 + np.log(1 + C)) - + C = np.log1p(np.log1p(C)) + else: + raise ValueError('Norm %s is not a valid option.\n' + 'Valid options are:\n' + 'median, max, log, loglog' % norm) return C @@ -214,23 +210,28 @@ def fun(f, q_in, q_out): def parmap(f, X, nprocs=multiprocessing.cpu_count()): - """ paralell map for multiprocessing """ - q_in = multiprocessing.Queue(1) - q_out = multiprocessing.Queue() + """ paralell map for multiprocessing (only map on windows)""" - proc = [multiprocessing.Process(target=fun, args=(f, q_in, q_out)) - for _ in range(nprocs)] - for p in proc: - p.daemon = True - p.start() + if not sys.platform.endswith('win32'): - sent = [q_in.put((i, x)) for i, x in enumerate(X)] - [q_in.put((None, None)) for _ in range(nprocs)] - res = [q_out.get() for _ in range(len(sent))] + q_in = multiprocessing.Queue(1) + q_out = multiprocessing.Queue() - [p.join() for p in proc] + proc = [multiprocessing.Process(target=fun, args=(f, q_in, q_out)) + for _ in range(nprocs)] + for p in proc: + p.daemon = True + p.start() - return [x for i, x in sorted(res)] + sent = [q_in.put((i, x)) for i, x in enumerate(X)] + [q_in.put((None, None)) for _ in range(nprocs)] + res = [q_out.get() for _ in range(len(sent))] + + [p.join() for p in proc] + + return [x for i, x in sorted(res)] + else: + return list(map(f, X)) def check_params(**kwargs): @@ -256,6 +257,7 @@ def check_params(**kwargs): def check_random_state(seed): """Turn seed into a np.random.RandomState instance + Parameters ---------- seed : None | int | instance of RandomState @@ -275,7 +277,6 @@ def check_random_state(seed): class deprecated(object): - """Decorator to mark a function or class as deprecated. deprecated class from scikit-learn package @@ -285,14 +286,14 @@ class deprecated(object): The optional extra argument will be appended to the deprecation message and the docstring. Note: to use this with the default value for extra, put in an empty of parentheses: - >>> from ot.deprecation import deprecated - >>> @deprecated() - ... def some_function(): pass + >>> from ot.deprecation import deprecated # doctest: +SKIP + >>> @deprecated() # doctest: +SKIP + ... def some_function(): pass # doctest: +SKIP Parameters ---------- - extra : string - to be added to the deprecation messages + extra : str + To be added to the deprecation messages. """ # Adapted from http://wiki.python.org/moin/PythonDecoratorLibrary, @@ -373,9 +374,9 @@ def _is_deprecated(func): class BaseEstimator(object): - """Base class for most objects in POT - adapted from sklearn BaseEstimator class + + Code adapted from sklearn BaseEstimator class Notes ----- @@ -417,7 +418,7 @@ class BaseEstimator(object): Parameters ---------- - deep : boolean, optional + deep : bool, optional If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -487,3 +488,11 @@ class BaseEstimator(object): (key, self.__class__.__name__)) setattr(self, key, value) return self + + +class UndefinedParameter(Exception): + """ + Aim at raising an Exception when a undefined parameter is called + + """ + pass |