Diffstat (limited to 'ot')
-rw-r--r--   ot/__init__.py                    32
-rw-r--r--   ot/bregman.py                    570
-rw-r--r--   ot/da.py                         674
-rw-r--r--   ot/datasets.py                    23
-rw-r--r--   ot/dr.py                           4
-rw-r--r--   ot/gpu/__init__.py                 5
-rw-r--r--   ot/gromov.py                     206
-rw-r--r--   ot/lp/EMD.h                        2
-rw-r--r--   ot/lp/EMD_wrapper.cpp              9
-rw-r--r--   ot/lp/__init__.py                247
-rw-r--r--   ot/lp/emd_wrap.pyx                27
-rw-r--r--   ot/lp/network_simplex_simple.h     7
-rw-r--r--   ot/optim.py                        8
-rwxr-xr-x   ot/partial.py                   1062
-rw-r--r--   ot/plot.py                         3
-rw-r--r--   ot/smooth.py                       4
-rw-r--r--   ot/unbalanced.py                 107
-rw-r--r--   ot/utils.py                       28
18 files changed, 2712 insertions, 306 deletions
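The headline changes in this patch are the new screened Sinkhorn solver (screenkhorn) and the JCPOT barycenter (jcpot_barycenter) in ot/bregman.py, Laplacian-regularized OT (emd_laplace, EMDLaplaceTransport) and JCPOTTransport in ot/da.py, and the new ot/partial.py module. A minimal usage sketch of the screenkhorn entry point added below; the toy data, budgets, and regularization value are illustrative assumptions, not part of the patch:

    import numpy as np
    import ot

    # toy empirical distributions and a squared-Euclidean cost matrix
    rng = np.random.RandomState(0)
    xs = rng.randn(50, 2)            # 50 source samples
    xt = rng.randn(40, 2) + 1        # 40 target samples
    a, b = ot.unif(50), ot.unif(40)  # uniform weights
    M = ot.dist(xs, xt)              # 'sqeuclidean' by default

    # screen the problem down to a budget of 25 source and 20 target points
    G = ot.bregman.screenkhorn(a, b, M, reg=1e-1, ns_budget=25, nt_budget=20)
    assert G.shape == (50, 40)       # a full-size, renormalized coupling is returned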
diff --git a/ot/__init__.py b/ot/__init__.py index 89c7936..0e6e2e2 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -1,34 +1,5 @@ """ -This is the main module of the POT toolbox. It provides easy access to -a number of sub-modules and functions described below. - -.. note:: - - - Here is a list of the submodules and short description of what they contain. - - - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. - - :any:`ot.bregman` contains OT solvers for the entropic OT problems using - Bregman projections. - - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. - - :any:`ot.smooth` contains OT solvers for the regularized (l2 and kl) smooth OT - problems. - - :any:`ot.gromov` contains solvers for Gromov-Wasserstein and Fused Gromov - Wasserstein problems. - - :any:`ot.optim` contains generic solvers OT based optimization problems - - :any:`ot.da` contains classes and function related to Monge mapping - estimation and Domain Adaptation (DA). - - :any:`ot.gpu` contains GPU (cupy) implementation of some OT solvers - - :any:`ot.dr` contains Dimension Reduction (DR) methods such as Wasserstein - Discriminant Analysis. - - :any:`ot.utils` contains utility functions such as distance computation and - timing. - - :any:`ot.datasets` contains toy dataset generation functions. - - :any:`ot.plot` contains visualization functions - - :any:`ot.stochastic` contains stochastic solvers for regularized OT. - - :any:`ot.unbalanced` contains solvers for regularized unbalanced OT. - .. warning:: The list of automatically imported sub-modules is as follows: :py:mod:`ot.lp`, :py:mod:`ot.bregman`, :py:mod:`ot.optim` @@ -61,6 +32,7 @@ from . import gromov from . import smooth from . import stochastic from . import unbalanced +from . import partial # OT functions from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d @@ -71,7 +43,7 @@ from .da import sinkhorn_lpl1_mm # utils functions from .utils import dist, unif, tic, toc, toq -__version__ = "0.6.0" +__version__ = "0.7.0" __all__ = ['emd', 'emd2', 'emd_1d', 'sinkhorn', 'sinkhorn2', 'utils', 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', diff --git a/ot/bregman.py b/ot/bregman.py index 2cd832b..f1f8437 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Bregman projections for regularized OT +Bregman projections solvers for entropic regularized OT """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -8,12 +8,16 @@ Bregman projections for regularized OT # Kilian Fatras <kilian.fatras@irisa.fr> # Titouan Vayer <titouan.vayer@irisa.fr> # Hicham Janati <hicham.janati@inria.fr> +# Mokhtar Z. 
Alaya <mokhtarzahdi.alaya@gmail.com> +# Alexander Tong <alexander.tong@yale.edu> +# Ievgen Redko <ievgen.redko@univ-st-etienne.fr> # # License: MIT License import numpy as np import warnings from .utils import unif, dist +from scipy.optimize import fmin_l_bfgs_b def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, @@ -536,12 +540,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, old_v = v[i_2] v[i_2] = b[i_2] / (K[:, i_2].T.dot(u)) G[:, i_2] = u * K[:, i_2] * v[i_2] - #aviol = (G@one_m - a) - #aviol_2 = (G.T@one_n - b) + # aviol = (G@one_m - a) + # aviol_2 = (G.T@one_n - b) viol += (-old_v + v[i_2]) * K[:, i_2] * u viol_2[i_2] = v[i_2] * K[:, i_2].dot(u) - b[i_2] - #print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2))) + # print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2))) if stopThr_val <= stopThr: break @@ -905,11 +909,6 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, else: alpha, beta = warmstart - def get_K(alpha, beta): - """log space computation""" - return np.exp(-(M - alpha.reshape((dim_a, 1)) - - beta.reshape((1, dim_b))) / reg) - # print(np.min(K)) def get_reg(n): # exponential decreasing return (epsilon0 - reg) * np.exp(-n) + reg @@ -937,7 +936,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, # the 10th iterations transp = G err = np.linalg.norm( - (np.sum(transp, axis=0) - b))**2 + np.linalg.norm((np.sum(transp, axis=1) - a))**2 + (np.sum(transp, axis=0) - b)) ** 2 + np.linalg.norm((np.sum(transp, axis=1) - a)) ** 2 if log: log['err'].append(err) @@ -963,7 +962,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, def geometricBar(weights, alldistribT): """return the weighted geometric mean of distributions""" - assert(len(weights) == alldistribT.shape[1]) + assert (len(weights) == alldistribT.shape[1]) return np.exp(np.dot(np.log(alldistribT), weights.T)) @@ -1037,11 +1036,13 @@ def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000, """ if method.lower() == 'sinkhorn': - return barycenter_sinkhorn(A, M, reg, numItermax=numItermax, + return barycenter_sinkhorn(A, M, reg, weights=weights, + numItermax=numItermax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) elif method.lower() == 'sinkhorn_stabilized': - return barycenter_stabilized(A, M, reg, numItermax=numItermax, + return barycenter_stabilized(A, M, reg, weights=weights, + numItermax=numItermax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) else: @@ -1103,7 +1104,7 @@ def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000, if weights is None: weights = np.ones(A.shape[1]) / A.shape[1] else: - assert(len(weights) == A.shape[1]) + assert (len(weights) == A.shape[1]) if log: log = {'err': []} @@ -1201,7 +1202,7 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000, if weights is None: weights = np.ones(n_hists) / n_hists else: - assert(len(weights) == A.shape[1]) + assert (len(weights) == A.shape[1]) if log: log = {'err': []} @@ -1329,7 +1330,7 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, if weights is None: weights = np.ones(A.shape[0]) / A.shape[0] else: - assert(len(weights) == A.shape[0]) + assert (len(weights) == A.shape[0]) if log: log = {'err': []} @@ -1342,12 +1343,17 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, err = 1 # build the convolution operator + # this is equivalent to blurring on horizontal then vertical directions t = np.linspace(0, 
1, A.shape[1]) [Y, X] = np.meshgrid(t, t) - xi1 = np.exp(-(X - Y)**2 / reg) + xi1 = np.exp(-(X - Y) ** 2 / reg) + + t = np.linspace(0, 1, A.shape[2]) + [Y, X] = np.meshgrid(t, t) + xi2 = np.exp(-(X - Y) ** 2 / reg) def K(x): - return np.dot(np.dot(xi1, x), xi1) + return np.dot(np.dot(xi1, x), xi2) while (err > stopThr and cpt < numItermax): @@ -1492,6 +1498,164 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, return np.sum(K0, axis=1) +def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, + stopThr=1e-6, verbose=False, log=False, **kwargs): + r'''Joint OT and proportion estimation for multi-source target shift as proposed in [27] + + The function solves the following optimization problem: + + .. math:: + + \mathbf{h} = arg\min_{\mathbf{h}}\quad \sum_{k=1}^{K} \lambda_k + W_{reg}((\mathbf{D}_2^{(k)} \mathbf{h})^T, \mathbf{a}) + + s.t. \ \forall k, \mathbf{D}_1^{(k)} \gamma_k \mathbf{1}_n= \mathbf{h} + + where : + + - :math:`\lambda_k` is the weight of k-th source domain + - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance (see ot.bregman.sinkhorn) + - :math:`\mathbf{D}_2^{(k)}` is a matrix of weights related to k-th source domain defined as in [p. 5, 27], its expected shape is `(n_k, C)` where `n_k` is the number of elements in the k-th source domain and `C` is the number of classes + - :math:`\mathbf{h}` is a vector of estimated proportions in the target domain of size C + - :math:`\mathbf{a}` is a uniform vector of weights in the target domain of size `n` + - :math:`\mathbf{D}_1^{(k)}` is a matrix of class assignments defined as in [p. 5, 27], its expected shape is `(n_k, C)` + + The problem consist in solving a Wasserstein barycenter problem to estimate the proportions :math:`\mathbf{h}` in the target domain. + + The algorithm used for solving the problem is the Iterative Bregman projections algorithm + with two sets of marginal constraints related to the unknown vector :math:`\mathbf{h}` and uniform target distribution. + + Parameters + ---------- + Xs : list of K np.ndarray(nsk,d) + features of all source domains' samples + Ys : list of K np.ndarray(nsk,) + labels of all source domains' samples + Xt : np.ndarray (nt,d) + samples in the target domain + reg : float + Regularization term > 0 + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshold on relative change in the barycenter (>0) + log : bool, optional + record log if True + verbose : bool, optional (default=False) + Controls the verbosity of the optimization algorithm + + Returns + ------- + h : (C,) ndarray + proportion estimation in the target domain + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. 
+ + ''' + nbclasses = len(np.unique(Ys[0])) + nbdomains = len(Xs) + + # log dictionary + if log: + log = {'niter': 0, 'err': [], 'M': [], 'D1': [], 'D2': [], 'gamma': []} + + K = [] + M = [] + D1 = [] + D2 = [] + + # For each source domain, build cost matrices M, Gibbs kernels K and corresponding matrices D_1 and D_2 + for d in range(nbdomains): + dom = {} + nsk = Xs[d].shape[0] # get number of elements for this domain + dom['nbelem'] = nsk + classes = np.unique(Ys[d]) # get number of classes for this domain + + # format classes to start from 0 for convenience + if np.min(classes) != 0: + Ys[d] = Ys[d] - np.min(classes) + classes = np.unique(Ys[d]) + + # build the corresponding D_1 and D_2 matrices + Dtmp1 = np.zeros((nbclasses, nsk)) + Dtmp2 = np.zeros((nbclasses, nsk)) + + for c in classes: + nbelemperclass = np.sum(Ys[d] == c) + if nbelemperclass != 0: + Dtmp1[int(c), Ys[d] == c] = 1. + Dtmp2[int(c), Ys[d] == c] = 1. / (nbelemperclass) + D1.append(Dtmp1) + D2.append(Dtmp2) + + # build the cost matrix and the Gibbs kernel + Mtmp = dist(Xs[d], Xt, metric=metric) + M.append(Mtmp) + + Ktmp = np.empty(Mtmp.shape, dtype=Mtmp.dtype) + np.divide(Mtmp, -reg, out=Ktmp) + np.exp(Ktmp, out=Ktmp) + K.append(Ktmp) + + # uniform target distribution + a = unif(np.shape(Xt)[0]) + + cpt = 0 # iterations count + err = 1 + old_bary = np.ones((nbclasses)) + + while (err > stopThr and cpt < numItermax): + + bary = np.zeros((nbclasses)) + + # update coupling matrices for marginal constraints w.r.t. uniform target distribution + for d in range(nbdomains): + K[d] = projC(K[d], a) + other = np.sum(K[d], axis=1) + bary = bary + np.log(np.dot(D1[d], other)) / nbdomains + + bary = np.exp(bary) + + # update coupling matrices for marginal constraints w.r.t. unknown proportions based on [Prop 4., 27] + for d in range(nbdomains): + new = np.dot(D2[d].T, bary) + K[d] = projR(K[d], new) + + err = np.linalg.norm(bary - old_bary) + cpt = cpt + 1 + old_bary = bary + + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + bary = bary / np.sum(bary) + + if log: + log['niter'] = cpt + log['M'] = M + log['D1'] = D1 + log['D2'] = D2 + log['gamma'] = K + return bary, log + else: + return bary + + def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): @@ -1583,7 +1747,8 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', return pi -def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): +def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, + verbose=False, log=False, **kwargs): r''' Solve the entropic regularization optimal transport problem from empirical data and return the OT loss @@ -1665,14 +1830,17 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num M = dist(X_s, X_t, metric=metric) if log: - sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, + **kwargs) return sinkhorn_loss, log else: - sinkhorn_loss = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + sinkhorn_loss = sinkhorn2(a, b, 
M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, + **kwargs) return sinkhorn_loss -def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): +def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, + verbose=False, log=False, **kwargs): r''' Compute the sinkhorn divergence loss from empirical data @@ -1758,11 +1926,14 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli .. [23] Aude Genevay, Gabriel Peyré, Marco Cuturi, Learning Generative Models with Sinkhorn Divergences, Proceedings of the Twenty-First International Conference on Artficial Intelligence and Statistics, (AISTATS) 21, 2018 ''' if log: - sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) - sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) - sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) @@ -1777,11 +1948,354 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli return max(0, sinkhorn_div), log else: - sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) - sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) - sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) return max(0, sinkhorn_div) + + +def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, restricted=True, + maxiter=10000, maxfun=10000, pgtol=1e-09, verbose=False, log=False): + r"""" + Screening Sinkhorn Algorithm for Regularized Optimal Transport + + The function solves an approximate dual of Sinkhorn divergence [2] which is written as the following optimization problem: + + ..math:: + (u, v) = \argmin_{u, v} 1_{ns}^T B(u,v) 1_{nt} - <\kappa u, a> - <v/\kappa, b> + + where B(u,v) = \diag(e^u) K \diag(e^v), with K = e^{-M/reg} and + + s.t. 
e^{u_i} \geq \epsilon / \kappa, for all i \in {1, ..., ns} + + e^{v_j} \geq \epsilon \kappa, for all j \in {1, ..., nt} + + The parameters \kappa and \epsilon are determined w.r.t. the given number budget of points (ns_budget, nt_budget), see Equation (5) in [26] + + + Parameters + ---------- + a : `numpy.ndarray`, shape=(ns,) + samples weights in the source domain + + b : `numpy.ndarray`, shape=(nt,) + samples weights in the target domain + + M : `numpy.ndarray`, shape=(ns, nt) + Cost matrix + + reg : `float` + Level of the entropy regularisation + + ns_budget : `int`, default=None + Number budget of points to be kept in the source domain. + If None, 50% of the source sample points will be kept + + nt_budget : `int`, default=None + Number budget of points to be kept in the target domain. + If None, 50% of the target sample points will be kept + + uniform : `bool`, default=False + If `True`, the source and target distributions are assumed to be uniform, i.e., a_i = 1 / ns and b_j = 1 / nt + + restricted : `bool`, default=True + If `True`, a warm-start initialization is used for the L-BFGS-B solver, + computed with a restricted Sinkhorn algorithm of at most 5 iterations + + maxiter : `int`, default=10000 + Maximum number of iterations in the L-BFGS-B solver + + maxfun : `int`, default=10000 + Maximum number of function evaluations in the L-BFGS-B solver + + pgtol : `float`, default=1e-09 + Final objective function accuracy in the L-BFGS-B solver + + verbose : `bool`, default=False + If `True`, display information about the cardinalities of the active sets and the parameters kappa + and epsilon + + Dependency + ---------- + To gain more efficiency, screenkhorn needs to call the "Bottleneck" package (https://pypi.org/project/Bottleneck/) + in the screening pre-processing step. If Bottleneck isn't installed, the following warning appears: + "Bottleneck module is not installed. Install it from https://pypi.org/project/Bottleneck/ for better performance." + + + Returns + ------- + gamma : `numpy.ndarray`, shape=(ns, nt) + Screened optimal transportation matrix for the given parameters + + log : `dict`, default=False + Log dictionary returned only if log==True in parameters + + + References + ---------- + .. [26] Alaya M. Z., Bérar M., Gasso G., Rakotomamonjy A. (2019). Screening Sinkhorn Algorithm for Regularized Optimal Transport (NIPS) 33, 2019 + + """ + # check if the bottleneck module exists + try: + import bottleneck + except ImportError: + warnings.warn( + "Bottleneck module is not installed.
Install it from https://pypi.org/project/Bottleneck/ for better performance.") + bottleneck = np + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + ns, nt = M.shape + + # by default, we keep only 50% of the sample data points + if ns_budget is None: + ns_budget = int(np.floor(0.5 * ns)) + if nt_budget is None: + nt_budget = int(np.floor(0.5 * nt)) + + # calculate the Gibbs kernel + K = np.empty_like(M) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + def projection(u, epsilon): + u[u <= epsilon] = epsilon + return u + + # ----------------------------------------------------------------------------------------------------------------# + # Step 1: Screening pre-processing # + # ----------------------------------------------------------------------------------------------------------------# + + if ns_budget == ns and nt_budget == nt: + # full number of budget points (ns, nt) = (ns_budget, nt_budget) + Isel = np.ones(ns, dtype=bool) + Jsel = np.ones(nt, dtype=bool) + epsilon = 0.0 + kappa = 1.0 + + cst_u = 0. + cst_v = 0. + + bounds_u = [(0.0, np.inf)] * ns + bounds_v = [(0.0, np.inf)] * nt + + a_I = a + b_J = b + K_IJ = K + K_IJc = [] + K_IcJ = [] + + vec_eps_IJc = np.zeros(nt) + vec_eps_IcJ = np.zeros(ns) + + else: + # sum of rows and columns of K + K_sum_cols = K.sum(axis=1) + K_sum_rows = K.sum(axis=0) + + if uniform: + if ns / ns_budget < 4: + aK_sort = np.sort(K_sum_cols) + epsilon_u_square = a[0] / aK_sort[ns_budget - 1] + else: + aK_sort = bottleneck.partition(K_sum_cols, ns_budget - 1)[ns_budget - 1] + epsilon_u_square = a[0] / aK_sort + + if nt / nt_budget < 4: + bK_sort = np.sort(K_sum_rows) + epsilon_v_square = b[0] / bK_sort[nt_budget - 1] + else: + bK_sort = bottleneck.partition(K_sum_rows, nt_budget - 1)[nt_budget - 1] + epsilon_v_square = b[0] / bK_sort + else: + aK = a / K_sum_cols + bK = b / K_sum_rows + + aK_sort = np.sort(aK)[::-1] + epsilon_u_square = aK_sort[ns_budget - 1] + + bK_sort = np.sort(bK)[::-1] + epsilon_v_square = bK_sort[nt_budget - 1] + + # active sets I and J (see Lemma 1 in [26]) + Isel = a >= epsilon_u_square * K_sum_cols + Jsel = b >= epsilon_v_square * K_sum_rows + + if sum(Isel) != ns_budget: + if uniform: + aK = a / K_sum_cols + aK_sort = np.sort(aK)[::-1] + epsilon_u_square = aK_sort[ns_budget - 1:ns_budget + 1].mean() + Isel = a >= epsilon_u_square * K_sum_cols + ns_budget = sum(Isel) + + if sum(Jsel) != nt_budget: + if uniform: + bK = b / K_sum_rows + bK_sort = np.sort(bK)[::-1] + epsilon_v_square = bK_sort[nt_budget - 1:nt_budget + 1].mean() + Jsel = b >= epsilon_v_square * K_sum_rows + nt_budget = sum(Jsel) + + epsilon = (epsilon_u_square * epsilon_v_square) ** (1 / 4) + kappa = (epsilon_v_square / epsilon_u_square) ** (1 / 2) + + if verbose: + print("epsilon = %s\n" % epsilon) + print("kappa = %s\n" % kappa) + print('Cardinality of selected points: |Isel| = %s \t |Jsel| = %s \n' % (sum(Isel), sum(Jsel))) + + # Ic, Jc: complementary of the active sets I and J + Ic = ~Isel + Jc = ~Jsel + + K_IJ = K[np.ix_(Isel, Jsel)] + K_IcJ = K[np.ix_(Ic, Jsel)] + K_IJc = K[np.ix_(Isel, Jc)] + + K_min = K_IJ.min() + if K_min == 0: + K_min = np.finfo(float).tiny + + # a_I, b_J, a_Ic, b_Jc + a_I = a[Isel] + b_J = b[Jsel] + if not uniform: + a_I_min = a_I.min() + a_I_max = a_I.max() + b_J_max = b_J.max() + b_J_min = b_J.min() + else: + a_I_min = a_I[0] + a_I_max = a_I[0] + b_J_max = b_J[0] + b_J_min = b_J[0] + + # box constraints in L-BFGS-B (see Proposition 1 in [26]) + bounds_u = [(max(a_I_min / 
((nt - nt_budget) * epsilon + nt_budget * (b_J_max / ( + ns * epsilon * kappa * K_min))), epsilon / kappa), a_I_max / (nt * epsilon * K_min))] * ns_budget + + bounds_v = [( + max(b_J_min / ((ns - ns_budget) * epsilon + ns_budget * (kappa * a_I_max / (nt * epsilon * K_min))), + epsilon * kappa), b_J_max / (ns * epsilon * K_min))] * nt_budget + + # pre-calculated constants for the objective + vec_eps_IJc = epsilon * kappa * (K_IJc * np.ones(nt - nt_budget).reshape((1, -1))).sum(axis=1) + vec_eps_IcJ = (epsilon / kappa) * (np.ones(ns - ns_budget).reshape((-1, 1)) * K_IcJ).sum(axis=0) + + # initialisation + u0 = np.full(ns_budget, (1. / ns_budget) + epsilon / kappa) + v0 = np.full(nt_budget, (1. / nt_budget) + epsilon * kappa) + + # pre-calculed constants for Restricted Sinkhorn (see Algorithm 1 in supplementary of [26]) + if restricted: + if ns_budget != ns or nt_budget != nt: + cst_u = kappa * epsilon * K_IJc.sum(axis=1) + cst_v = epsilon * K_IcJ.sum(axis=0) / kappa + + cpt = 1 + while cpt < 5: # 5 iterations + K_IJ_v = np.dot(K_IJ.T, u0) + cst_v + v0 = b_J / (kappa * K_IJ_v) + KIJ_u = np.dot(K_IJ, v0) + cst_u + u0 = (kappa * a_I) / KIJ_u + cpt += 1 + + u0 = projection(u0, epsilon / kappa) + v0 = projection(v0, epsilon * kappa) + + else: + u0 = u0 + v0 = v0 + + def restricted_sinkhorn(usc, vsc, max_iter=5): + """ + Restricted Sinkhorn Algorithm as a warm-start initialized point for L-BFGS-B (see Algorithm 1 in supplementary of [26]) + """ + cpt = 1 + while cpt < max_iter: + K_IJ_v = np.dot(K_IJ.T, usc) + cst_v + vsc = b_J / (kappa * K_IJ_v) + KIJ_u = np.dot(K_IJ, vsc) + cst_u + usc = (kappa * a_I) / KIJ_u + cpt += 1 + + usc = projection(usc, epsilon / kappa) + vsc = projection(vsc, epsilon * kappa) + + return usc, vsc + + def screened_obj(usc, vsc): + part_IJ = np.dot(np.dot(usc, K_IJ), vsc) - kappa * np.dot(a_I, np.log(usc)) - (1. / kappa) * np.dot(b_J, + np.log(vsc)) + part_IJc = np.dot(usc, vec_eps_IJc) + part_IcJ = np.dot(vec_eps_IcJ, vsc) + psi_epsilon = part_IJ + part_IJc + part_IcJ + return psi_epsilon + + def screened_grad(usc, vsc): + # gradients of Psi_(kappa,epsilon) w.r.t u and v + grad_u = np.dot(K_IJ, vsc) + vec_eps_IJc - kappa * a_I / usc + grad_v = np.dot(K_IJ.T, usc) + vec_eps_IcJ - (1. 
/ kappa) * b_J / vsc + return grad_u, grad_v + + def bfgspost(theta): + u = theta[:ns_budget] + v = theta[ns_budget:] + # objective + f = screened_obj(u, v) + # gradient + g_u, g_v = screened_grad(u, v) + g = np.hstack([g_u, g_v]) + return f, g + + # ----------------------------------------------------------------------------------------------------------------# + # Step 2: L-BFGS-B solver # + # ----------------------------------------------------------------------------------------------------------------# + + u0, v0 = restricted_sinkhorn(u0, v0) + theta0 = np.hstack([u0, v0]) + + bounds = bounds_u + bounds_v # constraint bounds + + def obj(theta): + return bfgspost(theta) + + theta, _, _ = fmin_l_bfgs_b(func=obj, + x0=theta0, + bounds=bounds, + maxfun=maxfun, + pgtol=pgtol, + maxiter=maxiter) + + usc = theta[:ns_budget] + vsc = theta[ns_budget:] + + usc_full = np.full(ns, epsilon / kappa) + vsc_full = np.full(nt, epsilon * kappa) + usc_full[Isel] = usc + vsc_full[Jsel] = vsc + + if log: + log = {} + log['u'] = usc_full + log['v'] = vsc_full + log['Isel'] = Isel + log['Jsel'] = Jsel + + gamma = usc_full[:, None] * K * vsc_full[None, :] + gamma = gamma / gamma.sum() + + if log: + return gamma, log + else: + return gamma @@ -7,15 +7,16 @@ Domain adaptation with optimal transport # Nicolas Courty <ncourty@irisa.fr> # Michael Perrot <michael.perrot@univ-st-etienne.fr> # Nathalie Gayraud <nat.gayraud@gmail.com> +# Ievgen Redko <ievgen.redko@univ-st-etienne.fr> # # License: MIT License import numpy as np import scipy.linalg as linalg -from .bregman import sinkhorn +from .bregman import sinkhorn, jcpot_barycenter from .lp import emd -from .utils import unif, dist, kernel, cost_normalization +from .utils import unif, dist, kernel, cost_normalization, label_normalization, laplacian, dots from .utils import check_params, BaseEstimator from .unbalanced import sinkhorn_unbalanced from .optim import cg @@ -127,7 +128,7 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, W = np.ones(M.shape) for (i, c) in enumerate(classes): majs = np.sum(transp[indices_labels[i]], axis=0) - majs = p * ((majs + epsilon)**(p - 1)) + majs = p * ((majs + epsilon) ** (p - 1)) W[indices_labels[i]] = majs return transp @@ -359,8 +360,8 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, def loss(L, G): """Compute full loss""" - return np.sum((xs1.dot(L) - ns * G.dot(xt))**2) + mu * \ - np.sum(G * M) + eta * np.sum(sel(L - I0)**2) + return np.sum((xs1.dot(L) - ns * G.dot(xt)) ** 2) + mu * \ + np.sum(G * M) + eta * np.sum(sel(L - I0) ** 2) def solve_L(G): """ solve L problem with fixed G (least square)""" @@ -372,10 +373,11 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, xsi = xs1.dot(L) def f(G): - return np.sum((xsi - ns * G.dot(xt))**2) + return np.sum((xsi - ns * G.dot(xt)) ** 2) def df(G): return -2 * ns * (xsi - ns * G.dot(xt)).dot(xt.T) + G = cg(a, b, M, 1.0 / mu, f, df, G0=G0, numItermax=numInnerItermax, stopThr=stopInnerThr) return G @@ -562,7 +564,7 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', def loss(L, G): """Compute full loss""" - return np.sum((K1.dot(L) - ns * G.dot(xt))**2) + mu * \ + return np.sum((K1.dot(L) - ns * G.dot(xt)) ** 2) + mu * \ np.sum(G * M) + eta * np.trace(L.T.dot(Kreg).dot(L)) def solve_L_nobias(G): @@ -580,10 +582,11 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', xsi = K1.dot(L) def f(G): - return np.sum((xsi - ns * G.dot(xt))**2) + return 
np.sum((xsi - ns * G.dot(xt)) ** 2) def df(G): return -2 * ns * (xsi - ns * G.dot(xt)).dot(xt.T) + G = cg(a, b, M, 1.0 / mu, f, df, G0=G0, numItermax=numInnerItermax, stopThr=stopInnerThr) return G @@ -745,6 +748,139 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, return A, b +def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, alpha=.5, + numItermax=100, stopThr=1e-9, numInnerItermax=100000, + stopInnerThr=1e-9, log=False, verbose=False): + r"""Solve the optimal transport problem (OT) with Laplacian regularization + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + eta\Omega_\alpha(\gamma) + + s.t.\ \gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + + where: + + - a and b are source and target weights (sum to 1) + - xs and xt are source and target samples + - M is the (ns,nt) metric cost matrix + - :math:`\Omega_\alpha` is the Laplacian regularization term + :math:`\Omega_\alpha = (1-\alpha)/n_s^2\sum_{i,j}S^s_{i,j}\|T(\mathbf{x}^s_i)-T(\mathbf{x}^s_j)\|^2+\alpha/n_t^2\sum_{i,j}S^t_{i,j}^'\|T(\mathbf{x}^t_i)-T(\mathbf{x}^t_j)\|^2` + with :math:`S^s_{i,j}, S^t_{i,j}` denoting source and target similarity matrices and :math:`T(\cdot)` being a barycentric mapping + + The algorithm used for solving the problem is the conditional gradient algorithm as proposed in [5]. + + Parameters + ---------- + a : np.ndarray (ns,) + samples weights in the source domain + b : np.ndarray (nt,) + samples weights in the target domain + xs : np.ndarray (ns,d) + samples in the source domain + xt : np.ndarray (nt,d) + samples in the target domain + M : np.ndarray (ns,nt) + loss matrix + sim : string, optional + Type of similarity ('knn' or 'gauss') used to construct the Laplacian. + sim_param : int or float, optional + Parameter (number of the nearest neighbors for sim='knn' + or bandwidth for sim='gauss') used to compute the Laplacian. + reg : string + Type of Laplacian regularization + eta : float + Regularization term for Laplacian regularization + alpha : float + Regularization term for source domain's importance in regularization + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshold on error (inner emd solver) (>0) + numInnerItermax : int, optional + Max number of iterations (inner CG solver) + stopInnerThr : float, optional + Stop threshold on error (inner CG solver) (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + Returns + ------- + gamma : (ns x nt) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, + "Optimal Transport for Domain Adaptation," in IEEE + Transactions on Pattern Analysis and Machine Intelligence , + vol.PP, no.99, pp.1-1 + .. [30] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, + "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + + See Also + -------- + ot.lp.emd : Unregularized OT + ot.optim.cg : General regularized OT + + """ + if not isinstance(sim_param, (int, float, type(None))): + raise ValueError( + 'Similarity parameter should be an int or a float. 
Got {type} instead.'.format(type=type(sim_param).__name__)) + + if sim == 'gauss': + if sim_param is None: + sim_param = 1 / (2 * (np.mean(dist(xs, xs, 'sqeuclidean')) ** 2)) + sS = kernel(xs, xs, method=sim, sigma=sim_param) + sT = kernel(xt, xt, method=sim, sigma=sim_param) + + elif sim == 'knn': + if sim_param is None: + sim_param = 3 + + from sklearn.neighbors import kneighbors_graph + + sS = kneighbors_graph(X=xs, n_neighbors=int(sim_param)).toarray() + sS = (sS + sS.T) / 2 + sT = kneighbors_graph(xt, n_neighbors=int(sim_param)).toarray() + sT = (sT + sT.T) / 2 + else: + raise ValueError('Unknown similarity type {sim}. Currently supported similarity types are "knn" and "gauss".'.format(sim=sim)) + + lS = laplacian(sS) + lT = laplacian(sT) + + def f(G): + return alpha * np.trace(np.dot(xt.T, np.dot(G.T, np.dot(lS, np.dot(G, xt))))) \ + + (1 - alpha) * np.trace(np.dot(xs.T, np.dot(G, np.dot(lT, np.dot(G.T, xs))))) + + ls2 = lS + lS.T + lt2 = lT + lT.T + xt2 = np.dot(xt, xt.T) + + if reg == 'disp': + Cs = -eta * alpha / xs.shape[0] * dots(ls2, xs, xt.T) + Ct = -eta * (1 - alpha) / xt.shape[0] * dots(xs, xt.T, lt2) + M = M + Cs + Ct + + def df(G): + return alpha * np.dot(ls2, np.dot(G, xt2))\ + + (1 - alpha) * np.dot(xs, np.dot(xs.T, np.dot(G, lt2))) + + return cg(a, b, M, reg=eta, f=f, df=df, G0=None, numItermax=numItermax, numItermaxEmd=numInnerItermax, + stopThr=stopThr, stopThr2=stopInnerThr, verbose=verbose, log=log) + + def distribution_estimation_uniform(X): """estimates a uniform distribution from an array of samples X @@ -772,7 +908,8 @@ class BaseTransport(BaseEstimator): at the class level in their ``__init__`` as explicit keyword arguments (no ``*args`` or ``**kwargs``). - fit method should: + the fit method should: + - estimate a cost matrix and store it in a `cost_` attribute - estimate a coupling matrix and store it in a `coupling_` attribute @@ -783,6 +920,9 @@ class BaseTransport(BaseEstimator): transform method should always get as input a Xs parameter inverse_transform method should always get as input a Xt parameter + + transform_labels method should always get as input a ys parameter + inverse_transform_labels method should always get as input a yt parameter """ def fit(self, Xs=None, ys=None, Xt=None, yt=None): @@ -794,7 +934,7 @@ class BaseTransport(BaseEstimator): Xs : array-like, shape (n_source_samples, n_features) The training input samples. ys : array-like, shape (n_source_samples,) - The class labels + The training class labels Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) @@ -855,7 +995,7 @@ class BaseTransport(BaseEstimator): Xs : array-like, shape (n_source_samples, n_features) The training input samples. ys : array-like, shape (n_source_samples,) - The class labels + The class labels for training samples Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) @@ -879,13 +1019,13 @@ class BaseTransport(BaseEstimator): Parameters ---------- Xs : array-like, shape (n_source_samples, n_features) - The training input samples. + The source input samples. ys : array-like, shape (n_source_samples,) - The class labels + The class labels for source samples Xt : array-like, shape (n_target_samples, n_features) - The training input samples. + The target input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the + The class labels for target. 
If some target samples are unlabeled, fill the yt's elements with -1. Warning: Note that, due to this convention -1 cannot be used as a @@ -921,7 +1061,6 @@ class BaseTransport(BaseEstimator): transp_Xs = [] for bi in batch_ind: - # get the nearest neighbor in the source domain D0 = dist(Xs[bi], self.xs_) idx = np.argmin(D0, axis=1) @@ -941,20 +1080,64 @@ class BaseTransport(BaseEstimator): return transp_Xs + def transform_labels(self, ys=None): + """Propagate source labels ys to obtain estimated target labels as in [27] + + Parameters + ---------- + ys : array-like, shape (n_source_samples,) + The source class labels + + Returns + ------- + transp_ys : array-like, shape (n_target_samples, nb_classes) + Estimated soft target labels. + + References + ---------- + + .. [27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. + + """ + + # check the necessary inputs parameters are here + if check_params(ys=ys): + + ysTemp = label_normalization(np.copy(ys)) + classes = np.unique(ysTemp) + n = len(classes) + D1 = np.zeros((n, len(ysTemp))) + + # perform label propagation + transp = self.coupling_ / np.sum(self.coupling_, 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + for c in classes: + D1[int(c), ysTemp == c] = 1 + + # compute propagated labels + transp_ys = np.dot(D1, transp) + + return transp_ys.T + def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports target samples Xt onto target samples Xs + """Transports target samples Xt onto source samples Xs Parameters ---------- Xs : array-like, shape (n_source_samples, n_features) - The training input samples. + The source input samples. ys : array-like, shape (n_source_samples,) - The class labels + The source class labels Xt : array-like, shape (n_target_samples, n_features) - The training input samples. + The target input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the + The target class labels. If some target samples are unlabeled, fill the yt's elements with -1. Warning: Note that, due to this convention -1 cannot be used as a @@ -990,7 +1173,6 @@ class BaseTransport(BaseEstimator): transp_Xt = [] for bi in batch_ind: - D0 = dist(Xt[bi], self.xt_) idx = np.argmin(D0, axis=1) @@ -1009,6 +1191,41 @@ class BaseTransport(BaseEstimator): return transp_Xt + def inverse_transform_labels(self, yt=None): + """Propagate target labels yt to obtain estimated source labels ys + + Parameters + ---------- + yt : array-like, shape (n_target_samples,) + + Returns + ------- + transp_ys : array-like, shape (n_source_samples, nb_classes) + Estimated soft source labels. 
+ """ + + # check the necessary inputs parameters are here + if check_params(yt=yt): + + ytTemp = label_normalization(np.copy(yt)) + classes = np.unique(ytTemp) + n = len(classes) + D1 = np.zeros((n, len(ytTemp))) + + # perform label propagation + transp = self.coupling_ / np.sum(self.coupling_, 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + for c in classes: + D1[int(c), ytTemp == c] = 1 + + # compute propagated samples + transp_ys = np.dot(D1, transp.T) + + return transp_ys.T + class LinearTransport(BaseTransport): """ OT linear operator between empirical distributions @@ -1055,7 +1272,6 @@ class LinearTransport(BaseTransport): def __init__(self, reg=1e-8, bias=True, log=False, distribution_estimation=distribution_estimation_uniform): - self.bias = bias self.log = log self.reg = reg @@ -1136,7 +1352,6 @@ class LinearTransport(BaseTransport): # check the necessary inputs parameters are here if check_params(Xs=Xs): - transp_Xs = Xs.dot(self.A_) + self.B_ return transp_Xs @@ -1170,7 +1385,6 @@ class LinearTransport(BaseTransport): # check the necessary inputs parameters are here if check_params(Xt=Xt): - transp_Xt = Xt.dot(self.A1_) + self.B1_ return transp_Xt @@ -1224,6 +1438,9 @@ class SinkhornTransport(BaseTransport): .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, reg_e=1., max_iter=1000, @@ -1231,7 +1448,6 @@ class SinkhornTransport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=np.infty): - self.reg_e = reg_e self.max_iter = max_iter self.tol = tol @@ -1323,13 +1539,15 @@ class EMDTransport(BaseTransport): .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, metric="sqeuclidean", norm=None, log=False, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=10, max_iter=100000): - self.metric = metric self.norm = norm self.log = log @@ -1431,7 +1649,9 @@ class SinkhornLpl1Transport(BaseTransport): .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. - + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. 
""" def __init__(self, reg_e=1., reg_cl=0.1, @@ -1440,7 +1660,6 @@ class SinkhornLpl1Transport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=np.infty): - self.reg_e = reg_e self.reg_cl = reg_cl self.max_iter = max_iter @@ -1481,7 +1700,6 @@ class SinkhornLpl1Transport(BaseTransport): # check the necessary inputs parameters are here if check_params(Xs=Xs, Xt=Xt, ys=ys): - super(SinkhornLpl1Transport, self).fit(Xs, ys, Xt, yt) returned_ = sinkhorn_lpl1_mm( @@ -1499,6 +1717,127 @@ class SinkhornLpl1Transport(BaseTransport): return self +class EMDLaplaceTransport(BaseTransport): + + """Domain Adapatation OT method based on Earth Mover's Distance with Laplacian regularization + + Parameters + ---------- + reg_type : string optional (default='pos') + Type of the regularization term: 'pos' and 'disp' for + regularization term defined in [2] and [6], respectively. + reg_lap : float, optional (default=1) + Laplacian regularization parameter + reg_src : float, optional (default=0.5) + Source relative importance in regularization + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + similarity : string, optional (default="knn") + The similarity to use either knn or gaussian + similarity_param : int or float, optional (default=None) + Parameter for the similarity: number of nearest neighbors or bandwidth + if similarity="knn" or "gaussian", respectively. If None is provided, + it is set to 3 or the average pairwise squared Euclidean distance, respectively. + max_iter : int, optional (default=100) + Max number of BCD iterations + tol : float, optional (default=1e-5) + Stop threshold on relative loss decrease (>0) + max_inner_iter : int, optional (default=10) + Max number of iterations (inner CG solver) + inner_tol : float, optional (default=1e-6) + Stop threshold on error (inner CG solver) (>0) + log : int, optional (default=False) + Controls the logs of the optimization algorithm + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. + + Attributes + ---------- + coupling_ : array-like, shape (n_source_samples, n_target_samples) + The optimal coupling + + References + ---------- + .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, + "Optimal Transport for Domain Adaptation," in IEEE Transactions + on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, + "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. 
+ """ + + def __init__(self, reg_type='pos', reg_lap=1., reg_src=1., metric="sqeuclidean", + norm=None, similarity="knn", similarity_param=None, max_iter=100, tol=1e-9, + max_inner_iter=100000, inner_tol=1e-9, log=False, verbose=False, + distribution_estimation=distribution_estimation_uniform, + out_of_sample_map='ferradans'): + self.reg = reg_type + self.reg_lap = reg_lap + self.reg_src = reg_src + self.metric = metric + self.norm = norm + self.similarity = similarity + self.sim_param = similarity_param + self.max_iter = max_iter + self.tol = tol + self.max_inner_iter = max_inner_iter + self.inner_tol = inner_tol + self.log = log + self.verbose = verbose + self.distribution_estimation = distribution_estimation + self.out_of_sample_map = out_of_sample_map + + def fit(self, Xs, ys=None, Xt=None, yt=None): + """Build a coupling matrix from source and target sets of samples + (Xs, ys) and (Xt, yt) + + Parameters + ---------- + Xs : array-like, shape (n_source_samples, n_features) + The training input samples. + ys : array-like, shape (n_source_samples,) + The class labels + Xt : array-like, shape (n_target_samples, n_features) + The training input samples. + yt : array-like, shape (n_target_samples,) + The class labels. If some target samples are unlabeled, fill the + yt's elements with -1. + + Warning: Note that, due to this convention -1 cannot be used as a + class label + + Returns + ------- + self : object + Returns self. + """ + + super(EMDLaplaceTransport, self).fit(Xs, ys, Xt, yt) + + returned_ = emd_laplace(a=self.mu_s, b=self.mu_t, xs=self.xs_, + xt=self.xt_, M=self.cost_, sim=self.similarity, sim_param=self.sim_param, reg=self.reg, eta=self.reg_lap, + alpha=self.reg_src, numItermax=self.max_iter, stopThr=self.tol, numInnerItermax=self.max_inner_iter, + stopInnerThr=self.inner_tol, log=self.log, verbose=self.verbose) + + # coupling estimation + if self.log: + self.coupling_, self.log_ = returned_ + else: + self.coupling_ = returned_ + self.log_ = dict() + return self + + class SinkhornL1l2Transport(BaseTransport): """Domain Adapatation OT method based on sinkhorn algorithm + @@ -1554,7 +1893,9 @@ class SinkhornL1l2Transport(BaseTransport): .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. - + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, reg_e=1., reg_cl=0.1, @@ -1563,7 +1904,6 @@ class SinkhornL1l2Transport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=10): - self.reg_e = reg_e self.reg_cl = reg_cl self.max_iter = max_iter @@ -1685,7 +2025,6 @@ class MappingTransport(BaseEstimator): norm=None, kernel="linear", sigma=1, max_iter=100, tol=1e-5, max_inner_iter=10, inner_tol=1e-6, log=False, verbose=False, verbose2=False): - self.metric = metric self.norm = norm self.mu = mu @@ -1848,7 +2187,9 @@ class UnbalancedSinkhornTransport(BaseTransport): .. [1] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. - + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. 
""" def __init__(self, reg_e=1., reg_m=0.1, method='sinkhorn', @@ -1856,7 +2197,6 @@ class UnbalancedSinkhornTransport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=10): - self.reg_e = reg_e self.reg_m = reg_m self.method = method @@ -1914,3 +2254,267 @@ class UnbalancedSinkhornTransport(BaseTransport): self.log_ = dict() return self + + +class JCPOTTransport(BaseTransport): + + """Domain Adapatation OT method for multi-source target shift based on Wasserstein barycenter algorithm. + + Parameters + ---------- + reg_e : float, optional (default=1) + Entropic regularization parameter + max_iter : int, float, optional (default=10) + The minimum number of iteration before stopping the optimization + algorithm if no it has not converged + tol : float, optional (default=10e-9) + Stop threshold on error (inner sinkhorn solver) (>0) + verbose : bool, optional (default=False) + Controls the verbosity of the optimization algorithm + log : bool, optional (default=False) + Controls the logs of the optimization algorithm + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. + + Attributes + ---------- + coupling_ : list of array-like objects, shape K x (n_source_samples, n_target_samples) + A set of optimal couplings between each source domain and the target domain + proportions_ : array-like, shape (n_classes,) + Estimated class proportions in the target domain + log_ : dictionary + The dictionary of log, empty dic if parameter log is not True + + References + ---------- + + .. [1] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), + vol. 89, p.849-858, 2019. + + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. + + + """ + + def __init__(self, reg_e=.1, max_iter=10, + tol=10e-9, verbose=False, log=False, + metric="sqeuclidean", + out_of_sample_map='ferradans'): + self.reg_e = reg_e + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.log = log + self.metric = metric + self.out_of_sample_map = out_of_sample_map + + def fit(self, Xs, ys=None, Xt=None, yt=None): + """Building coupling matrices from a list of source and target sets of samples + (Xs, ys) and (Xt, yt) + + Parameters + ---------- + Xs : list of K array-like objects, shape K x (nk_source_samples, n_features) + A list of the training input samples. + ys : list of K array-like objects, shape K x (nk_source_samples,) + A list of the class labels + Xt : array-like, shape (n_target_samples, n_features) + The training input samples. + yt : array-like, shape (n_target_samples,) + The class labels. If some target samples are unlabeled, fill the + yt's elements with -1. 
+ + Warning: Note that, due to this convention -1 cannot be used as a + class label + + Returns + ------- + self : object + Returns self. + """ + + # check the necessary inputs parameters are here + if check_params(Xs=Xs, Xt=Xt, ys=ys): + + self.xs_ = Xs + self.xt_ = Xt + + returned_ = jcpot_barycenter(Xs=Xs, Ys=ys, Xt=Xt, reg=self.reg_e, + metric=self.metric, distrinumItermax=self.max_iter, stopThr=self.tol, + verbose=self.verbose, log=True) + + self.coupling_ = returned_[1]['gamma'] + + # deal with the value of log + if self.log: + self.proportions_, self.log_ = returned_ + else: + self.proportions_ = returned_ + self.log_ = dict() + + return self + + def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): + """Transports source samples Xs onto target ones Xt + + Parameters + ---------- + Xs : list of K array-like objects, shape K x (nk_source_samples, n_features) + A list of the training input samples. + ys : list of K array-like objects, shape K x (nk_source_samples,) + A list of the class labels + Xt : array-like, shape (n_target_samples, n_features) + The training input samples. + yt : array-like, shape (n_target_samples,) + The class labels. If some target samples are unlabeled, fill the + yt's elements with -1. + + Warning: Note that, due to this convention -1 cannot be used as a + class label + batch_size : int, optional (default=128) + The batch size for out of sample inverse transform + """ + + transp_Xs = [] + + # check the necessary inputs parameters are here + if check_params(Xs=Xs): + + if all([np.allclose(x, y) for x, y in zip(self.xs_, Xs)]): + + # perform standard barycentric mapping for each source domain + + for coupling in self.coupling_: + transp = coupling / np.sum(coupling, 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + # compute transported samples + transp_Xs.append(np.dot(transp, self.xt_)) + else: + + # perform out of sample mapping + indices = np.arange(Xs.shape[0]) + batch_ind = [ + indices[i:i + batch_size] + for i in range(0, len(indices), batch_size)] + + transp_Xs = [] + + for bi in batch_ind: + transp_Xs_ = [] + + # get the nearest neighbor in the sources domains + xs = np.concatenate(self.xs_, axis=0) + idx = np.argmin(dist(Xs[bi], xs), axis=1) + + # transport the source samples + for coupling in self.coupling_: + transp = coupling / np.sum( + coupling, 1)[:, None] + transp[~ np.isfinite(transp)] = 0 + transp_Xs_.append(np.dot(transp, self.xt_)) + + transp_Xs_ = np.concatenate(transp_Xs_, axis=0) + + # define the transported points + transp_Xs_ = transp_Xs_[idx, :] + Xs[bi] - xs[idx, :] + transp_Xs.append(transp_Xs_) + + transp_Xs = np.concatenate(transp_Xs, axis=0) + + return transp_Xs + + def transform_labels(self, ys=None): + """Propagate source labels ys to obtain target labels as in [27] + + Parameters + ---------- + ys : list of K array-like objects, shape K x (nk_source_samples,) + A list of the class labels + + Returns + ------- + yt : array-like, shape (n_target_samples, nb_classes) + Estimated soft target labels. 
+ """ + + # check the necessary inputs parameters are here + if check_params(ys=ys): + yt = np.zeros((len(np.unique(np.concatenate(ys))), self.xt_.shape[0])) + for i in range(len(ys)): + ysTemp = label_normalization(np.copy(ys[i])) + classes = np.unique(ysTemp) + n = len(classes) + ns = len(ysTemp) + + # perform label propagation + transp = self.coupling_[i] / np.sum(self.coupling_[i], 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + if self.log: + D1 = self.log_['D1'][i] + else: + D1 = np.zeros((n, ns)) + + for c in classes: + D1[int(c), ysTemp == c] = 1 + + # compute propagated labels + yt = yt + np.dot(D1, transp) / len(ys) + + return yt.T + + def inverse_transform_labels(self, yt=None): + """Propagate source labels ys to obtain target labels + + Parameters + ---------- + yt : array-like, shape (n_source_samples,) + The target class labels + + Returns + ------- + transp_ys : list of K array-like objects, shape K x (nk_source_samples, nb_classes) + A list of estimated soft source labels + """ + + # check the necessary inputs parameters are here + if check_params(yt=yt): + transp_ys = [] + ytTemp = label_normalization(np.copy(yt)) + classes = np.unique(ytTemp) + n = len(classes) + D1 = np.zeros((n, len(ytTemp))) + + for c in classes: + D1[int(c), ytTemp == c] = 1 + + for i in range(len(self.xs_)): + + # perform label propagation + transp = self.coupling_[i] / np.sum(self.coupling_[i], 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + # compute propagated labels + transp_ys.append(np.dot(D1, transp.T).T) + + return transp_ys diff --git a/ot/datasets.py b/ot/datasets.py index ba0cfd9..b86ef3b 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -1,5 +1,5 @@ """ -Simple example datasets for OT +Simple example datasets """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -30,7 +30,7 @@ def make_1D_gauss(n, m, s): 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) - h = np.exp(-(x - m)**2 / (2 * s**2)) + h = np.exp(-(x - m) ** 2 / (2 * s ** 2)) return h / h.sum() @@ -80,7 +80,7 @@ def get_2D_samples_gauss(n, m, sigma, random_state=None): return make_2D_samples_gauss(n, m, sigma, random_state=None) -def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): +def make_data_classif(dataset, n, nz=.5, theta=0, p=.5, random_state=None, **kwargs): """Dataset generation for classification problems Parameters @@ -91,6 +91,8 @@ def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): number of training samples nz : float noise level (>0) + p : float + proportion of one class in the binary setting random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -145,11 +147,22 @@ def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): n2 = np.sum(y == 2) x = np.zeros((n, 2)) - x[y == 1, :] = get_2D_samples_gauss(n1, m1, nz, random_state=generator) - x[y == 2, :] = get_2D_samples_gauss(n2, m2, nz, random_state=generator) + x[y == 1, :] = make_2D_samples_gauss(n1, m1, nz, random_state=generator) + x[y == 2, :] = make_2D_samples_gauss(n2, m2, nz, random_state=generator) x = x.dot(rot) + elif dataset.lower() == '2gauss_prop': + + y = np.concatenate((np.ones(int(p * n)), np.zeros(int((1 - p) * n)))) + x = np.hstack((0 * y[:, None] - 0, 1 - 2 * y[:, None])) + nz * np.random.randn(len(y), 2) + + if ('bias' not in kwargs) 
and ('b' not in kwargs): + kwargs['bias'] = np.array([0, 2]) + + x[:, 0] += kwargs['bias'][0] + x[:, 1] += kwargs['bias'][1] + else: x = np.array(0) y = np.array(0) @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- """ -Dimension reduction with optimal transport +Dimension reduction with OT .. warning:: - Note that by default the module is not import in :mod:`ot`. In order to + Note that by default the module is not imported in :mod:`ot`. In order to use it you need to explicitely import :mod:`ot.dr` """ diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py index 1ab95bb..7478fb9 100644 --- a/ot/gpu/__init__.py +++ b/ot/gpu/__init__.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- """ +GPU implementation for several OT solvers and utility +functions. -This module provides GPU implementation for several OT solvers and utility -functions. The GPU backend in handled by `cupy +The GPU backend in handled by `cupy <https://cupy.chainer.org/>`_. .. warning:: diff --git a/ot/gromov.py b/ot/gromov.py index 699ae4c..4427a96 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*-
"""
-Gromov-Wasserstein transport method
+Gromov-Wasserstein and Fused-Gromov-Wasserstein solvers
"""
# Author: Erwan Vautier <erwan.vautier@gmail.com>
@@ -276,7 +276,6 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs - p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- - H : entropy
Parameters
----------
@@ -343,6 +342,83 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs return cg(p, q, 0, 1, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
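For readers of this diff, a minimal usage sketch of the coupling-returning solver touched in the hunk above (not part of the commit; toy data and uniform weights, all names taken from this module):

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    xs = rng.randn(10, 2)          # source samples
    xt = rng.randn(10, 3)          # target samples, possibly in another space

    C1 = ot.dist(xs)               # intra-source distance matrix
    C2 = ot.dist(xt)               # intra-target distance matrix
    C1 /= C1.max()
    C2 /= C2.max()
    p = ot.unif(10)
    q = ot.unif(10)

    # coupling between the two metric-measure spaces
    T = ot.gromov.gromov_wasserstein(C1, C2, p, q, 'square_loss')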
+def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
+ """
+    Returns the Gromov-Wasserstein discrepancy between (C1,p) and (C2,q)
+
+ The function solves the following optimization problem:
+
+ .. math::
+        GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot T_{i,j}\cdot T_{k,l}
+
+ Where :
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+
+ Parameters
+ ----------
+ C1 : ndarray, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : ndarray, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space.
+ q : ndarray, shape (nt,)
+ Distribution in the target space.
+ loss_fun : str
+        loss function used for the solver, either 'square_loss' or 'kl_loss'
+ max_iter : int, optional
+ Max number of iterations
+ tol : float, optional
+ Stop threshold on error (>0)
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ record log if True
+ armijo : bool, optional
+        If True, the step of the line-search is found via an Armijo line search. Otherwise a closed form is used.
+        If there are convergence issues, use False.
+
+ Returns
+ -------
+ gw_dist : float
+ Gromov-Wasserstein distance
+ log : dict
+        convergence information and coupling matrix
+
+ References
+ ----------
+ .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
+
+ .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the
+ metric approach to object matching. Foundations of computational
+ mathematics 11.4 (2011): 417-487.
+
+ """
+
+ constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+
+ G0 = p[:, None] * q[None, :]
+
+ def f(G):
+ return gwloss(constC, hC1, hC2, G)
+
+ def df(G):
+ return gwggrad(constC, hC1, hC2, G)
+ res, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+ log_gw['gw_dist'] = gwloss(constC, hC1, hC2, res)
+ log_gw['T'] = res
+ if log:
+ return log_gw['gw_dist'], log_gw
+ else:
+ return log_gw['gw_dist']
+
+
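The value-returning variant added above can be exercised as follows (illustrative sketch only, not part of the commit; toy matrices):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    C1 = ot.dist(rng.randn(8, 2))
    C2 = ot.dist(rng.randn(8, 2))
    p, q = ot.unif(8), ot.unif(8)

    # scalar GW discrepancy; with log=True the coupling is available as log['T']
    gw_dist, log = ot.gromov.gromov_wasserstein2(C1, C2, p, q, 'square_loss', log=True)
    print(gw_dist, log['T'].shape)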
def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
"""
Computes the FGW transport between two graphs see [24]
@@ -357,8 +433,7 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, where :
- M is the (ns,nt) metric cost matrix
- - :math:`f` is the regularization term ( and df is its gradient)
- - a and b are source and target weights (sum to 1)
+ - p and q are source and target weights (sum to 1)
- L is a loss function to account for the misfit between the similarity matrices
The algorithm used for solving the problem is conditional gradient as discussed in [24]_
@@ -377,17 +452,13 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, Distribution in the target space
loss_fun : str, optional
Loss function used for the solver
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- record log if True
+ alpha : float, optional
+ Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
        If True, the step of the line-search is found via an Armijo line search. Otherwise a closed form is used.
        If there are convergence issues, use False.
+ log : bool, optional
+ record log if True
**kwargs : dict
parameters can be directly passed to the ot.optim.cg solver
@@ -417,11 +488,11 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, return gwggrad(constC, hC1, hC2, G)
if log:
- res, log = cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+ res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
log['fgw_dist'] = log['loss'][::-1][0]
return res, log
else:
- return cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+ return cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
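The two hunks above change the linear term passed to the conditional-gradient solver from M to (1 - alpha) * M, so the feature/structure trade-off is applied exactly once, inside the solver. A hedged usage sketch (toy data; names are illustrative):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    feat_s, feat_t = rng.randn(10, 5), rng.randn(10, 5)   # node features
    xs, xt = rng.randn(10, 2), rng.randn(10, 2)           # node positions

    M = ot.dist(feat_s, feat_t)        # feature (linear) cost, weighted by (1 - alpha)
    C1, C2 = ot.dist(xs), ot.dist(xt)  # structure costs, weighted by alpha
    p, q = ot.unif(10), ot.unif(10)

    # alpha -> 0: mostly Wasserstein on features; alpha -> 1: mostly GW on structures
    T = ot.gromov.fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.3)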
def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
@@ -439,8 +510,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 where :
- M is the (ns,nt) metric cost matrix
- - :math:`f` is the regularization term ( and df is its gradient)
- - a and b are source and target weights (sum to 1)
+ - p and q are source and target weights (sum to 1)
- L is a loss function to account for the misfit between the similarity matrices
The algorithm used for solving the problem is conditional gradient as discussed in [1]_
@@ -458,17 +528,13 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 Distribution in the target space.
loss_fun : str, optional
Loss function used for the solver.
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- Record log if True.
+ alpha : float, optional
+ Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
        If True, the step of the line-search is found via an Armijo line search.
        Otherwise a closed form is used. If there are convergence issues, use False.
+ log : bool, optional
+ Record log if True.
**kwargs : dict
        Parameters can be directly passed to the ot.optim.cg solver.
@@ -497,7 +563,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 def df(G):
return gwggrad(constC, hC1, hC2, G)
- res, log = cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+ res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
if log:
log['fgw_dist'] = log['loss'][::-1][0]
log['T'] = res
@@ -506,84 +572,6 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 return log['fgw_dist']
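As with the GW pair of functions, the *2 variant returns the scalar FGW value, and the hunk above also stores the coupling in the log under 'T'. A minimal sketch (illustrative only):

    import numpy as np
    import ot

    rng = np.random.RandomState(1)
    M = ot.dist(rng.randn(8, 4), rng.randn(8, 4))
    C1, C2 = ot.dist(rng.randn(8, 2)), ot.dist(rng.randn(8, 2))
    p, q = ot.unif(8), ot.unif(8)

    fgw_dist, log = ot.gromov.fused_gromov_wasserstein2(M, C1, C2, p, q, alpha=0.5, log=True)
    print(fgw_dist, log['T'].sum())   # the coupling mass sums to 1 here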
-def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
- """
- Returns the gromov-wasserstein discrepancy between (C1,p) and (C2,q)
-
- The function solves the following optimization problem:
-
- .. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
-
- Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
- - H : entropy
-
- Parameters
- ----------
- C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric cost matrix in the target space
- p : ndarray, shape (ns,)
- Distribution in the source space.
- q : ndarray, shape (nt,)
- Distribution in the target space.
- loss_fun : str
- loss function used for the solver either 'square_loss' or 'kl_loss'
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- record log if True
- armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
-
- Returns
- -------
- gw_dist : float
- Gromov-Wasserstein distance
- log : dict
- convergence information and Coupling marix
-
- References
- ----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
- .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the
- metric approach to object matching. Foundations of computational
- mathematics 11.4 (2011): 417-487.
-
- """
-
- constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
-
- G0 = p[:, None] * q[None, :]
-
- def f(G):
- return gwloss(constC, hC1, hC2, G)
-
- def df(G):
- return gwggrad(constC, hC1, hC2, G)
- res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
- log['gw_dist'] = gwloss(constC, hC1, hC2, res)
- log['T'] = res
- if log:
- return log['gw_dist'], log
- else:
- return log['gw_dist']
-
-
def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
max_iter=1000, tol=1e-9, verbose=False, log=False):
"""
@@ -996,6 +984,16 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ Whether to fix the structure of the barycenter during the updates
fixed_features : bool
Whether to fix the feature of the barycenter during the updates
+ loss_fun : str
+        Loss function used for the solver, either 'square_loss' or 'kl_loss'
+ max_iter : int, optional
+ Max number of iterations
+ tol : float, optional
+        Stop threshold on error (>0).
+ verbose : bool, optional
+ Print information along iterations.
+ log : bool, optional
+ Record log if True.
init_C : ndarray, shape (N,N), optional
Initialization for the barycenters' structure matrix. If not set
a random init is used.
@@ -1084,7 +1082,7 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ T_temp = [t.T for t in T]
C = update_sructure_matrix(p, lambdas, T_temp, Cs)
- T = [fused_gromov_wasserstein((1 - alpha) * Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
+ T = [fused_gromov_wasserstein(Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
# T is N,ns
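The hunk above removes a double scaling: Ms[s] was previously multiplied by (1 - alpha) before the call, and fused_gromov_wasserstein now applies that factor itself. A hedged sketch of the barycenter call, assuming the signature documented earlier in this diff (toy graphs, illustrative sizes):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    n = 6
    Ys = [rng.randn(n, 3) for _ in range(2)]            # node features of the two inputs
    Cs = [ot.dist(rng.randn(n, 2)) for _ in range(2)]   # structure (distance) matrices
    ps = [ot.unif(n), ot.unif(n)]

    # barycenter supported on 5 points, equal weights on both inputs
    X, C = ot.gromov.fgw_barycenters(5, Ys, Cs, ps, lambdas=[0.5, 0.5], alpha=0.5)
    print(X.shape, C.shape)   # (5, 3) and (5, 5)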
diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h
index f42e222..c0fe7a3 100644
--- a/ot/lp/EMD.h
+++ b/ot/lp/EMD.h
@@ -32,4 +32,6 @@ enum ProblemType {
 int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G,
     double* alpha, double* beta, double *cost, int maxIter);
+
+
 #endif
diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp
index fc7ca63..bc873ed 100644
--- a/ot/lp/EMD_wrapper.cpp
+++ b/ot/lp/EMD_wrapper.cpp
@@ -17,13 +17,13 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
                 double* alpha, double* beta, double *cost, int maxIter) {
-// beware M and C anre strored in row major C style!!!
-    int n, m, i, cur;
+    // beware M and C are stored in row major C style!!!
+    int n, m, i, cur;
     typedef FullBipartiteDigraph Digraph;
-  DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+    DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
 
-    // Get the number of non zero coordinates for r and c
+    // Get the number of non zero coordinates for r and c
     n=0;
     for (int i=0; i<n1; i++) {
         double val=*(X+i);
@@ -105,3 +105,4 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
     return ret;
 }
+
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 0c92810..514a607 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -2,8 +2,6 @@
 """
 Solvers for the original linear program OT problem
-
-
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
@@ -12,22 +10,169 @@ Solvers for the original linear program OT problem
 
 import multiprocessing
 import sys
+
 import numpy as np
 from scipy.sparse import coo_matrix
 
-from .import cvx
-
+from . import cvx
+from .cvx import barycenter
 # import compiled emd
 from .emd_wrap import emd_c, check_result, emd_1d_sorted
-from ..utils import parmap
-from .cvx import barycenter
 from ..utils import dist
+from ..utils import parmap
+
+__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
+           'emd_1d', 'emd2_1d', 'wasserstein_1d']
+
+
+def center_ot_dual(alpha0, beta0, a=None, b=None):
+    r"""Center dual OT potentials w.r.t. their weights
+
+    The goal of this function is to find unique dual potentials
+    that ensure some kind of centering/fairness. The main idea is to find
+    dual potentials that lead to the same final objective value for both
+    source and targets (see below for more details). This helps stability
+    when the OT solver is called several times with small changes in the
+    inputs.
+
+    Basically we add another constraint to the potentials that will not
+    change the objective value but will ensure uniqueness. The constraint
+    is the following:
+
+    .. math::
+        \alpha^T a= \beta^T b
+
+    in addition to the OT problem constraints.
+
+    since :math:`\sum_i a_i=\sum_j b_j` this can be solved by adding/removing
+    a constant from both :math:`\alpha_0` and :math:`\beta_0`.
+
+    .. math::
+        c=\frac{\beta_0^T b-\alpha_0^T a}{1^Tb+1^Ta}
+
+        \alpha=\alpha_0+c
+
+        \beta=\beta_0-c
+
+    Parameters
+    ----------
+    alpha0 : (ns,) numpy.ndarray, float64
+        Source dual potential
+    beta0 : (nt,) numpy.ndarray, float64
+        Target dual potential
+    a : (ns,) numpy.ndarray, float64
+        Source histogram (uniform weight if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target histogram (uniform weight if empty list)
+
+    Returns
+    -------
+    alpha : (ns,) numpy.ndarray, float64
+        Source centered dual potential
+    beta : (nt,) numpy.ndarray, float64
+        Target centered dual potential
+
+    """
+    # if no weights are provided, use uniform
+    if a is None:
+        a = np.ones(alpha0.shape[0]) / alpha0.shape[0]
+    if b is None:
+        b = np.ones(beta0.shape[0]) / beta0.shape[0]
+
+    # compute constant that balances the weighted sums of the duals
+    c = (b.dot(beta0) - a.dot(alpha0)) / (a.sum() + b.sum())
+
+    # update duals
+    alpha = alpha0 + c
+    beta = beta0 - c
+
+    return alpha, beta
+
+
+def estimate_dual_null_weights(alpha0, beta0, a, b, M):
+    r"""Estimate feasible values for 0-weighted dual potentials
+
+    The feasible values are computed efficiently but rather coarsely.
+
+    .. warning::
+        This function is necessary because the C++ solver in emd_c
+        discards all samples in the distributions with
+        zero weights. This means that while the primal variable (transport
+        matrix) is exact, the solver only returns feasible dual potentials
+        on the samples with weights different from zero.
+
+    First we compute the constraints violations:
+
+    .. math::
+        V=\alpha+\beta^T-M
+
+    Next we compute the max amount of violation per row (alpha) and
+    columns (beta)
-__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
-         'emd_1d', 'emd2_1d', 'wasserstein_1d']
+
+    .. math::
+        v^a_i=\max_j V_{i,j}
+
+        v^b_j=\max_i V_{i,j}
+
+    Finally we update the dual potential with 0 weights if a
+    constraint is violated
+
+    .. math::
+        \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0
+
+        \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0
+
+    In the end the dual potentials are centered using function
+    :ref:`center_ot_dual`.
+
+    Note that all those updates do not change the objective value of the
+    solution but provide dual potentials that do not violate the constraints.
+
+    Parameters
+    ----------
+    alpha0 : (ns,) numpy.ndarray, float64
+        Source dual potential
+    beta0 : (nt,) numpy.ndarray, float64
+        Target dual potential
+    a : (ns,) numpy.ndarray, float64
+        Source distribution (uniform weights if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target distribution (uniform weights if empty list)
+    M : (ns,nt) numpy.ndarray, float64
+        Loss matrix (c-order array with type float64)
+
+    Returns
+    -------
+    alpha : (ns,) numpy.ndarray, float64
+        Source corrected dual potential
+    beta : (nt,) numpy.ndarray, float64
+        Target corrected dual potential
+
+    """
+
+    # binary indexing of non-zero weights
+    asel = a != 0
+    bsel = b != 0
+
+    # compute dual constraints violation
+    constraint_violation = alpha0[:, None] + beta0[None, :] - M
+
+    # compute the largest violation per line and per column
+    aviol = np.max(constraint_violation, 1)
+    bviol = np.max(constraint_violation, 0)
+
+    # updates that correct the violations (zero on samples with nonzero weight)
+    alpha_up = -1 * ~asel * np.maximum(aviol, 0)
+    beta_up = -1 * ~bsel * np.maximum(bviol, 0)
 
-def emd(a, b, M, numItermax=100000, log=False):
+    alpha = alpha0 + alpha_up
+    beta = beta0 + beta_up
+
+    return center_ot_dual(alpha, beta, a, b)
+
+
+def emd(a, b, M, numItermax=100000, log=False, center_dual=True):
    r"""Solves the Earth Movers distance problem and returns the OT matrix
@@ -35,7 +180,9 @@ def emd(a, b, M, numItermax=100000, log=False):
        \gamma = arg\min_\gamma <\gamma,M>_F

        s.t. \gamma 1 = a
+
             \gamma^T 1= b
+
             \gamma\geq 0
    where :
@@ -43,7 +190,7 @@ def emd(a, b, M, numItermax=100000, log=False):
    - a and b are the sample weights

    .. warning::
-        Note that the M matrix needs to be a C-order numpy.array in float64
+        Note that the M matrix needs to be a C-order numpy.array in float64
        format.

    Uses the algorithm proposed in [1]_
@@ -62,6 +209,9 @@ def emd(a, b, M, numItermax=100000, log=False):
    log: bool, optional (default=False)
        If True, returns a dictionary containing the cost and dual
        variables. Otherwise returns only the optimal transportation matrix.
+    center_dual: boolean, optional (default=True)
+        If True, centers the dual potential using function
+        :ref:`center_ot_dual`.

    Returns
    -------
@@ -109,7 +259,20 @@ def emd(a, b, M, numItermax=100000, log=False):
    if len(b) == 0:
        b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1]

+    assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \
+        "Dimension mismatch, check dimensions of M with a and b"
+
+    asel = a != 0
+    bsel = b != 0
+
    G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+
+    if center_dual:
+        u, v = center_ot_dual(u, v, a, b)
+
+    if np.any(~asel) or np.any(~bsel):
+        u, v = estimate_dual_null_weights(u, v, a, b, M)
+
    result_code_string = check_result(result_code)
    if log:
        log = {}
@@ -123,14 +286,17 @@ def emd(a, b, M, numItermax=100000, log=False):

 def emd2(a, b, M, processes=multiprocessing.cpu_count(),
-         numItermax=100000, log=False, return_matrix=False):
+         numItermax=100000, log=False, return_matrix=False,
+         center_dual=True):
    r"""Solves the Earth Movers distance problem and returns the loss

    .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F
+        \min_\gamma <\gamma,M>_F

        s.t. \gamma 1 = a
+
             \gamma^T 1= b
+
             \gamma\geq 0
    where :
@@ -138,7 +304,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
    - a and b are the sample weights

    ..
warning:: - Note that the M matrix needs to be a C-order numpy.array in float64 + Note that the M matrix needs to be a C-order numpy.array in float64 format. Uses the algorithm proposed in [1]_ @@ -161,6 +327,9 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), variables. Otherwise returns only the optimal transportation cost. return_matrix: boolean, optional (default=False) If True, returns the optimal transportation matrix in the log. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. Returns ------- @@ -204,7 +373,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), # problem with pikling Forks if sys.platform.endswith('win32'): - processes=1 + processes = 1 # if empty array given then use uniform distributions if len(a) == 0: @@ -212,21 +381,43 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + + asel = a != 0 + if log or return_matrix: def f(b): - G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) - result_code_string = check_result(resultCode) + bsel = b != 0 + + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + + result_code_string = check_result(result_code) log = {} if return_matrix: log['G'] = G log['u'] = u log['v'] = v log['warning'] = result_code_string - log['result_code'] = resultCode + log['result_code'] = result_code return [cost, log] else: def f(b): + bsel = b != 0 G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + check_result(result_code) return cost @@ -234,7 +425,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return f(b) nb = b.shape[1] - if processes>1: + if processes > 1: res = parmap(f, [b[:, i] for i in range(nb)], processes) else: res = list(map(f, [b[:, i].copy() for i in range(nb)])) @@ -242,8 +433,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return res - -def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None): +def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, + stopThr=1e-7, verbose=False, log=None): """ Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance) @@ -295,7 +486,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None k = X_init.shape[0] d = X_init.shape[1] if b is None: - b = np.ones((k,))/k + b = np.ones((k,)) / k if weights is None: weights = np.ones((N,)) / N @@ -306,17 +497,17 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None displacement_square_norm = stopThr + 1. 
-    while ( displacement_square_norm > stopThr and iter_count < numItermax ):
+    while (displacement_square_norm > stopThr and iter_count < numItermax):

        T_sum = np.zeros((k, d))

-        for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights, weights.tolist()):
-
+        for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights,
+                                                                      weights.tolist()):
            M_i = dist(X, measure_locations_i)
            T_i = emd(b, measure_weights_i, M_i)
            T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i)

-        displacement_square_norm = np.sum(np.square(T_sum-X))
+        displacement_square_norm = np.sum(np.square(T_sum - X))
        if log:
            displacement_square_norms.append(displacement_square_norm)
@@ -436,12 +627,12 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
    if b.ndim == 0 or len(b) == 0:
        b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]

-    x_a_1d = x_a.reshape((-1, ))
-    x_b_1d = x_b.reshape((-1, ))
+    x_a_1d = x_a.reshape((-1,))
+    x_b_1d = x_b.reshape((-1,))
    perm_a = np.argsort(x_a_1d)
    perm_b = np.argsort(x_b_1d)

-    G_sorted, indices, cost = emd_1d_sorted(a, b,
+    G_sorted, indices, cost = emd_1d_sorted(a[perm_a], b[perm_b],
                                            x_a_1d[perm_a], x_b_1d[perm_b],
                                            metric=metric, p=p)
    G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 2b6c495..c167964 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -19,7 +19,7 @@ import warnings

 cdef extern from "EMD.h":
-    int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter)
+    int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) nogil
    cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED
@@ -35,8 +35,7 @@ def check_result(result_code):
        message = "numItermax reached before optimality. Try to increase numItermax."
        warnings.warn(message)
    return message
-
-
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter):
@@ -61,6 +60,12 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
    .. warning::
        Note that the M matrix needs to be a C-order :py.cls:`numpy.array`

+    .. warning::
+        The C++ solver discards all samples in the distributions with
+        zero weights. This means that while the primal variable (transport
+        matrix) is exact, the solver only returns feasible dual potentials
+        on the samples with weights different from zero.
+
    Parameters
    ----------
    a : (ns,) numpy.ndarray, float64
@@ -73,7 +78,6 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
        The maximum number of iterations before stopping the optimization
        algorithm if it has not converged.
- Returns ------- gamma: (ns x nt) numpy.ndarray @@ -82,12 +86,19 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod """ cdef int n1= M.shape[0] cdef int n2= M.shape[1] + cdef int nmax=n1+n2-1 + cdef int result_code = 0 + cdef int nG=0 cdef double cost=0 - cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([n1, n2]) cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1) cdef np.ndarray[double, ndim=1, mode="c"] beta=np.zeros(n2) + cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([0, 0]) + + cdef np.ndarray[double, ndim=1, mode="c"] Gv=np.zeros(0) + cdef np.ndarray[long, ndim=1, mode="c"] iG=np.zeros(0,dtype=np.int) + cdef np.ndarray[long, ndim=1, mode="c"] jG=np.zeros(0,dtype=np.int) if not len(a): a=np.ones((n1,))/n1 @@ -95,8 +106,12 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 + # init OT matrix + G=np.zeros([n1, n2]) + # calling the function - cdef int result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter) + with nogil: + result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter) return G, cost, alpha, beta, result_code diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h index 7c6a4ce..5d93040 100644 --- a/ot/lp/network_simplex_simple.h +++ b/ot/lp/network_simplex_simple.h @@ -686,7 +686,7 @@ namespace lemon { /// \see resetParams(), reset() ProblemType run() { #if DEBUG_LVL>0 - std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED\n"; + std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED << "\n" ; #endif if (!init()) return INFEASIBLE; @@ -875,7 +875,7 @@ namespace lemon { c += Number(it->second) * Number(_cost[it->first]); return c;*/ - for (int i=0; i<_flow.size(); i++) + for (unsigned long i=0; i<_flow.size(); i++) c += _flow[i] * Number(_cost[i]); return c; @@ -1257,7 +1257,7 @@ namespace lemon { u = w; } _pred[u_in] = in_arc; - _forward[u_in] = (u_in == _source[in_arc]); + _forward[u_in] = ((unsigned int)u_in == _source[in_arc]); _succ_num[u_in] = old_succ_num; // Set limits for updating _last_succ form v_in and v_out @@ -1418,7 +1418,6 @@ namespace lemon { template <typename PivotRuleImpl> ProblemType start() { PivotRuleImpl pivot(*this); - double prevCost=-1; ProblemType retVal = OPTIMAL; // Perform heuristic initial pivots diff --git a/ot/optim.py b/ot/optim.py index 0abd9e9..b9ca891 100644 --- a/ot/optim.py +++ b/ot/optim.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Optimization algorithms for OT +Generic solvers for regularized OT """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -134,7 +134,7 @@ def solve_linesearch(cost, G, deltaG, Mi, f_val, return alpha, fc, f_val -def cg(a, b, M, reg, f, df, G0=None, numItermax=200, +def cg(a, b, M, reg, f, df, G0=None, numItermax=200, numItermaxEmd=100000, stopThr=1e-9, stopThr2=1e-9, verbose=False, log=False, **kwargs): """ Solve the general regularized OT problem with conditional gradient @@ -172,6 +172,8 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, initial guess (default is indep joint density) numItermax : int, optional Max number of iterations + 
numItermaxEmd : int, optional + Max number of iterations for emd stopThr : float, optional Stop threshol on the relative variation (>0) stopThr2 : float, optional @@ -238,7 +240,7 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, Mi += Mi.min() # solve linear program - Gc = emd(a, b, Mi) + Gc = emd(a, b, Mi, numItermax=numItermaxEmd) deltaG = Gc - G diff --git a/ot/partial.py b/ot/partial.py new file mode 100755 index 0000000..eb707d8 --- /dev/null +++ b/ot/partial.py @@ -0,0 +1,1062 @@ +# -*- coding: utf-8 -*- +""" +Partial OT solvers +""" + +# Author: Laetitia Chapel <laetitia.chapel@irisa.fr> +# License: MIT License + +import numpy as np + +from .lp import emd + + +def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, + **kwargs): + r""" + Solves the partial optimal transport problem for the quadratic cost + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = \arg\min_\gamma <\gamma,(M-\lambda)>_F + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + + or equivalently (see Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. + (2018). An interpolating distance between optimal transport and Fisher–Rao + metrics. Foundations of Computational Mathematics, 18(1), 1-44.) + + .. math:: + \gamma = \arg\min_\gamma <\gamma,M>_F + \sqrt(\lambda/2) + (\|\gamma 1 - a\|_1 + \|\gamma^T 1 - b\|_1) + + s.t. + \gamma\geq 0 \\ + + + where : + + - M is the metric cost matrix + - a and b are source and target unbalanced distributions + - :math:`\lambda` is the lagragian cost. Tuning its value allows attaining + a given mass to be transported m + + The formulation of the problem has been proposed in [28]_ + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix for the quadratic cost + reg_m : float, optional + Lagragian cost + nb_dummies : int, optional, default:1 + number of reservoir points to be added (to avoid numerical + instabilities, increase its value if an error is raised) + log : bool, optional + record log if True + **kwargs : dict + parameters can be directly passed to the emd solver + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). + + Returns + ------- + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + + >>> import ot + >>> a = [.1, .2] + >>> b = [.1, .1] + >>> M = [[0., 1.], [2., 3.]] + >>> np.round(partial_wasserstein_lagrange(a,b,M), 2) + array([[0.1, 0. ], + [0. , 0.1]]) + >>> np.round(partial_wasserstein_lagrange(a,b,M,reg_m=2), 2) + array([[0.1, 0. ], + [0. , 0. ]]) + + References + ---------- + + .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in + optimal transport and Monge-Ampere obstacle problems. Annals of + mathematics, 673-730. + + See Also + -------- + ot.partial.partial_wasserstein : Partial Wasserstein with fixed mass + """ + + if np.sum(a) > 1 or np.sum(b) > 1: + raise ValueError("Problem infeasible. 
Check that a and b are in the " + "simplex") + + if reg_m is None: + reg_m = np.max(M) + 1 + if reg_m < -np.max(M): + return np.zeros((len(a), len(b))) + + eps = 1e-20 + M = np.asarray(M, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + a = np.asarray(a, dtype=np.float64) + + M_star = M - reg_m # modified cost matrix + + # trick to fasten the computation: select only the subset of columns/lines + # that can have marginals greater than 0 (that is to say M < 0) + idx_x = np.where(np.min(M_star, axis=1) < eps)[0] + idx_y = np.where(np.min(M_star, axis=0) < eps)[0] + + # extend a, b, M with "reservoir" or "dummy" points + M_extended = np.zeros((len(idx_x) + nb_dummies, len(idx_y) + nb_dummies)) + M_extended[:len(idx_x), :len(idx_y)] = M_star[np.ix_(idx_x, idx_y)] + + a_extended = np.append(a[idx_x], [(np.sum(a) - np.sum(a[idx_x]) + + np.sum(b)) / nb_dummies] * nb_dummies) + b_extended = np.append(b[idx_y], [(np.sum(b) - np.sum(b[idx_y]) + + np.sum(a)) / nb_dummies] * nb_dummies) + + gamma_extended, log_emd = emd(a_extended, b_extended, M_extended, log=True, + **kwargs) + gamma = np.zeros((len(a), len(b))) + gamma[np.ix_(idx_x, idx_y)] = gamma_extended[:-nb_dummies, :-nb_dummies] + + if log_emd['warning'] is not None: + raise ValueError("Error in the EMD resolution: try to increase the" + " number of dummy points") + log_emd['cost'] = np.sum(gamma * M) + if log: + return gamma, log_emd + else: + return gamma + + +def partial_wasserstein(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): + r""" + Solves the partial optimal transport problem for the quadratic cost + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = \arg\min_\gamma <\gamma,M>_F + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + + where : + + - M is the metric cost matrix + - a and b are source and target unbalanced distributions + - m is the amount of mass to be transported + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix for the quadratic cost + m : float, optional + amount of mass to be transported + nb_dummies : int, optional, default:1 + number of reservoir points to be added (to avoid numerical + instabilities, increase its value if an error is raised) + log : bool, optional + record log if True + **kwargs : dict + parameters can be directly passed to the emd solver + + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). + + + Returns + ------- + :math:`gamma` : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + + >>> import ot + >>> a = [.1, .2] + >>> b = [.1, .1] + >>> M = [[0., 1.], [2., 3.]] + >>> np.round(partial_wasserstein(a,b,M), 2) + array([[0.1, 0. ], + [0. , 0.1]]) + >>> np.round(partial_wasserstein(a,b,M,m=0.1), 2) + array([[0.1, 0. ], + [0. , 0. ]]) + + References + ---------- + .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in + optimal transport and Monge-Ampere obstacle problems. Annals of + mathematics, 673-730. + .. 
[29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + See Also + -------- + ot.partial.partial_wasserstein_lagrange: Partial Wasserstein with + regularization on the marginals + ot.partial.entropic_partial_wasserstein: Partial Wasserstein with a + entropic regularization parameter + """ + + if m is None: + return partial_wasserstein_lagrange(a, b, M, log=log, **kwargs) + elif m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + elif m > np.min((np.sum(a), np.sum(b))): + raise ValueError("Problem infeasible. Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + b_extended = np.append(b, [(np.sum(a) - m) / nb_dummies] * nb_dummies) + a_extended = np.append(a, [(np.sum(b) - m) / nb_dummies] * nb_dummies) + M_extended = np.zeros((len(a_extended), len(b_extended))) + M_extended[-1, -1] = np.max(M) * 1e5 + M_extended[:len(a), :len(b)] = M + + gamma, log_emd = emd(a_extended, b_extended, M_extended, log=True, + **kwargs) + if log_emd['warning'] is not None: + raise ValueError("Error in the EMD resolution: try to increase the" + " number of dummy points") + log_emd['partial_w_dist'] = np.sum(M * gamma[:len(a), :len(b)]) + + if log: + return gamma[:len(a), :len(b)], log_emd + else: + return gamma[:len(a), :len(b)] + + +def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): + r""" + Solves the partial optimal transport problem for the quadratic cost + and returns the partial GW discrepancy + + The function considers the following problem: + + .. math:: + \gamma = \arg\min_\gamma <\gamma,M>_F + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + + where : + + - M is the metric cost matrix + - a and b are source and target unbalanced distributions + - m is the amount of mass to be transported + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix for the quadratic cost + m : float, optional + amount of mass to be transported + nb_dummies : int, optional, default:1 + number of reservoir points to be added (to avoid numerical + instabilities, increase its value if an error is raised) + log : bool, optional + record log if True + **kwargs : dict + parameters can be directly passed to the emd solver + + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). + + + Returns + ------- + :math:`gamma` : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + + >>> import ot + >>> a=[.1, .2] + >>> b=[.1, .1] + >>> M=[[0., 1.], [2., 3.]] + >>> np.round(partial_wasserstein2(a, b, M), 1) + 0.3 + >>> np.round(partial_wasserstein2(a,b,M,m=0.1), 1) + 0.0 + + References + ---------- + .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in + optimal transport and Monge-Ampere obstacle problems. Annals of + mathematics, 673-730. + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). 
"Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + """ + + partial_gw, log_w = partial_wasserstein(a, b, M, m, nb_dummies, log=True, + **kwargs) + + log_w['T'] = partial_gw + + if log: + return np.sum(partial_gw * M), log_w + else: + return np.sum(partial_gw * M) + + +def gwgrad_partial(C1, C2, T): + """Compute the GW gradient. Note: we can not use the trick in [12]_ as + the marginals may not sum to 1. + + Parameters + ---------- + C1: array of shape (n_p,n_p) + intra-source (P) cost matrix + + C2: array of shape (n_u,n_u) + intra-target (U) cost matrix + + T : array of shape(n_p+nb_dummies, n_u) (default: None) + Transport matrix + + Returns + ------- + numpy.array of shape (n_p+nb_dummies, n_u) + gradient + + References + ---------- + .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, + "Gromov-Wasserstein averaging of kernel and distance matrices." + International Conference on Machine Learning (ICML). 2016. + """ + cC1 = np.dot(C1 ** 2 / 2, np.dot(T, np.ones(C2.shape[0]).reshape(-1, 1))) + cC2 = np.dot(np.dot(np.ones(C1.shape[0]).reshape(1, -1), T), C2 ** 2 / 2) + constC = cC1 + cC2 + A = -np.dot(C1, T).dot(C2.T) + tens = constC + A + return tens * 2 + + +def gwloss_partial(C1, C2, T): + """Compute the GW loss. + + Parameters + ---------- + C1: array of shape (n_p,n_p) + intra-source (P) cost matrix + + C2: array of shape (n_u,n_u) + intra-target (U) cost matrix + + T : array of shape(n_p+nb_dummies, n_u) (default: None) + Transport matrix + + Returns + ------- + GW loss + """ + g = gwgrad_partial(C1, C2, T) * 0.5 + return np.sum(g * T) + + +def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, + thres=1, numItermax=1000, tol=1e-7, + log=False, verbose=False, **kwargs): + r""" + Solves the partial optimal transport problem + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + + s.t. 
\gamma 1 \leq a \\ + \gamma^T 1 \leq b \\ + \gamma\geq 0 \\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + + where : + + - M is the metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma) + =\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are the sample weights + - m is the amount of mass to be transported + + The formulation of the problem has been proposed in [29]_ + + + Parameters + ---------- + C1 : ndarray, shape (ns, ns) + Metric cost matrix in the source space + C2 : ndarray, shape (nt, nt) + Metric costfr matrix in the target space + p : ndarray, shape (ns,) + Distribution in the source space + q : ndarray, shape (nt,) + Distribution in the target space + m : float, optional + Amount of mass to be transported (default: min (|p|_1, |q|_1)) + nb_dummies : int, optional + Number of dummy points to add (avoid instabilities in the EMD solver) + G0 : ndarray, shape (ns, nt), optional + Initialisation of the transportation matrix + thres : float, optional + quantile of the gradient matrix to populate the cost matrix when 0 + (default: 1) + numItermax : int, optional + Max number of iterations + tol : float, optional + tolerance for stopping iterations + log : bool, optional + return log if True + verbose : bool, optional + Print information along iterations + **kwargs : dict + parameters can be directly passed to the emd solver + + + Returns + ------- + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + >>> import ot + >>> import scipy as sp + >>> a = np.array([0.25] * 4) + >>> b = np.array([0.25] * 4) + >>> x = np.array([1,2,100,200]).reshape((-1,1)) + >>> y = np.array([3,2,98,199]).reshape((-1,1)) + >>> C1 = sp.spatial.distance.cdist(x, x) + >>> C2 = sp.spatial.distance.cdist(y, y) + >>> np.round(partial_gromov_wasserstein(C1, C2, a, b),2) + array([[0. , 0.25, 0. , 0. ], + [0.25, 0. , 0. , 0. ], + [0. , 0. , 0.25, 0. ], + [0. , 0. , 0. , 0.25]]) + >>> np.round(partial_gromov_wasserstein(C1, C2, a, b, m=0.25),2) + array([[0. , 0. , 0. , 0. ], + [0. , 0. , 0. , 0. ], + [0. , 0. , 0. , 0. ], + [0. , 0. , 0. , 0.25]]) + + References + ---------- + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + """ + + if m is None: + m = np.min((np.sum(p), np.sum(q))) + elif m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + elif m > np.min((np.sum(p), np.sum(q))): + raise ValueError("Problem infeasible. 
Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + if G0 is None: + G0 = np.outer(p, q) + + dim_G_extended = (len(p) + nb_dummies, len(q) + nb_dummies) + q_extended = np.append(q, [(np.sum(p) - m) / nb_dummies] * nb_dummies) + p_extended = np.append(p, [(np.sum(q) - m) / nb_dummies] * nb_dummies) + + cpt = 0 + err = 1 + eps = 1e-20 + if log: + log = {'err': []} + + while (err > tol and cpt < numItermax): + + Gprev = G0 + + M = gwgrad_partial(C1, C2, G0) + M[M < eps] = np.quantile(M, thres) + + M_emd = np.zeros(dim_G_extended) + M_emd[:len(p), :len(q)] = M + M_emd[-nb_dummies:, -nb_dummies:] = np.max(M) * 1e5 + M_emd = np.asarray(M_emd, dtype=np.float64) + + Gc, logemd = emd(p_extended, q_extended, M_emd, log=True, **kwargs) + + if logemd['warning'] is not None: + raise ValueError("Error in the EMD resolution: try to increase the" + " number of dummy points") + + G0 = Gc[:len(p), :len(q)] + + if cpt % 10 == 0: # to speed up the computations + err = np.linalg.norm(G0 - Gprev) + if log: + log['err'].append(err) + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}|{:12s}'.format( + 'It.', 'Err', 'Loss') + '\n' + '-' * 31) + print('{:5d}|{:8e}|{:8e}'.format(cpt, err, + gwloss_partial(C1, C2, G0))) + + cpt += 1 + + if log: + log['partial_gw_dist'] = gwloss_partial(C1, C2, G0) + return G0[:len(p), :len(q)], log + else: + return G0[:len(p), :len(q)] + + +def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, + thres=1, numItermax=1000, tol=1e-7, + log=False, verbose=False, **kwargs): + r""" + Solves the partial optimal transport problem + and returns the partial Gromov-Wasserstein discrepancy + + The function considers the following problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + + s.t. \gamma 1 \leq a \\ + \gamma^T 1 \leq b \\ + \gamma\geq 0 \\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + + where : + + - M is the metric cost matrix + - :math:`\Omega` is the entropic regularization term + :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are the sample weights + - m is the amount of mass to be transported + + The formulation of the problem has been proposed in [29]_ + + + Parameters + ---------- + C1 : ndarray, shape (ns, ns) + Metric cost matrix in the source space + C2 : ndarray, shape (nt, nt) + Metric costfr matrix in the target space + p : ndarray, shape (ns,) + Distribution in the source space + q : ndarray, shape (nt,) + Distribution in the target space + m : float, optional + Amount of mass to be transported (default: min (|p|_1, |q|_1)) + nb_dummies : int, optional + Number of dummy points to add (avoid instabilities in the EMD solver) + G0 : ndarray, shape (ns, nt), optional + Initialisation of the transportation matrix + thres : float, optional + quantile of the gradient matrix to populate the cost matrix when 0 + (default: 1) + numItermax : int, optional + Max number of iterations + tol : float, optional + tolerance for stopping iterations + log : bool, optional + return log if True + verbose : bool, optional + Print information along iterations + **kwargs : dict + parameters can be directly passed to the emd solver + + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). 
+ + + Returns + ------- + partial_gw_dist : (dim_a x dim_b) ndarray + partial GW discrepancy + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + >>> import ot + >>> import scipy as sp + >>> a = np.array([0.25] * 4) + >>> b = np.array([0.25] * 4) + >>> x = np.array([1,2,100,200]).reshape((-1,1)) + >>> y = np.array([3,2,98,199]).reshape((-1,1)) + >>> C1 = sp.spatial.distance.cdist(x, x) + >>> C2 = sp.spatial.distance.cdist(y, y) + >>> np.round(partial_gromov_wasserstein2(C1, C2, a, b),2) + 1.69 + >>> np.round(partial_gromov_wasserstein2(C1, C2, a, b, m=0.25),2) + 0.0 + + References + ---------- + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + """ + + partial_gw, log_gw = partial_gromov_wasserstein(C1, C2, p, q, m, + nb_dummies, G0, thres, + numItermax, tol, True, + verbose, **kwargs) + + log_gw['T'] = partial_gw + + if log: + return log_gw['partial_gw_dist'], log_gw + else: + return log_gw['partial_gw_dist'] + + +def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, + stopThr=1e-100, verbose=False, log=False): + r""" + Solves the partial optimal transport problem + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + s.t. \gamma 1 \leq a \\ + \gamma^T 1 \leq b \\ + \gamma\geq 0 \\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + + where : + + - M is the metric cost matrix + - :math:`\Omega` is the entropic regularization term + :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are the sample weights + - m is the amount of mass to be transported + + The formulation of the problem has been proposed in [3]_ (prop. 5) + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix + reg : float + Regularization term > 0 + m : float, optional + Amount of mass to be transported + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshold on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + >>> import ot + >>> a = [.1, .2] + >>> b = [.1, .1] + >>> M = [[0., 1.], [2., 3.]] + >>> np.round(entropic_partial_wasserstein(a, b, M, 1, 0.1), 2) + array([[0.06, 0.02], + [0.01, 0. ]]) + + References + ---------- + .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. + (2015). Iterative Bregman projections for regularized transportation + problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. 
+ + See Also + -------- + ot.partial.partial_wasserstein: exact Partial Wasserstein + """ + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + + dim_a, dim_b = M.shape + dx = np.ones(dim_a, dtype=np.float64) + dy = np.ones(dim_b, dtype=np.float64) + + if len(a) == 0: + a = np.ones(dim_a, dtype=np.float64) / dim_a + if len(b) == 0: + b = np.ones(dim_b, dtype=np.float64) / dim_b + + if m is None: + m = np.min((np.sum(a), np.sum(b))) * 1.0 + if m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + if m > np.min((np.sum(a), np.sum(b))): + raise ValueError("Problem infeasible. Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + log_e = {'err': []} + + # Next 3 lines equivalent to K=np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + np.multiply(K, m / np.sum(K), out=K) + + err, cpt = 1, 0 + + while (err > stopThr and cpt < numItermax): + Kprev = K + K1 = np.dot(np.diag(np.minimum(a / np.sum(K, axis=1), dx)), K) + K2 = np.dot(K1, np.diag(np.minimum(b / np.sum(K1, axis=0), dy))) + K = K2 * (m / np.sum(K2)) + + if np.any(np.isnan(K)) or np.any(np.isinf(K)): + print('Warning: numerical errors at iteration', cpt) + break + if cpt % 10 == 0: + err = np.linalg.norm(Kprev - K) + if log: + log_e['err'].append(err) + if verbose: + if cpt % 200 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 11) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt = cpt + 1 + log_e['partial_w_dist'] = np.sum(M * K) + if log: + return K, log_e + else: + return K + + +def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, + numItermax=1000, tol=1e-7, log=False, + verbose=False): + r""" + Returns the partial Gromov-Wasserstein transport between (C1,p) and (C2,q) + + The function solves the following optimization problem: + + .. math:: + GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + where : + + - C1 is the metric cost matrix in the source space + - C2 is the metric cost matrix in the target space + - p and q are the sample weights + - L : quadratic loss function + - :math:`\Omega` is the entropic regularization term + :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - m is the amount of mass to be transported + + The formulation of the GW problem has been proposed in [12]_ and the + partial GW in [29]_. 
+ + Parameters + ---------- + C1 : ndarray, shape (ns, ns) + Metric cost matrix in the source space + C2 : ndarray, shape (nt, nt) + Metric costfr matrix in the target space + p : ndarray, shape (ns,) + Distribution in the source space + q : ndarray, shape (nt,) + Distribution in the target space + reg: float + entropic regularization parameter + m : float, optional + Amount of mass to be transported (default: min (|p|_1, |q|_1)) + G0 : ndarray, shape (ns, nt), optional + Initialisation of the transportation matrix + numItermax : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + log : bool, optional + return log if True + verbose : bool, optional + Print information along iterations + + Examples + -------- + >>> import ot + >>> import scipy as sp + >>> a = np.array([0.25] * 4) + >>> b = np.array([0.25] * 4) + >>> x = np.array([1,2,100,200]).reshape((-1,1)) + >>> y = np.array([3,2,98,199]).reshape((-1,1)) + >>> C1 = sp.spatial.distance.cdist(x, x) + >>> C2 = sp.spatial.distance.cdist(y, y) + >>> np.round(entropic_partial_gromov_wasserstein(C1, C2, a, b,50), 2) + array([[0.12, 0.13, 0. , 0. ], + [0.13, 0.12, 0. , 0. ], + [0. , 0. , 0.25, 0. ], + [0. , 0. , 0. , 0.25]]) + >>> np.round(entropic_partial_gromov_wasserstein(C1, C2, a, b, 50, m=0.25), 2) + array([[0.02, 0.03, 0. , 0.03], + [0.03, 0.03, 0. , 0.03], + [0. , 0. , 0.03, 0. ], + [0.02, 0.02, 0. , 0.03]]) + + Returns + ------- + :math: `gamma` : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + References + ---------- + .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, + "Gromov-Wasserstein averaging of kernel and distance matrices." + International Conference on Machine Learning (ICML). 2016. + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + See Also + -------- + ot.partial.partial_gromov_wasserstein: exact Partial Gromov-Wasserstein + """ + + if G0 is None: + G0 = np.outer(p, q) + + if m is None: + m = np.min((np.sum(p), np.sum(q))) + elif m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + elif m > np.min((np.sum(p), np.sum(q))): + raise ValueError("Problem infeasible. Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + cpt = 0 + err = 1 + + loge = {'err': []} + + while (err > tol and cpt < numItermax): + Gprev = G0 + M_entr = gwgrad_partial(C1, C2, G0) + G0 = entropic_partial_wasserstein(p, q, M_entr, reg, m) + if cpt % 10 == 0: # to speed up the computations + err = np.linalg.norm(G0 - Gprev) + if log: + loge['err'].append(err) + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}|{:12s}'.format( + 'It.', 'Err', 'Loss') + '\n' + '-' * 31) + print('{:5d}|{:8e}|{:8e}'.format(cpt, err, + gwloss_partial(C1, C2, G0))) + + cpt += 1 + + if log: + loge['partial_gw_dist'] = gwloss_partial(C1, C2, G0) + return G0, loge + else: + return G0 + + +def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, + numItermax=1000, tol=1e-7, log=False, + verbose=False): + r""" + Returns the partial Gromov-Wasserstein discrepancy between (C1,p) and + (C2,q) + + The function solves the following optimization problem: + + .. math:: + GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + + s.t. 
+
+        s.t. \gamma \geq 0 \\
+             \gamma 1 \leq p \\
+             \gamma^T 1 \leq q \\
+             1^T \gamma^T 1 = m \leq \min\{\|p\|_1, \|q\|_1\}
+
+    where:
+
+    - C1 is the metric cost matrix in the source space
+    - C2 is the metric cost matrix in the target space
+    - p and q are the sample weights
+    - L is the quadratic loss function
+    - :math:`\Omega` is the entropic regularization term,
+      :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - m is the amount of mass to be transported
+
+    The formulation of the GW problem has been proposed in [12]_ and the
+    partial GW in [29]_.
+
+    Parameters
+    ----------
+    C1 : ndarray, shape (ns, ns)
+        Metric cost matrix in the source space
+    C2 : ndarray, shape (nt, nt)
+        Metric cost matrix in the target space
+    p : ndarray, shape (ns,)
+        Distribution in the source space
+    q : ndarray, shape (nt,)
+        Distribution in the target space
+    reg : float
+        Entropic regularization parameter
+    m : float, optional
+        Amount of mass to be transported (default: min(|p|_1, |q|_1))
+    G0 : ndarray, shape (ns, nt), optional
+        Initialisation of the transportation matrix
+    numItermax : int, optional
+        Max number of iterations
+    tol : float, optional
+        Stop threshold on error (>0)
+    log : bool, optional
+        return log if True
+    verbose : bool, optional
+        Print information along iterations
+
+    Returns
+    -------
+    partial_gw_dist : float
+        partial Gromov-Wasserstein distance
+    log : dict
+        log dictionary returned only if `log` is `True`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy as sp
+    >>> import ot
+    >>> a = np.array([0.25] * 4)
+    >>> b = np.array([0.25] * 4)
+    >>> x = np.array([1,2,100,200]).reshape((-1,1))
+    >>> y = np.array([3,2,98,199]).reshape((-1,1))
+    >>> C1 = sp.spatial.distance.cdist(x, x)
+    >>> C2 = sp.spatial.distance.cdist(y, y)
+    >>> np.round(entropic_partial_gromov_wasserstein2(C1, C2, a, b, 50), 2)
+    1.87
+
+    References
+    ----------
+    .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
+    .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov-
+        Wasserstein with Applications on Positive-Unlabeled Learning".
+        arXiv preprint arXiv:2002.08276.
+    """
+
+    partial_gw, log_gw = entropic_partial_gromov_wasserstein(C1, C2, p, q, reg,
+                                                             m, G0, numItermax,
+                                                             tol, True,
+                                                             verbose)
+
+    log_gw['T'] = partial_gw
+
+    if log:
+        return log_gw['partial_gw_dist'], log_gw
+    else:
+        return log_gw['partial_gw_dist']
diff --git a/ot/plot.py b/ot/plot.py
--- a/ot/plot.py
+++ b/ot/plot.py
@@ -78,9 +78,10 @@ def plot2D_samples_mat(xs, xt, G, thr=1e-8, **kwargs):
     thr : float, optional
         threshold above which the line is drawn
     **kwargs : dict
-        paameters given to the plot functions (default color is black if
+        parameters given to the plot functions (default color is black if
         nothing given)
     """
+
     if ('color' not in kwargs) and ('c' not in kwargs):
         kwargs['color'] = 'k'
     mx = G.max()
diff --git a/ot/smooth.py b/ot/smooth.py
index 5a8e4b5..81f6a3e 100644
--- a/ot/smooth.py
+++ b/ot/smooth.py
@@ -26,7 +26,9 @@
 #          Remi Flamary <remi.flamary@unice.fr>
 
 """
-Implementation of
+Smooth and Sparse Optimal Transport solvers (KL and L2 reg.)
+
+Implementation of:
 Smooth and Sparse Optimal Transport.
 Mathieu Blondel, Vivien Seguy, Antoine Rolet.
 In Proc. of AISTATS 2018.
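
A minimal usage sketch for the new ot.partial entropic solvers introduced above. It is standalone Python, not part of the diff; it assumes the 0.7.0 signatures shown in this patch and reuses the toy data from the docstring examples.

    # Sketch only: assumes ot.partial as added in this patch (POT 0.7.0).
    import numpy as np
    import scipy.spatial.distance as ssd
    import ot

    p = np.array([0.25] * 4)
    q = np.array([0.25] * 4)
    x = np.array([1, 2, 100, 200]).reshape((-1, 1))
    y = np.array([3, 2, 98, 199]).reshape((-1, 1))
    C1 = ssd.cdist(x, x)  # pairwise costs within the source samples
    C2 = ssd.cdist(y, y)  # pairwise costs within the target samples

    # Entropic partial GW plan; m defaults to min(|p|_1, |q|_1) = 1 here
    G = ot.partial.entropic_partial_gromov_wasserstein(C1, C2, p, q, reg=50)

    # Transport only a quarter of the mass and read the loss from the log
    G025, log = ot.partial.entropic_partial_gromov_wasserstein(
        C1, C2, p, q, reg=50, m=0.25, log=True)
    print(log['partial_gw_dist'])

    # Scalar discrepancy directly
    gw_dist = ot.partial.entropic_partial_gromov_wasserstein2(C1, C2, p, q,
                                                              reg=50)

    # Entropic partial Wasserstein on a linear ground cost
    M = ssd.cdist(x, y)
    T = ot.partial.entropic_partial_wasserstein(p, q, M, reg=1, m=0.5)

Choosing m strictly below min(|p|_1, |q|_1) is what makes the problem "partial": rows and columns of the plan are then only bounded by, not equal to, the marginals.
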
diff --git a/ot/unbalanced.py b/ot/unbalanced.py
index d516dfc..e37f10c 100644
--- a/ot/unbalanced.py
+++ b/ot/unbalanced.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Regularized Unbalanced OT
+Regularized Unbalanced OT solvers
 """
 
 # Author: Hicham Janati <hicham.janati@inria.fr>
@@ -384,10 +384,9 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000,
 
     fi = reg_m / (reg_m + reg)
 
-    cpt = 0
     err = 1.
 
-    while (err > stopThr and cpt < numItermax):
+    for i in range(numItermax):
         uprev = u
         vprev = v
 
@@ -401,28 +400,27 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000,
                 or np.any(np.isinf(u)) or np.any(np.isinf(v))):
             # we have reached the machine precision
             # come back to previous solution and quit loop
-            warnings.warn('Numerical errors at iteration %s' % cpt)
+            warnings.warn('Numerical errors at iteration %s' % i)
             u = uprev
             v = vprev
             break
-        if cpt % 10 == 0:
-            # we can speed up the process by checking for the error only all
-            # the 10th iterations
-            err_u = abs(u - uprev).max() / max(abs(u).max(), abs(uprev).max(), 1.)
-            err_v = abs(v - vprev).max() / max(abs(v).max(), abs(vprev).max(), 1.)
-            err = 0.5 * (err_u + err_v)
-            if log:
-                log['err'].append(err)
+
+        err_u = abs(u - uprev).max() / max(abs(u).max(), abs(uprev).max(), 1.)
+        err_v = abs(v - vprev).max() / max(abs(v).max(), abs(vprev).max(), 1.)
+        err = 0.5 * (err_u + err_v)
+        if log:
+            log['err'].append(err)
         if verbose:
-            if cpt % 200 == 0:
+            if i % 50 == 0:
                 print(
                     '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
-            print('{:5d}|{:8e}|'.format(cpt, err))
-        cpt += 1
+            print('{:5d}|{:8e}|'.format(i, err))
+        if err < stopThr:
+            break
 
     if log:
-        log['logu'] = np.log(u + 1e-16)
-        log['logv'] = np.log(v + 1e-16)
+        log['logu'] = np.log(u + 1e-300)
+        log['logv'] = np.log(v + 1e-300)
 
     if n_hists:
         # return only loss
         res = np.einsum('ik,ij,jk,ij->k', u, K, v, M)
@@ -747,8 +745,8 @@ def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3,
     alpha = np.zeros(dim)
     beta = np.zeros(dim)
     q = np.ones(dim) / dim
-    while (err > stopThr and cpt < numItermax):
-        qprev = q
+    for i in range(numItermax):
+        qprev = q.copy()
         Kv = K.dot(v)
         f_alpha = np.exp(- alpha / (reg + reg_m))
         f_beta = np.exp(- beta / (reg + reg_m))
@@ -777,7 +775,7 @@
-            warnings.warn('Numerical errors at iteration %s' % cpt)
+            warnings.warn('Numerical errors at iteration %s' % i)
             q = qprev
             break
-        if (cpt % 10 == 0 and not absorbing) or cpt == 0:
+        if (i % 10 == 0 and not absorbing) or i == 0:
             # we can speed up the process by checking for the error only all
             # the 10th iterations
             err = abs(q - qprev).max() / max(abs(q).max(),
@@ -785,20 +783,21 @@
             if log:
                 log['err'].append(err)
             if verbose:
-                if cpt % 50 == 0:
+                if i % 50 == 0:
                     print(
                         '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
-                print('{:5d}|{:8e}|'.format(cpt, err))
+                print('{:5d}|{:8e}|'.format(i, err))
+            if err < stopThr:
+                break
 
-        cpt += 1
     if err > stopThr:
         warnings.warn("Stabilized Unbalanced Sinkhorn did not converge." +
                       "Try a larger entropy `reg` or a lower mass `reg_m`." +
+ "Or a larger absorption threshold `tau`.") if log: - log['niter'] = cpt - log['logu'] = np.log(u + 1e-16) - log['logv'] = np.log(v + 1e-16) + log['niter'] = i + log['logu'] = np.log(u + 1e-300) + log['logv'] = np.log(v + 1e-300) return q, log else: return q @@ -882,15 +881,15 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, fi = reg_m / (reg_m + reg) - v = np.ones((dim, n_hists)) / dim - u = np.ones((dim, 1)) / dim - - cpt = 0 + v = np.ones((dim, n_hists)) + u = np.ones((dim, 1)) + q = np.ones(dim) err = 1. - while (err > stopThr and cpt < numItermax): - uprev = u - vprev = v + for i in range(numItermax): + uprev = u.copy() + vprev = v.copy() + qprev = q.copy() Kv = K.dot(v) u = (A / Kv) ** fi @@ -905,31 +904,30 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, or np.any(np.isinf(u)) or np.any(np.isinf(v))): # we have reached the machine precision # come back to previous solution and quit loop - warnings.warn('Numerical errors at iteration %s' % cpt) + warnings.warn('Numerical errors at iteration %s' % i) u = uprev v = vprev + q = qprev break - if cpt % 10 == 0: - # we can speed up the process by checking for the error only all - # the 10th iterations - err_u = abs(u - uprev).max() - err_u /= max(abs(u).max(), abs(uprev).max(), 1.) - err_v = abs(v - vprev).max() - err_v /= max(abs(v).max(), abs(vprev).max(), 1.) - err = 0.5 * (err_u + err_v) - if log: - log['err'].append(err) - if verbose: - if cpt % 50 == 0: - print( - '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) - print('{:5d}|{:8e}|'.format(cpt, err)) + # compute change in barycenter + err = abs(q - qprev).max() + err /= max(abs(q).max(), abs(qprev).max(), 1.) + if log: + log['err'].append(err) + # if barycenter did not change + at least 10 iterations - stop + if err < stopThr and i > 10: + break + + if verbose: + if i % 10 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(i, err)) - cpt += 1 if log: - log['niter'] = cpt - log['logu'] = np.log(u + 1e-16) - log['logv'] = np.log(v + 1e-16) + log['niter'] = i + log['logu'] = np.log(u + 1e-300) + log['logv'] = np.log(v + 1e-300) return q, log else: return q @@ -1002,12 +1000,14 @@ def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, if method.lower() == 'sinkhorn': return barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, + weights=weights, numItermax=numItermax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) elif method.lower() == 'sinkhorn_stabilized': return barycenter_unbalanced_stabilized(A, M, reg, reg_m, + weights=weights, numItermax=numItermax, stopThr=stopThr, verbose=verbose, @@ -1015,6 +1015,7 @@ def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, elif method.lower() in ['sinkhorn_reg_scaling']: warnings.warn('Method not implemented yet. 
         return barycenter_unbalanced(A, M, reg, reg_m,
+                                     weights=weights,
                                      numItermax=numItermax,
                                      stopThr=stopThr, verbose=verbose,
                                      log=log, **kwargs)
diff --git a/ot/utils.py b/ot/utils.py
index b71458b..f9911a1 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -49,6 +49,12 @@ def kernel(x1, x2, method='gaussian', sigma=1, **kwargs):
     return K
 
 
+def laplacian(x):
+    """Compute Laplacian matrix (degree matrix minus similarity matrix)"""
+    L = np.diag(np.sum(x, axis=0)) - x
+    return L
+
+
 def unif(n):
     """ return a uniform histogram of length n (simplex)
 
@@ -200,6 +206,28 @@ def dots(*args):
     return reduce(np.dot, args)
 
 
+def label_normalization(y, start=0):
+    """ Transform labels to start at a given value
+
+    Parameters
+    ----------
+    y : array-like, shape (n, )
+        The vector of labels to be normalized.
+    start : int
+        Desired value for the smallest label in y (default=0)
+
+    Returns
+    -------
+    y : array-like, shape (n, )
+        The input vector of labels normalized according to the given start
+        value.
+    """
+
+    diff = np.min(np.unique(y)) - start
+    if diff != 0:
+        y -= diff
+    return y
+
+
 def fun(f, q_in, q_out):
     """ Utility function for parmap with no serializing problems """
     while True:
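
The ot/unbalanced.py hunks above make barycenter_unbalanced forward its weights argument to the per-method solvers, so non-uniform barycenter weights now take effect. A standalone sketch (not part of the diff; the toy histograms and regularization values are hypothetical):

    # Sketch only: assumes ot.unbalanced as patched above (POT 0.7.0).
    import numpy as np
    import ot

    dim = 10
    grid = np.arange(dim, dtype=np.float64)
    a1 = np.exp(-(grid - 2.0) ** 2 / 2.0)
    a1 /= a1.sum()  # first input histogram
    a2 = np.exp(-(grid - 7.0) ** 2 / 2.0)
    a2 /= a2.sum()  # second input histogram
    A = np.vstack((a1, a2)).T  # histograms as columns, shape (dim, 2)
    M = (grid[:, None] - grid[None, :]) ** 2  # squared ground cost
    M /= M.max()

    # Uneven weights are now passed through to the sinkhorn solver
    bary = ot.unbalanced.barycenter_unbalanced(A, M, reg=1e-1, reg_m=1.0,
                                               weights=np.array([0.3, 0.7]),
                                               method='sinkhorn')
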
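A short sketch of the two new ot.utils helpers added in the last hunks (standalone Python, not part of the diff; the toy arrays are illustrative):

    # Sketch only: exercises ot.utils.label_normalization and
    # ot.utils.laplacian exactly as defined in the hunks above.
    import numpy as np
    import ot

    y = np.array([2, 3, 3, 5, 2])
    print(ot.utils.label_normalization(y.copy()))           # -> [0 1 1 3 0]
    print(ot.utils.label_normalization(y.copy(), start=1))  # -> [1 2 2 4 1]
    # note: the input array is shifted in place, hence the copies above

    S = np.array([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])  # symmetric similarity matrix
    L = ot.utils.laplacian(S)     # diag(column sums) minus S
    print(L)

Both helpers support the Laplacian-regularized domain adaptation code added to ot/da.py in this same release.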