author     tvayer <titouan.vayer@gmail.com>    2019-05-29 14:24:05 +0200
committer  tvayer <titouan.vayer@gmail.com>    2019-05-29 14:24:05 +0200
commit     63bbeb34e48f02c97a762dab5232158d90a5cffc (patch)
tree       853026b5854b6e4b01fdf750db139985b3dd596f /ot
parent     f70aabfcc11f92181e0dc987b341bad8ec030d75 (diff)
parent     f66ab58c7c895011fd37bafd3e848828399c56c4 (diff)

Merge remote-tracking branch 'rflamary/master'

merge pot
Diffstat (limited to 'ot'):

-rw-r--r--  ot/__init__.py            |   4
-rw-r--r--  ot/bregman.py             | 611
-rw-r--r--  ot/da.py                  | 407
-rw-r--r--  ot/datasets.py            |   4
-rw-r--r--  ot/externals/funcsigs.py  |  46
-rw-r--r--  ot/gpu/__init__.py        |  33
-rw-r--r--  ot/gpu/bregman.py         | 148
-rw-r--r--  ot/gpu/da.py              | 213
-rw-r--r--  ot/gpu/utils.py           | 101
-rw-r--r--  ot/gromov.py              |   2
-rw-r--r--  ot/lp/__init__.py         |  95
-rw-r--r--  ot/lp/cvx.py              |   3
-rw-r--r--  ot/stochastic.py          | 736
-rw-r--r--  ot/utils.py               |  31

14 files changed, 1822 insertions, 612 deletions
diff --git a/ot/__init__.py b/ot/__init__.py index 995ce33..b74b924 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -19,7 +19,7 @@ from . import datasets from . import da from . import gromov from . import smooth - +from . import stochastic # OT functions from .lp import emd, emd2 @@ -29,7 +29,7 @@ from .da import sinkhorn_lpl1_mm # utils functions from .utils import dist, unif, tic, toc, toq -__version__ = "0.4.0" +__version__ = "0.5.1" __all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', diff --git a/ot/bregman.py b/ot/bregman.py index ffa6202..321712b 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -11,6 +11,7 @@ Bregman projections for regularized OT # License: MIT License import numpy as np +from .utils import unif, dist def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, @@ -49,7 +50,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, reg : float Regularization term >0 method : str - method used for the solver either 'sinkhorn', 'sinkhorn_stabilized' or + method used for the solver either 'sinkhorn', 'greenkhorn', 'sinkhorn_stabilized' or 'sinkhorn_epsilon_scaling', see those function for specific parameters numItermax : int, optional Max number of iterations @@ -105,6 +106,10 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, def sink(): return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + elif method.lower() == 'greenkhorn': + def sink(): + return greenkhorn(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, log=log) elif method.lower() == 'sinkhorn_stabilized': def sink(): return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax, @@ -118,7 +123,8 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, print('Warning : unknown method using classic Sinkhorn Knopp') def sink(): - return sinkhorn_knopp(a, b, M, reg, **kwargs) + return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax, + stopThr=stopThr, verbose=verbose, log=log, **kwargs) return sink() @@ -199,6 +205,8 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. + [21] Altschuler J., Weed J., Rigollet P. : Near-linear time approximation algorithms for optimal transport via Sinkhorn iteration, Advances in Neural Information Processing Systems (NIPS) 31, 2017 + See Also @@ -206,6 +214,7 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, ot.lp.emd : Unregularized OT ot.optim.cg : General regularized OT ot.bregman.sinkhorn_knopp : Classic Sinkhorn [2] + ot.bregman.greenkhorn : Greenkhorn [21] ot.bregman.sinkhorn_stabilized: Stabilized sinkhorn [9][10] ot.bregman.sinkhorn_epsilon_scaling: Sinkhorn with epslilon scaling [9][10] @@ -346,8 +355,13 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, # print(reg) - K = np.exp(-M / reg) + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + # print(np.min(K)) + tmp2 = np.empty(b.shape, dtype=M.dtype) Kp = (1 / a).reshape(-1, 1) * K cpt = 0 @@ -355,13 +369,14 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, while (err > stopThr and cpt < numItermax): uprev = u vprev = v + KtransposeU = np.dot(K.T, u) v = np.divide(b, KtransposeU) u = 1. 
/ np.dot(Kp, v) - if (np.any(KtransposeU == 0) or - np.any(np.isnan(u)) or np.any(np.isnan(v)) or - np.any(np.isinf(u)) or np.any(np.isinf(v))): + if (np.any(KtransposeU == 0) + or np.any(np.isnan(u)) or np.any(np.isnan(v)) + or np.any(np.isinf(u)) or np.any(np.isinf(v))): # we have reached the machine precision # come back to previous solution and quit loop print('Warning: numerical errors at iteration', cpt) @@ -375,8 +390,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, err = np.sum((u - uprev)**2) / np.sum((u)**2) + \ np.sum((v - vprev)**2) / np.sum((v)**2) else: - transp = u.reshape(-1, 1) * (K * v) - err = np.linalg.norm((np.sum(transp, axis=0) - b))**2 + # compute right marginal tmp2= (diag(u)Kdiag(v))^T1 + np.einsum('i,ij,j->j', u, K, v, out=tmp2) + err = np.linalg.norm(tmp2 - b)**2 # violation of marginal if log: log['err'].append(err) @@ -391,10 +407,7 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, log['v'] = v if nbb: # return only loss - res = np.zeros((nbb)) - for i in range(nbb): - res[i] = np.sum( - u[:, i].reshape((-1, 1)) * K * v[:, i].reshape((1, -1)) * M) + res = np.einsum('ik,ij,jk,ij->k', u, K, v, M) if log: return res, log else: @@ -408,6 +421,159 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, return u.reshape((-1, 1)) * K * v.reshape((1, -1)) +def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log=False): + """ + Solve the entropic regularization optimal transport problem and return the OT matrix + + The algorithm used is based on the paper + + Near-linear time approximation algorithms for optimal transport via Sinkhorn iteration + by Jason Altschuler, Jonathan Weed, Philippe Rigollet + appeared at NIPS 2017 + + which is a stochastic version of the Sinkhorn-Knopp algorithm [2]. + + The function solves the following optimization problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + s.t. \gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + where : + + - M is the (ns,nt) metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are source and target weights (sum to 1) + + + + Parameters + ---------- + a : np.ndarray (ns,) + samples weights in the source domain + b : np.ndarray (nt,) or np.ndarray (nt,nbb) + samples in the target domain, compute sinkhorn with multiple targets + and fixed M if b is a matrix (return OT loss + dual variables in log) + M : np.ndarray (ns,nt) + loss matrix + reg : float + Regularization term >0 + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + log : bool, optional + record log if True + + + Returns + ------- + gamma : (ns x nt) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + Examples + -------- + + >>> import ot + >>> a=[.5,.5] + >>> b=[.5,.5] + >>> M=[[0.,1.],[1.,0.]] + >>> ot.bregman.greenkhorn(a,b,M,1) + array([[ 0.36552929, 0.13447071], + [ 0.13447071, 0.36552929]]) + + + References + ---------- + + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + [22] J. Altschuler, J.Weed, P. 
Rigollet : Near-linear time approximation algorithms for optimal transport via Sinkhorn iteration, Advances in Neural Information Processing Systems (NIPS) 31, 2017 + + + See Also + -------- + ot.lp.emd : Unregularized OT + ot.optim.cg : General regularized OT + + """ + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + + if len(a) == 0: + a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] + if len(b) == 0: + b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + + n = a.shape[0] + m = b.shape[0] + + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty_like(M) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + u = np.full(n, 1. / n) + v = np.full(m, 1. / m) + G = u[:, np.newaxis] * K * v[np.newaxis, :] + + viol = G.sum(1) - a + viol_2 = G.sum(0) - b + stopThr_val = 1 + + if log: + log = dict() + log['u'] = u + log['v'] = v + + for i in range(numItermax): + i_1 = np.argmax(np.abs(viol)) + i_2 = np.argmax(np.abs(viol_2)) + m_viol_1 = np.abs(viol[i_1]) + m_viol_2 = np.abs(viol_2[i_2]) + stopThr_val = np.maximum(m_viol_1, m_viol_2) + + if m_viol_1 > m_viol_2: + old_u = u[i_1] + u[i_1] = a[i_1] / (K[i_1, :].dot(v)) + G[i_1, :] = u[i_1] * K[i_1, :] * v + + viol[i_1] = u[i_1] * K[i_1, :].dot(v) - a[i_1] + viol_2 += (K[i_1, :].T * (u[i_1] - old_u) * v) + + else: + old_v = v[i_2] + v[i_2] = b[i_2] / (K[:, i_2].T.dot(u)) + G[:, i_2] = u * K[:, i_2] * v[i_2] + #aviol = (G@one_m - a) + #aviol_2 = (G.T@one_n - b) + viol += (-old_v + v[i_2]) * K[:, i_2] * u + viol_2[i_2] = v[i_2] * K[:, i_2].dot(u) - b[i_2] + + #print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2))) + + if stopThr_val <= stopThr: + break + else: + print('Warning: Algorithm did not converge') + + if log: + log['u'] = u + log['v'] = v + + if log: + return G, log + else: + return G + + def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=20, log=False, **kwargs): """ @@ -532,13 +698,13 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, def get_K(alpha, beta): """log space computation""" - return np.exp(-(M - alpha.reshape((na, 1)) - - beta.reshape((1, nb))) / reg) + return np.exp(-(M - alpha.reshape((na, 1)) + - beta.reshape((1, nb))) / reg) def get_Gamma(alpha, beta, u, v): """log space gamma computation""" - return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / - reg + np.log(u.reshape((na, 1))) + np.log(v.reshape((1, nb)))) + return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) + / reg + np.log(u.reshape((na, 1))) + np.log(v.reshape((1, nb)))) # print(np.min(K)) @@ -748,8 +914,8 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne def get_K(alpha, beta): """log space computation""" - return np.exp(-(M - alpha.reshape((na, 1)) - - beta.reshape((1, nb))) / reg) + return np.exp(-(M - alpha.reshape((na, 1)) + - beta.reshape((1, nb))) / reg) # print(np.min(K)) def get_reg(n): # exponential decreasing @@ -917,6 +1083,116 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, return geometricBar(weights, UKv) +def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1e-9, stabThr=1e-30, verbose=False, log=False): + """Compute the entropic regularized wasserstein barycenter of distributions A + where A is a collection of 2D images. + + The function solves the following optimization problem: + + .. 
math:: + \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i) + + where : + + - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance (see ot.bregman.sinkhorn) + - :math:`\mathbf{a}_i` are training distributions (2D images) in the mast two dimensions of matrix :math:`\mathbf{A}` + - reg is the regularization strength scalar value + + The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [21]_ + + Parameters + ---------- + A : np.ndarray (n,w,h) + n distributions (2D images) of size w x h + reg : float + Regularization term >0 + weights : np.ndarray (n,) + Weights of each image on the simplex (barycentric coodinates) + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + stabThr : float, optional + Stabilization threshold to avoid numerical precision issue + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + a : (w,h) ndarray + 2D Wasserstein barycenter + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [21] Solomon, J., De Goes, F., Peyré, G., Cuturi, M., Butscher, A., Nguyen, A. & Guibas, L. (2015). + Convolutional wasserstein distances: Efficient optimal transportation on geometric domains + ACM Transactions on Graphics (TOG), 34(4), 66 + + + """ + + if weights is None: + weights = np.ones(A.shape[0]) / A.shape[0] + else: + assert(len(weights) == A.shape[0]) + + if log: + log = {'err': []} + + b = np.zeros_like(A[0, :, :]) + U = np.ones_like(A) + KV = np.ones_like(A) + + cpt = 0 + err = 1 + + # build the convolution operator + t = np.linspace(0, 1, A.shape[1]) + [Y, X] = np.meshgrid(t, t) + xi1 = np.exp(-(X - Y)**2 / reg) + + def K(x): + return np.dot(np.dot(xi1, x), xi1) + + while (err > stopThr and cpt < numItermax): + + bold = b + cpt = cpt + 1 + + b = np.zeros_like(A[0, :, :]) + for r in range(A.shape[0]): + KV[r, :, :] = K(A[r, :, :] / np.maximum(stabThr, K(U[r, :, :]))) + b += weights[r] * np.log(np.maximum(stabThr, U[r, :, :] * KV[r, :, :])) + b = np.exp(b) + for r in range(A.shape[0]): + U[r, :, :] = b / np.maximum(stabThr, KV[r, :, :]) + + if cpt % 10 == 1: + err = np.sum(np.abs(bold - b)) + # log and verbose print + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + if log: + log['niter'] = cpt + log['U'] = U + return b, log + else: + return b + + def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, stopThr=1e-3, verbose=False, log=False): """ @@ -1023,3 +1299,302 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, return np.sum(K0, axis=1), log else: return np.sum(K0, axis=1) + + +def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): + ''' + Solve the entropic regularization optimal transport problem and return the + OT matrix from empirical data + + The function solves the following optimization problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + s.t. 
\gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + where : + + - :math:`M` is the (ns,nt) metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`a` and :math:`b` are source and target weights (sum to 1) + + + Parameters + ---------- + X_s : np.ndarray (ns, d) + samples in the source domain + X_t : np.ndarray (nt, d) + samples in the target domain + reg : float + Regularization term >0 + a : np.ndarray (ns,) + samples weights in the source domain + b : np.ndarray (nt,) + samples weights in the target domain + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + gamma : (ns x nt) ndarray + Regularized optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + Examples + -------- + + >>> n_s = 2 + >>> n_t = 2 + >>> reg = 0.1 + >>> X_s = np.reshape(np.arange(n_s), (n_s, 1)) + >>> X_t = np.reshape(np.arange(0, n_t), (n_t, 1)) + >>> emp_sinkhorn = empirical_sinkhorn(X_s, X_t, reg, verbose=False) + >>> print(emp_sinkhorn) + >>> [[4.99977301e-01 2.26989344e-05] + [2.26989344e-05 4.99977301e-01]] + + + References + ---------- + + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + + .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519. + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. + ''' + + if a is None: + a = unif(np.shape(X_s)[0]) + if b is None: + b = unif(np.shape(X_t)[0]) + + M = dist(X_s, X_t, metric=metric) + + if log: + pi, log = sinkhorn(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=True, **kwargs) + return pi, log + else: + pi = sinkhorn(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=False, **kwargs) + return pi + + +def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): + ''' + Solve the entropic regularization optimal transport problem from empirical + data and return the OT loss + + + The function solves the following optimization problem: + + .. math:: + W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + s.t. 
\gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + where : + + - :math:`M` is the (ns,nt) metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`a` and :math:`b` are source and target weights (sum to 1) + + + Parameters + ---------- + X_s : np.ndarray (ns, d) + samples in the source domain + X_t : np.ndarray (nt, d) + samples in the target domain + reg : float + Regularization term >0 + a : np.ndarray (ns,) + samples weights in the source domain + b : np.ndarray (nt,) + samples weights in the target domain + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + gamma : (ns x nt) ndarray + Regularized optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + Examples + -------- + + >>> n_s = 2 + >>> n_t = 2 + >>> reg = 0.1 + >>> X_s = np.reshape(np.arange(n_s), (n_s, 1)) + >>> X_t = np.reshape(np.arange(0, n_t), (n_t, 1)) + >>> loss_sinkhorn = empirical_sinkhorn2(X_s, X_t, reg, verbose=False) + >>> print(loss_sinkhorn) + >>> [4.53978687e-05] + + + References + ---------- + + .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + + .. [9] Schmitzer, B. (2016). Stabilized Sparse Scaling Algorithms for Entropy Regularized Transport Problems. arXiv preprint arXiv:1610.06519. + + .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. + ''' + + if a is None: + a = unif(np.shape(X_s)[0]) + if b is None: + b = unif(np.shape(X_t)[0]) + + M = dist(X_s, X_t, metric=metric) + + if log: + sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_loss, log + else: + sinkhorn_loss = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + return sinkhorn_loss + + +def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): + ''' + Compute the sinkhorn divergence loss from empirical data + + The function solves the following optimization problems and return the + sinkhorn divergence :math:`S`: + + .. math:: + + W &= \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + W_a &= \min_{\gamma_a} <\gamma_a,M_a>_F + reg\cdot\Omega(\gamma_a) + + W_b &= \min_{\gamma_b} <\gamma_b,M_b>_F + reg\cdot\Omega(\gamma_b) + + S &= W - 1/2 * (W_a + W_b) + + .. math:: + s.t. \gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + + \gamma_a 1 = a + + \gamma_a^T 1= a + + \gamma_a\geq 0 + + \gamma_b 1 = b + + \gamma_b^T 1= b + + \gamma_b\geq 0 + where : + + - :math:`M` (resp. 
:math:`M_a, M_b`) is the (ns,nt) metric cost matrix (resp (ns, ns) and (nt, nt)) + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`a` and :math:`b` are source and target weights (sum to 1) + + + Parameters + ---------- + X_s : np.ndarray (ns, d) + samples in the source domain + X_t : np.ndarray (nt, d) + samples in the target domain + reg : float + Regularization term >0 + a : np.ndarray (ns,) + samples weights in the source domain + b : np.ndarray (nt,) + samples weights in the target domain + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshol on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + gamma : (ns x nt) ndarray + Regularized optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + Examples + -------- + + >>> n_s = 2 + >>> n_t = 4 + >>> reg = 0.1 + >>> X_s = np.reshape(np.arange(n_s), (n_s, 1)) + >>> X_t = np.reshape(np.arange(0, n_t), (n_t, 1)) + >>> emp_sinkhorn_div = empirical_sinkhorn_divergence(X_s, X_t, reg) + >>> print(emp_sinkhorn_div) + >>> [2.99977435] + + + References + ---------- + + .. [23] Aude Genevay, Gabriel Peyré, Marco Cuturi, Learning Generative Models with Sinkhorn Divergences, Proceedings of the Twenty-First International Conference on Artficial Intelligence and Statistics, (AISTATS) 21, 2018 + ''' + if log: + sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) + + log = {} + log['sinkhorn_loss_ab'] = sinkhorn_loss_ab + log['sinkhorn_loss_a'] = sinkhorn_loss_a + log['sinkhorn_loss_b'] = sinkhorn_loss_b + log['log_sinkhorn_ab'] = log_ab + log['log_sinkhorn_a'] = log_a + log['log_sinkhorn_b'] = log_b + + return max(0, sinkhorn_div), log + + else: + sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + + sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) + return max(0, sinkhorn_div) @@ -15,7 +15,7 @@ import scipy.linalg as linalg from .bregman import sinkhorn from .lp import emd from .utils import unif, dist, kernel, cost_normalization -from .utils import check_params, deprecated, BaseEstimator +from .utils import check_params, BaseEstimator from .optim import cg from .optim import gcg @@ -473,22 +473,24 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', Weight for the linear OT loss (>0) eta : float, optional Regularization term for the linear mapping L (>0) - bias : bool,optional - Estimate linear mapping with constant bias 
kerneltype : str,optional kernel used by calling function ot.utils.kernel (gaussian by default) sigma : float, optional Gaussian kernel bandwidth. + bias : bool,optional + Estimate linear mapping with constant bias + verbose : bool, optional + Print information along iterations + verbose2 : bool, optional + Print information along iterations numItermax : int, optional Max number of BCD iterations - stopThr : float, optional - Stop threshold on relative loss decrease (>0) numInnerItermax : int, optional Max number of iterations (inner CG solver) stopInnerThr : float, optional Stop threshold on error (inner CG solver) (>0) - verbose : bool, optional - Print information along iterations + stopThr : float, optional + Stop threshold on relative loss decrease (>0) log : bool, optional record log if True @@ -643,7 +645,8 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, The function estimates the optimal linear operator that aligns the two empirical distributions. This is equivalent to estimating the closed form mapping between two Gaussian distributions :math:`N(\mu_s,\Sigma_s)` - and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in remark 2.29 in [15]. + and :math:`N(\mu_t,\Sigma_t)` as proposed in [14] and discussed in remark + 2.29 in [15]. The linear operator from source to target :math:`M` @@ -740,288 +743,6 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, return A, b -@deprecated("The class OTDA is deprecated in 0.3.1 and will be " - "removed in 0.5" - "\n\tfor standard transport use class EMDTransport instead.") -class OTDA(object): - - """Class for domain adaptation with optimal transport as proposed in [5] - - - References - ---------- - - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, - "Optimal Transport for Domain Adaptation," in IEEE Transactions on - Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 - - """ - - def __init__(self, metric='sqeuclidean', norm=None): - """ Class initialization""" - self.xs = 0 - self.xt = 0 - self.G = 0 - self.metric = metric - self.norm = norm - self.computed = False - - def fit(self, xs, xt, ws=None, wt=None, max_iter=100000): - """Fit domain adaptation between samples is xs and xt - (with optional weights)""" - self.xs = xs - self.xt = xt - - if wt is None: - wt = unif(xt.shape[0]) - if ws is None: - ws = unif(xs.shape[0]) - - self.ws = ws - self.wt = wt - - self.M = dist(xs, xt, metric=self.metric) - self.M = cost_normalization(self.M, self.norm) - self.G = emd(ws, wt, self.M, max_iter) - self.computed = True - - def interp(self, direction=1): - """Barycentric interpolation for the source (1) or target (-1) samples - - This Barycentric interpolation solves for each source (resp target) - sample xs (resp xt) the following optimization problem: - - .. math:: - arg\min_x \sum_i \gamma_{k,i} c(x,x_i^t) - - where k is the index of the sample in xs - - For the moment only squared euclidean distance is provided but more - metric could be used in the future. 
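[Editor's note] The barycentric interpolation described in this (deprecated) `OTDA.interp` docstring reduces, for the squared euclidean metric, to a coupling-weighted mean of the target samples. A minimal numpy sketch of that step under a toy coupling `G` (illustrative data, not POT's API):

```python
import numpy as np

# Barycentric mapping: source sample i goes to the G-weighted mean of
# the target samples, i.e. argmin_x sum_j G[i, j] * ||x - xt[j]||^2.
rng = np.random.RandomState(0)
xt = rng.randn(4, 2)               # toy target samples
G = rng.rand(3, 4)
G /= G.sum()                       # toy coupling: 3 source / 4 target points
ws = G.sum(axis=1, keepdims=True)  # row marginals (source weights)

xs_mapped = np.dot(G / ws, xt)     # the weighted mean computed by interp()
```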
- - """ - if direction > 0: # >0 then source to target - G = self.G - w = self.ws.reshape((self.xs.shape[0], 1)) - x = self.xt - else: - G = self.G.T - w = self.wt.reshape((self.xt.shape[0], 1)) - x = self.xs - - if self.computed: - if self.metric == 'sqeuclidean': - return np.dot(G / w, x) # weighted mean - else: - print( - "Warning, metric not handled yet, using weighted average") - return np.dot(G / w, x) # weighted mean - return None - else: - print("Warning, model not fitted yet, returning None") - return None - - def predict(self, x, direction=1): - """ Out of sample mapping using the formulation from [6] - - For each sample x to map, it finds the nearest source sample xs and - map the samle x to the position xst+(x-xs) wher xst is the barycentric - interpolation of source sample xs. - - References - ---------- - - .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). - Regularized discrete optimal transport. SIAM Journal on Imaging - Sciences, 7(3), 1853-1882. - - """ - if direction > 0: # >0 then source to target - xf = self.xt - x0 = self.xs - else: - xf = self.xs - x0 = self.xt - - D0 = dist(x, x0) # dist netween new samples an source - idx = np.argmin(D0, 1) # closest one - xf = self.interp(direction) # interp the source samples - # aply the delta to the interpolation - return xf[idx, :] + x - x0[idx, :] - - -@deprecated("The class OTDA_sinkhorn is deprecated in 0.3.1 and will be" - " removed in 0.5 \nUse class SinkhornTransport instead.") -class OTDA_sinkhorn(OTDA): - - """Class for domain adaptation with optimal transport with entropic - regularization - - - """ - - def fit(self, xs, xt, reg=1, ws=None, wt=None, **kwargs): - """Fit regularized domain adaptation between samples is xs and xt - (with optional weights)""" - self.xs = xs - self.xt = xt - - if wt is None: - wt = unif(xt.shape[0]) - if ws is None: - ws = unif(xs.shape[0]) - - self.ws = ws - self.wt = wt - - self.M = dist(xs, xt, metric=self.metric) - self.M = cost_normalization(self.M, self.norm) - self.G = sinkhorn(ws, wt, self.M, reg, **kwargs) - self.computed = True - - -@deprecated("The class OTDA_lpl1 is deprecated in 0.3.1 and will be" - " removed in 0.5 \nUse class SinkhornLpl1Transport instead.") -class OTDA_lpl1(OTDA): - - """Class for domain adaptation with optimal transport with entropic and - group regularization""" - - def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, **kwargs): - """Fit regularized domain adaptation between samples is xs and xt - (with optional weights), See ot.da.sinkhorn_lpl1_mm for fit - parameters""" - self.xs = xs - self.xt = xt - - if wt is None: - wt = unif(xt.shape[0]) - if ws is None: - ws = unif(xs.shape[0]) - - self.ws = ws - self.wt = wt - - self.M = dist(xs, xt, metric=self.metric) - self.M = cost_normalization(self.M, self.norm) - self.G = sinkhorn_lpl1_mm(ws, ys, wt, self.M, reg, eta, **kwargs) - self.computed = True - - -@deprecated("The class OTDA_l1L2 is deprecated in 0.3.1 and will be" - " removed in 0.5 \nUse class SinkhornL1l2Transport instead.") -class OTDA_l1l2(OTDA): - - """Class for domain adaptation with optimal transport with entropic - and group lasso regularization""" - - def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, **kwargs): - """Fit regularized domain adaptation between samples is xs and xt - (with optional weights), See ot.da.sinkhorn_lpl1_gl for fit - parameters""" - self.xs = xs - self.xt = xt - - if wt is None: - wt = unif(xt.shape[0]) - if ws is None: - ws = unif(xs.shape[0]) - - self.ws = ws - self.wt = wt - - 
self.M = dist(xs, xt, metric=self.metric) - self.M = cost_normalization(self.M, self.norm) - self.G = sinkhorn_l1l2_gl(ws, ys, wt, self.M, reg, eta, **kwargs) - self.computed = True - - -@deprecated("The class OTDA_mapping_linear is deprecated in 0.3.1 and will be" - " removed in 0.5 \nUse class MappingTransport instead.") -class OTDA_mapping_linear(OTDA): - - """Class for optimal transport with joint linear mapping estimation as in - [8] - """ - - def __init__(self): - """ Class initialization""" - - self.xs = 0 - self.xt = 0 - self.G = 0 - self.L = 0 - self.bias = False - self.computed = False - self.metric = 'sqeuclidean' - - def fit(self, xs, xt, mu=1, eta=1, bias=False, **kwargs): - """ Fit domain adaptation between samples is xs and xt (with optional - weights)""" - self.xs = xs - self.xt = xt - self.bias = bias - - self.ws = unif(xs.shape[0]) - self.wt = unif(xt.shape[0]) - - self.G, self.L = joint_OT_mapping_linear( - xs, xt, mu=mu, eta=eta, bias=bias, **kwargs) - self.computed = True - - def mapping(self): - return lambda x: self.predict(x) - - def predict(self, x): - """ Out of sample mapping estimated during the call to fit""" - if self.computed: - if self.bias: - x = np.hstack((x, np.ones((x.shape[0], 1)))) - return x.dot(self.L) # aply the delta to the interpolation - else: - print("Warning, model not fitted yet, returning None") - return None - - -@deprecated("The class OTDA_mapping_kernel is deprecated in 0.3.1 and will be" - " removed in 0.5 \nUse class MappingTransport instead.") -class OTDA_mapping_kernel(OTDA_mapping_linear): - - """Class for optimal transport with joint nonlinear mapping - estimation as in [8]""" - - def fit(self, xs, xt, mu=1, eta=1, bias=False, kerneltype='gaussian', - sigma=1, **kwargs): - """ Fit domain adaptation between samples is xs and xt """ - self.xs = xs - self.xt = xt - self.bias = bias - - self.ws = unif(xs.shape[0]) - self.wt = unif(xt.shape[0]) - self.kernel = kerneltype - self.sigma = sigma - self.kwargs = kwargs - - self.G, self.L = joint_OT_mapping_kernel( - xs, xt, mu=mu, eta=eta, bias=bias, **kwargs) - self.computed = True - - def predict(self, x): - """ Out of sample mapping estimated during the call to fit""" - - if self.computed: - K = kernel( - x, self.xs, method=self.kernel, sigma=self.sigma, - **self.kwargs) - if self.bias: - K = np.hstack((K, np.ones((x.shape[0], 1)))) - return K.dot(self.L) - else: - print("Warning, model not fitted yet, returning None") - return None - - def distribution_estimation_uniform(X): """estimates a uniform distribution from an array of samples X @@ -1466,25 +1187,25 @@ class SinkhornTransport(BaseTransport): algorithm if no it has not converged tol : float, optional (default=10e-9) The precision required to stop the optimization algorithm. - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. + verbose : bool, optional (default=False) + Controls the verbosity of the optimization algorithm + log : int, optional (default=False) + Controls the logs of the optimization algorithm metric : string, optional (default="sqeuclidean") The ground metric for the Wasserstein problem norm : string, optional (default=None) If given, normalize the ground metric to avoid numerical errors that can occur with large metric values. 
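[Editor's note] The `SinkhornTransport` estimator whose docstring is reworked in this hunk follows the usual fit/transform pattern. A hedged usage sketch on toy data (data is illustrative; the estimator API is as documented above):

```python
import numpy as np
import ot

# Fit an entropic-OT coupling between toy source and target clouds,
# then transport the source samples with the barycentric mapping.
rng = np.random.RandomState(42)
Xs = rng.randn(20, 2)
Xt = rng.randn(30, 2) + 3.

mapper = ot.da.SinkhornTransport(reg_e=1.0)
mapper.fit(Xs=Xs, Xt=Xt)
Xs_mapped = mapper.transform(Xs=Xs)
```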
- distribution : string, optional (default="uniform") + distribution_estimation : callable, optional (defaults to the uniform) The kind of distribution estimation to employ - verbose : int, optional (default=0) - Controls the verbosity of the optimization algorithm - log : int, optional (default=0) - Controls the logs of the optimization algorithm + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (defaul=np.infty) Controls the semi supervised mode. Transport between labeled source - and target samples of different classes will exhibit an infinite cost + and target samples of different classes will exhibit an cost defined + by this variable Attributes ---------- @@ -1569,22 +1290,19 @@ class EMDTransport(BaseTransport): Parameters ---------- - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. metric : string, optional (default="sqeuclidean") The ground metric for the Wasserstein problem norm : string, optional (default=None) If given, normalize the ground metric to avoid numerical errors that can occur with large metric values. - distribution : string, optional (default="uniform") - The kind of distribution estimation to employ - verbose : int, optional (default=0) - Controls the verbosity of the optimization algorithm - log : int, optional (default=0) + log : int, optional (default=False) Controls the logs of the optimization algorithm + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1669,28 +1387,32 @@ class SinkhornLpl1Transport(BaseTransport): Entropic regularization parameter reg_cl : float, optional (default=0.1) Class regularization parameter - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. - metric : string, optional (default="sqeuclidean") - The ground metric for the Wasserstein problem - norm : string, optional (default=None) - If given, normalize the ground metric to avoid numerical errors that - can occur with large metric values. 
- distribution : string, optional (default="uniform") - The kind of distribution estimation to employ max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization algorithm if no it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop - verbose : int, optional (default=0) + log : bool, optional (default=False) + Controls the logs of the optimization algorithm + tol : float, optional (default=10e-9) + Stop threshold on error (inner sinkhorn solver) (>0) + verbose : bool, optional (default=False) Controls the verbosity of the optimization algorithm + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (defaul=np.infty) Controls the semi supervised mode. Transport between labeled source - and target samples of different classes will exhibit an infinite cost + and target samples of different classes will exhibit a cost defined by + limit_max. Attributes ---------- @@ -1786,27 +1508,28 @@ class SinkhornL1l2Transport(BaseTransport): Entropic regularization parameter reg_cl : float, optional (default=0.1) Class regularization parameter - mapping : string, optional (default="barycentric") - The kind of mapping to apply to transport samples from a domain into - another one. - if "barycentric" only the samples used to estimate the coupling can - be transported from a domain to another one. - metric : string, optional (default="sqeuclidean") - The ground metric for the Wasserstein problem - norm : string, optional (default=None) - If given, normalize the ground metric to avoid numerical errors that - can occur with large metric values. - distribution : string, optional (default="uniform") - The kind of distribution estimation to employ max_iter : int, float, optional (default=10) The minimum number of iteration before stopping the optimization algorithm if no it has not converged max_inner_iter : int, float, optional (default=200) The number of iteration in the inner loop - verbose : int, optional (default=0) + tol : float, optional (default=10e-9) + Stop threshold on error (inner sinkhorn solver) (>0) + verbose : bool, optional (default=False) Controls the verbosity of the optimization algorithm - log : int, optional (default=0) + log : bool, optional (default=False) Controls the logs of the optimization algorithm + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. 
Currently the only possible option is + "ferradans" which uses the method proposed in [6]. limit_max: float, optional (default=10) Controls the semi supervised mode. Transport between labeled source and target samples of different classes will exhibit an infinite cost @@ -1928,10 +1651,12 @@ class MappingTransport(BaseEstimator): Max number of iterations (inner CG solver) inner_tol : float, optional (default=1e-6) Stop threshold on error (inner CG solver) (>0) - verbose : bool, optional (default=False) - Print information along iterations log : bool, optional (default=False) record log if True + verbose : bool, optional (default=False) + Print information along iterations + verbose2 : bool, optional (default=False) + Print information along iterations Attributes ---------- diff --git a/ot/datasets.py b/ot/datasets.py index 362a89b..e76e75d 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -38,9 +38,9 @@ def make_1D_gauss(n, m, s): @deprecated() -def get_1D_gauss(n, m, sigma, random_state=None): +def get_1D_gauss(n, m, sigma): """ Deprecated see make_1D_gauss """ - return make_1D_gauss(n, m, sigma, random_state=None) + return make_1D_gauss(n, m, sigma) def make_2D_samples_gauss(n, m, sigma, random_state=None): diff --git a/ot/externals/funcsigs.py b/ot/externals/funcsigs.py index c73fdc9..106bde7 100644 --- a/ot/externals/funcsigs.py +++ b/ot/externals/funcsigs.py @@ -126,8 +126,8 @@ def signature(obj): new_params[arg_name] = param.replace(default=arg_value, _partial_kwarg=True) - elif (param.kind not in (_VAR_KEYWORD, _VAR_POSITIONAL) and - not param._partial_kwarg): + elif (param.kind not in (_VAR_KEYWORD, _VAR_POSITIONAL) + and not param._partial_kwarg): new_params.pop(arg_name) return sig.replace(parameters=new_params.values()) @@ -333,11 +333,11 @@ class Parameter(object): raise TypeError(msg) def __eq__(self, other): - return (issubclass(other.__class__, Parameter) and - self._name == other._name and - self._kind == other._kind and - self._default == other._default and - self._annotation == other._annotation) + return (issubclass(other.__class__, Parameter) + and self._name == other._name + and self._kind == other._kind + and self._default == other._default + and self._annotation == other._annotation) def __ne__(self, other): return not self.__eq__(other) @@ -372,8 +372,8 @@ class BoundArguments(object): def args(self): args = [] for param_name, param in self._signature.parameters.items(): - if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) or - param._partial_kwarg): + if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) + or param._partial_kwarg): # Keyword arguments mapped by 'functools.partial' # (Parameter._partial_kwarg is True) are mapped # in 'BoundArguments.kwargs', along with VAR_KEYWORD & @@ -402,8 +402,8 @@ class BoundArguments(object): kwargs_started = False for param_name, param in self._signature.parameters.items(): if not kwargs_started: - if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) or - param._partial_kwarg): + if (param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY) + or param._partial_kwarg): kwargs_started = True else: if param_name not in self.arguments: @@ -432,9 +432,9 @@ class BoundArguments(object): raise TypeError(msg) def __eq__(self, other): - return (issubclass(other.__class__, BoundArguments) and - self.signature == other.signature and - self.arguments == other.arguments) + return (issubclass(other.__class__, BoundArguments) + and self.signature == other.signature + and self.arguments == other.arguments) def __ne__(self, other): return not self.__eq__(other) @@ 
-612,9 +612,9 @@ class Signature(object): raise TypeError(msg) def __eq__(self, other): - if (not issubclass(type(other), Signature) or - self.return_annotation != other.return_annotation or - len(self.parameters) != len(other.parameters)): + if (not issubclass(type(other), Signature) + or self.return_annotation != other.return_annotation + or len(self.parameters) != len(other.parameters)): return False other_positions = dict((param, idx) @@ -635,8 +635,8 @@ class Signature(object): except KeyError: return False else: - if (idx != other_idx or - param != other.parameters[param_name]): + if (idx != other_idx + or param != other.parameters[param_name]): return False return True @@ -688,8 +688,8 @@ class Signature(object): raise TypeError(msg) parameters_ex = (param,) break - elif (param.kind == _VAR_KEYWORD or - param.default is not _empty): + elif (param.kind == _VAR_KEYWORD + or param.default is not _empty): # That's fine too - we have a default value for this # parameter. So, lets start parsing `kwargs`, starting # with the current parameter @@ -755,8 +755,8 @@ class Signature(object): # if it has a default value, or it is an '*args'-like # parameter, left alone by the processing of positional # arguments. - if (not partial and param.kind != _VAR_POSITIONAL and - param.default is _empty): + if (not partial and param.kind != _VAR_POSITIONAL + and param.default is _empty): raise TypeError('{arg!r} parameter lacking default value'. format(arg=param_name)) diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py index a2fdd3d..deda6b1 100644 --- a/ot/gpu/__init__.py +++ b/ot/gpu/__init__.py @@ -1,12 +1,37 @@ # -*- coding: utf-8 -*- +""" -from . import bregman -from . import da -from .bregman import sinkhorn +This module provides GPU implementation for several OT solvers and utility +functions. The GPU backend in handled by `cupy +<https://cupy.chainer.org/>`_. + +By default, the functions in this module accept and return numpy arrays +in order to proide drop-in replacement for the other POT function but +the transfer between CPU en GPU comes with a significant overhead. + +In order to get the best erformances, we recommend to give only cupy +arrays to the functions and desactivate the conversion to numpy of the +result of the function with parameter ``to_numpy=False``. + +""" # Author: Remi Flamary <remi.flamary@unice.fr> # Leo Gautheron <https://github.com/aje> # # License: MIT License -__all__ = ["bregman", "da", "sinkhorn"] +from . import bregman +from . import da +from .bregman import sinkhorn +from .da import sinkhorn_lpl1_mm + +from . import utils +from .utils import dist, to_gpu, to_np + + + + + +__all__ = ["utils", "dist", "sinkhorn", + "sinkhorn_lpl1_mm", 'bregman', 'da', 'to_gpu', 'to_np'] + diff --git a/ot/gpu/bregman.py b/ot/gpu/bregman.py index 47939c4..978b307 100644 --- a/ot/gpu/bregman.py +++ b/ot/gpu/bregman.py @@ -8,14 +8,18 @@ Bregman projections for regularized OT with GPU # # License: MIT License -import numpy as np -import cudamat +import cupy as np # np used for matrix computation +import cupy as cp # cp used for cupy specific operations +from . 
import utils -def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False, - log=False, returnAsGPU=False): - r""" - Solve the entropic regularization optimal transport problem on GPU +def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, + verbose=False, log=False, to_numpy=True, **kwargs): + """ + Solve the entropic regularization optimal transport on GPU + + If the input matrix are in numpy format, they will be uploaded to the + GPU first which can incur significant time overhead. The function solves the following optimization problem: @@ -40,9 +44,10 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False, ---------- a : np.ndarray (ns,) samples weights in the source domain - b : np.ndarray (nt,) - samples in the target domain - M_GPU : cudamat.CUDAMatrix (ns,nt) + b : np.ndarray (nt,) or np.ndarray (nt,nbb) + samples in the target domain, compute sinkhorn with multiple targets + and fixed M if b is a matrix (return OT loss + dual variables in log) + M : np.ndarray (ns,nt) loss matrix reg : float Regularization term >0 @@ -54,8 +59,9 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False, Print information along iterations log : bool, optional record log if True - returnAsGPU : bool, optional - return the OT matrix as a cudamat.CUDAMatrix + to_numpy : boolean, optional (default True) + If true convert back the GPU array result to numpy format. + Returns ------- @@ -88,60 +94,78 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False, ot.optim.cg : General regularized OT """ + + a = cp.asarray(a) + b = cp.asarray(b) + M = cp.asarray(M) + + if len(a) == 0: + a = np.ones((M.shape[0],)) / M.shape[0] + if len(b) == 0: + b = np.ones((M.shape[1],)) / M.shape[1] + # init data Nini = len(a) Nfin = len(b) + if len(b.shape) > 1: + nbb = b.shape[1] + else: + nbb = 0 + if log: log = {'err': []} # we assume that no distances are null except those of the diagonal of # distances - u = (np.ones(Nini) / Nini).reshape((Nini, 1)) - u_GPU = cudamat.CUDAMatrix(u) - a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1))) - ones_GPU = cudamat.empty(u_GPU.shape).assign(1) - v = (np.ones(Nfin) / Nfin).reshape((Nfin, 1)) - v_GPU = cudamat.CUDAMatrix(v) - b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1))) - - M_GPU.divide(-reg) + if nbb: + u = np.ones((Nini, nbb)) / Nini + v = np.ones((Nfin, nbb)) / Nfin + else: + u = np.ones(Nini) / Nini + v = np.ones(Nfin) / Nfin - K_GPU = cudamat.exp(M_GPU) + # print(reg) - ones_GPU.divide(a_GPU, target=a_GPU) - Kp_GPU = cudamat.empty(K_GPU.shape) - K_GPU.mult_by_col(a_GPU, target=Kp_GPU) + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) - tmp_GPU = cudamat.empty(K_GPU.shape) + # print(np.min(K)) + tmp2 = np.empty(b.shape, dtype=M.dtype) + Kp = (1 / a).reshape(-1, 1) * K cpt = 0 err = 1 while (err > stopThr and cpt < numItermax): - uprev_GPU = u_GPU.copy() - vprev_GPU = v_GPU.copy() + uprev = u + vprev = v - KtransposeU_GPU = K_GPU.transpose().dot(u_GPU) - b_GPU.divide(KtransposeU_GPU, target=v_GPU) - ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU) + KtransposeU = np.dot(K.T, u) + v = np.divide(b, KtransposeU) + u = 1. 
/ np.dot(Kp, v) - if (np.any(KtransposeU_GPU.asarray() == 0) or - not u_GPU.allfinite() or not v_GPU.allfinite()): + if (np.any(KtransposeU == 0) or + np.any(np.isnan(u)) or np.any(np.isnan(v)) or + np.any(np.isinf(u)) or np.any(np.isinf(v))): # we have reached the machine precision # come back to previous solution and quit loop print('Warning: numerical errors at iteration', cpt) - u_GPU = uprev_GPU.copy() - v_GPU = vprev_GPU.copy() + u = uprev + v = vprev break if cpt % 10 == 0: # we can speed up the process by checking for the error only all # the 10th iterations - K_GPU.mult_by_col(u_GPU, target=tmp_GPU) - tmp_GPU.mult_by_row(v_GPU.transpose(), target=tmp_GPU) - - bcopy_GPU = b_GPU.copy().transpose() - bcopy_GPU.add_sums(tmp_GPU, axis=0, beta=-1) - err = bcopy_GPU.euclid_norm()**2 + if nbb: + err = np.sum((u - uprev)**2) / np.sum((u)**2) + \ + np.sum((v - vprev)**2) / np.sum((v)**2) + else: + # compute right marginal tmp2= (diag(u)Kdiag(v))^T1 + tmp2 = np.sum(u[:, None] * K * v[None, :], 0) + #tmp2=np.einsum('i,ij,j->j', u, K, v) + err = np.linalg.norm(tmp2 - b)**2 # violation of marginal if log: log['err'].append(err) @@ -150,20 +174,32 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False, print( '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) print('{:5d}|{:8e}|'.format(cpt, err)) - cpt += 1 - if log: - log['u'] = u_GPU.asarray() - log['v'] = v_GPU.asarray() - - K_GPU.mult_by_col(u_GPU, target=K_GPU) - K_GPU.mult_by_row(v_GPU.transpose(), target=K_GPU) - - if returnAsGPU: - res = K_GPU - else: - res = K_GPU.asarray() - + cpt = cpt + 1 if log: - return res, log - else: - return res + log['u'] = u + log['v'] = v + + if nbb: # return only loss + #res = np.einsum('ik,ij,jk,ij->k', u, K, v, M) (explodes cupy memory) + res = np.empty(nbb) + for i in range(nbb): + res[i] = np.sum(u[:, None, i] * (K * M) * v[None, :, i]) + if to_numpy: + res = utils.to_np(res) + if log: + return res, log + else: + return res + + else: # return OT matrix + res = u.reshape((-1, 1)) * K * v.reshape((1, -1)) + if to_numpy: + res = utils.to_np(res) + if log: + return res, log + else: + return res + + +# define sinkhorn as sinkhorn_knopp +sinkhorn = sinkhorn_knopp diff --git a/ot/gpu/da.py b/ot/gpu/da.py index 71a485a..4a98038 100644 --- a/ot/gpu/da.py +++ b/ot/gpu/da.py @@ -11,80 +11,30 @@ Domain adaptation with optimal transport with GPU implementation # License: MIT License -import numpy as np -from ..utils import unif -from ..da import OTDA +import cupy as np # np used for matrix computation +import cupy as cp # cp used for cupy specific operations +import numpy as npp +from . import utils + from .bregman import sinkhorn -import cudamat -def pairwiseEuclideanGPU(a, b, returnAsGPU=False, squared=False): +def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, + numInnerItermax=200, stopInnerThr=1e-9, verbose=False, + log=False, to_numpy=True): """ - Compute the pairwise euclidean distance between matrices a and b. 
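[Editor's note] Both the removed cudamat helper and its cupy replacement in `ot/gpu/utils.py` compute pairwise squared distances through the expansion ||a_i - b_j||^2 = ||a_i||^2 - 2<a_i, b_j> + ||b_j||^2, which turns the whole distance matrix into one matrix product plus two broadcast additions. A CPU numpy sketch of the same trick (names are illustrative):

```python
import numpy as np

def pairwise_sqeuclidean(a, b):
    # ||a_i - b_j||^2 = ||a_i||^2 - 2 <a_i, b_j> + ||b_j||^2
    a2 = np.sum(a ** 2, axis=1)   # (n,) row norms
    b2 = np.sum(b ** 2, axis=1)   # (m,) column norms
    c = -2. * np.dot(a, b.T)      # (n, m) cross terms
    c += a2[:, None]
    c += b2[None, :]
    return c

a = np.random.RandomState(0).randn(5, 3)
b = np.random.RandomState(1).randn(4, 3)
brute = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
assert np.allclose(pairwise_sqeuclidean(a, b), brute)
```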
- - - Parameters - ---------- - a : np.ndarray (n, f) - first matrice - b : np.ndarray (m, f) - second matrice - returnAsGPU : boolean, optional (default False) - if True, returns cudamat matrix still on GPU, else return np.ndarray - squared : boolean, optional (default False) - if True, return squared euclidean distance matrice + Solve the entropic regularization optimal transport problem with nonconvex + group lasso regularization on GPU + If the input matrix are in numpy format, they will be uploaded to the + GPU first which can incur significant time overhead. - Returns - ------- - c : (n x m) np.ndarray or cudamat.CUDAMatrix - pairwise euclidean distance distance matrix - """ - # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m). - # First compute in c_GPU the squared euclidean distance. And return its - # square root. At each cell [i,j] of c, we want to have - # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that - # (a-b)^2 = a^2 -2ab +b^2. Thus we want to have in each cell of c: - # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2). - - a_GPU = cudamat.CUDAMatrix(a) - b_GPU = cudamat.CUDAMatrix(b) - - # Multiply a by b transpose to obtain in each cell [i,j] of c the - # value sum{k in range(f)} ( a[i,k]b[j,k] ) - c_GPU = cudamat.dot(a_GPU, b_GPU.transpose()) - # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] ) - c_GPU.mult(-2) - - # Compute the vectors of the sum of squared elements. - a_GPU = cudamat.pow(a_GPU, 2).sum(axis=1) - b_GPU = cudamat.pow(b_GPU, 2).sum(axis=1) - - # Add the vectors in each columns (respectivly rows) of c. - # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] ) - c_GPU.add_col_vec(a_GPU) - # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2) - c_GPU.add_row_vec(b_GPU.transpose()) - - if not squared: - c_GPU = cudamat.sqrt(c_GPU) - - if returnAsGPU: - return c_GPU - else: - return c_GPU.asarray() - - -def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10, - numInnerItermax=200, stopInnerThr=1e-9, - verbose=False, log=False): - """ - Solve the entropic regularization optimal transport problem with nonconvex group lasso regularization The function solves the following optimization problem: .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ \eta \Omega_g(\gamma) + \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma) + + \eta \Omega_g(\gamma) s.t. \gamma 1 = a @@ -94,11 +44,16 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10, where : - M is the (ns,nt) metric cost matrix - - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` - - :math:`\Omega_g` is the group lasso regulaization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1` where :math:`\mathcal{I}_c` are the index of samples from class c in the source domain. + - :math:`\Omega_e` is the entropic regularization term + :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - :math:`\Omega_g` is the group lasso regulaization term + :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1` + where :math:`\mathcal{I}_c` are the index of samples from class c + in the source domain. 
- a and b are source and target weights (sum to 1) - The algorithm used for solving the problem is the generalised conditional gradient as proposed in [5]_ [7]_ + The algorithm used for solving the problem is the generalised conditional + gradient as proposed in [5]_ [7]_ Parameters @@ -109,7 +64,7 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10, labels of samples in the source domain b : np.ndarray (nt,) samples weights in the target domain - M_GPU : cudamat.CUDAMatrix (ns,nt) + M : np.ndarray (ns,nt) loss matrix reg : float Regularization term for entropic regularization >0 @@ -125,6 +80,8 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10, Print information along iterations log : bool, optional record log if True + to_numpy : boolean, optional (default True) + If true convert back the GPU array result to numpy format. Returns @@ -138,8 +95,13 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10, References ---------- - .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 - .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. + .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, + "Optimal Transport for Domain Adaptation," in IEEE + Transactions on Pattern Analysis and Machine Intelligence , + vol.PP, no.99, pp.1-1 + .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). + Generalized conditional gradient: analysis of convergence + and applications. arXiv preprint arXiv:1510.06567. See Also -------- @@ -148,108 +110,35 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10, ot.optim.cg : General regularized OT """ + + a, labels_a, b, M = utils.to_gpu(a, labels_a, b, M) + p = 0.5 epsilon = 1e-3 - Nfin = len(b) indices_labels = [] - classes = np.unique(labels_a) + labels_a2 = cp.asnumpy(labels_a) + classes = npp.unique(labels_a2) for c in classes: - idxc, = np.where(labels_a == c) - indices_labels.append(cudamat.CUDAMatrix(idxc.reshape(1, -1))) + idxc, = utils.to_gpu(npp.where(labels_a2 == c)) + indices_labels.append(idxc) - Mreg_GPU = cudamat.empty(M_GPU.shape) - W_GPU = cudamat.empty(M_GPU.shape).assign(0) + W = np.zeros(M.shape) for cpt in range(numItermax): - Mreg_GPU.assign(M_GPU) - Mreg_GPU.add_mult(W_GPU, eta) - transp_GPU = sinkhorn(a, b, Mreg_GPU, reg, numItermax=numInnerItermax, - stopThr=stopInnerThr, returnAsGPU=True) + Mreg = M + eta * W + transp = sinkhorn(a, b, Mreg, reg, numItermax=numInnerItermax, + stopThr=stopInnerThr, to_numpy=False) # the transport has been computed. 
+        # the transport has been computed; check if classes are really
+        # separated
-        W_GPU.assign(1)
-        W_GPU = W_GPU.transpose()
+        W = np.ones(M.shape)
         for (i, c) in enumerate(classes):
-            (_, nbRow) = indices_labels[i].shape
-            tmpC_GPU = cudamat.empty((Nfin, nbRow)).assign(0)
-            transp_GPU.transpose().select_columns(indices_labels[i], tmpC_GPU)
-            majs_GPU = tmpC_GPU.sum(axis=1).add(epsilon)
-            cudamat.pow(majs_GPU, (p - 1))
-            majs_GPU.mult(p)
-
-            tmpC_GPU.assign(0)
-            tmpC_GPU.add_col_vec(majs_GPU)
-            W_GPU.set_selected_columns(indices_labels[i], tmpC_GPU)
-
-        W_GPU = W_GPU.transpose()
-
-    return transp_GPU.asarray()
+            majs = np.sum(transp[indices_labels[i]], axis=0)
+            majs = p * ((majs + epsilon)**(p - 1))
+            W[indices_labels[i]] = majs

-
-class OTDA_GPU(OTDA):
-
-    def normalizeM(self, norm):
-        if norm == "median":
-            self.M_GPU.divide(float(np.median(self.M_GPU.asarray())))
-        elif norm == "max":
-            self.M_GPU.divide(float(np.max(self.M_GPU.asarray())))
-        elif norm == "log":
-            self.M_GPU.add(1)
-            cudamat.log(self.M_GPU)
-        elif norm == "loglog":
-            self.M_GPU.add(1)
-            cudamat.log(self.M_GPU)
-            self.M_GPU.add(1)
-            cudamat.log(self.M_GPU)
-
-
-class OTDA_sinkhorn(OTDA_GPU):
-
-    def fit(self, xs, xt, reg=1, ws=None, wt=None, norm=None, **kwargs):
-        cudamat.init()
-        xs = np.asarray(xs, dtype=np.float64)
-        xt = np.asarray(xt, dtype=np.float64)
-
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M_GPU = pairwiseEuclideanGPU(xs, xt, returnAsGPU=True,
-                                          squared=True)
-        self.normalizeM(norm)
-        self.G = sinkhorn(ws, wt, self.M_GPU, reg, **kwargs)
-        self.computed = True
-
-
-class OTDA_lpl1(OTDA_GPU):
-
-    def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, norm=None,
-            **kwargs):
-        cudamat.init()
-        xs = np.asarray(xs, dtype=np.float64)
-        xt = np.asarray(xt, dtype=np.float64)
-
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M_GPU = pairwiseEuclideanGPU(xs, xt, returnAsGPU=True,
-                                          squared=True)
-        self.normalizeM(norm)
-        self.G = sinkhorn_lpl1_mm(ws, ys, wt, self.M_GPU, reg, eta, **kwargs)
-        self.computed = True
+    if to_numpy:
+        return utils.to_np(transp)
+    else:
+        return transp
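With the cudamat classes removed, the cupy backend is exercised through plain functions. A minimal usage sketch (hypothetical data; assumes cupy and a CUDA device, not part of the patch):

import numpy as np
import ot                  # POT with the ot.gpu subpackage available
import ot.gpu.da
import ot.gpu.utils

xs = np.random.randn(100, 2)            # source samples (hypothetical)
ys = np.random.randint(0, 2, 100)       # source class labels
xt = np.random.randn(120, 2)            # target samples

# cost matrix computed and kept on the GPU to avoid a host round trip
M = ot.gpu.utils.dist(xs, xt, to_numpy=False)
G = ot.gpu.da.sinkhorn_lpl1_mm(ot.unif(100), ys, ot.unif(120), M,
                               reg=1e-1, eta=1e-1)  # numpy array by default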
diff --git a/ot/gpu/utils.py b/ot/gpu/utils.py
new file mode 100644
index 0000000..41e168a
--- /dev/null
+++ b/ot/gpu/utils.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+"""
+Utility functions for GPU
+"""
+
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#         Nicolas Courty <ncourty@irisa.fr>
+#         Leo Gautheron <https://github.com/aje>
+#
+# License: MIT License
+
+import cupy as np  # np used for matrix computation
+import cupy as cp  # cp used for cupy specific operations
+
+
+def euclidean_distances(a, b, squared=False, to_numpy=True):
+    """
+    Compute the pairwise euclidean distance between matrices a and b.
+
+    If the input matrices are in numpy format, they will be uploaded to the
+    GPU first, which can incur a significant time overhead.
+
+    Parameters
+    ----------
+    a : np.ndarray (n, f)
+        first matrix
+    b : np.ndarray (m, f)
+        second matrix
+    to_numpy : boolean, optional (default True)
+        If True, convert back the GPU array result to numpy format.
+    squared : boolean, optional (default False)
+        if True, return squared euclidean distance matrix
+
+    Returns
+    -------
+    c : (n x m) np.ndarray or cupy.ndarray
+        pairwise euclidean distance matrix
+    """
+
+    a, b = to_gpu(a, b)
+
+    a2 = np.sum(np.square(a), 1)
+    b2 = np.sum(np.square(b), 1)
+
+    c = -2 * np.dot(a, b.T)
+    c += a2[:, None]
+    c += b2[None, :]
+
+    if not squared:
+        np.sqrt(c, out=c)
+    if to_numpy:
+        return to_np(c)
+    else:
+        return c
+
+
+def dist(x1, x2=None, metric='sqeuclidean', to_numpy=True):
+    """Compute distance between samples in x1 and x2 on gpu
+
+    Parameters
+    ----------
+
+    x1 : np.array (n1,d)
+        matrix with n1 samples of size d
+    x2 : np.array (n2,d), optional
+        matrix with n2 samples of size d (if None then x2=x1)
+    metric : str
+        Metric from 'sqeuclidean', 'euclidean'
+
+
+    Returns
+    -------
+
+    M : np.array (n1,n2)
+        distance matrix computed with given metric
+
+    """
+    if x2 is None:
+        x2 = x1
+    if metric == "sqeuclidean":
+        return euclidean_distances(x1, x2, squared=True, to_numpy=to_numpy)
+    elif metric == "euclidean":
+        return euclidean_distances(x1, x2, squared=False, to_numpy=to_numpy)
+    else:
+        raise NotImplementedError
+
+
+def to_gpu(*args):
+    """ Upload numpy arrays to GPU and return them"""
+    if len(args) > 1:
+        return (cp.asarray(x) for x in args)
+    else:
+        return cp.asarray(args[0])
+
+
+def to_np(*args):
+    """ Convert GPU arrays to numpy and return them"""
+    if len(args) > 1:
+        return (cp.asnumpy(x) for x in args)
+    else:
+        return cp.asnumpy(args[0])
diff --git a/ot/gromov.py b/ot/gromov.py
index fe4fc15..33134a2 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -40,7 +40,7 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
         * h1(a)=a
* h2(b)=b
- The kl-loss function L(a,b)=(1/2)*|a-b|^2 is read as :
+ The kl-loss function L(a,b)=a*log(a/b)-a+b is read as :
L(a,b) = f1(a)+f2(b)-h1(a)*h2(b) with :
* f1(a)=a*log(a)-a
* f2(b)=b
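The corrected KL formula is easy to check against the decomposition it documents. A quick sketch, not part of the patch; the factors h1(a)=a and h2(b)=log(b) come from the surrounding POT source, since the hunk above cuts off before them:

import numpy as np

a, b = 0.7, 0.3
kl = a * np.log(a / b) - a + b       # L(a,b) as written in the + line
f1 = a * np.log(a) - a               # f1(a) = a*log(a) - a
f2 = b                               # f2(b) = b
h1, h2 = a, np.log(b)                # h1(a) = a, h2(b) = log(b)
assert np.isclose(kl, f1 + f2 - h1 * h2)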
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 4c0d170..02cbd8c 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -17,8 +17,9 @@ from .import cvx
 from .emd_wrap import emd_c, check_result
 from ..utils import parmap
 from .cvx import barycenter
+from ..utils import dist

-__all__=['emd', 'emd2', 'barycenter', 'cvx']
+__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx']


 def emd(a, b, M, numItermax=100000, log=False):
@@ -216,3 +217,95 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
         res = parmap(f, [b[:, i] for i in range(nb)], processes)
         return res
+
+
+
+def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None):
+    """
+    Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance)
+
+    The function solves the Wasserstein barycenter problem when the barycenter measure is constrained to be supported on k atoms.
+    This problem is considered in [1] (Algorithm 2). There are two differences with Algorithm 2 of [1]:
+    - we do not optimize over the weights
+    - we do not perform a line search for the location updates, i.e. we use theta = 1 in [1] (Algorithm 2). This can be seen as a discrete implementation of the fixed-point algorithm of [2] proposed in the continuous setting.
+
+    Parameters
+    ----------
+    measures_locations : list of (k_i,d) np.ndarray
+        The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list)
+    measures_weights : list of (k_i,) np.ndarray
+        Numpy arrays where each numpy array has k_i non-negative values summing to one representing the weights of each discrete input measure
+
+    X_init : (k,d) np.ndarray
+        Initialization of the support locations (on k atoms) of the barycenter
+    b : (k,) np.ndarray
+        Weights of the barycenter (non-negative, sum to 1)
+    weights : (N,) np.ndarray
+        Weights of each of the N input measures in the barycenter (non-negative, sum to 1)
+
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshold on error (>0)
+    verbose : bool, optional
+        Print information along iterations
+    log : bool, optional
+        record log if True
+
+    Returns
+    -------
+    X : (k,d) np.ndarray
+        Support locations (on k atoms) of the barycenter
+
+    References
+    ----------
+
+    .. [1] Cuturi, Marco, and Arnaud Doucet. "Fast computation of Wasserstein barycenters." International Conference on Machine Learning. 2014.
+
+    .. [2] Álvarez-Esteban, Pedro C., et al. "A fixed-point approach to barycenters in Wasserstein space." Journal of Mathematical Analysis and Applications 441.2 (2016): 744-762.
+
+    """
+
+    iter_count = 0
+
+    N = len(measures_locations)
+    k = X_init.shape[0]
+    d = X_init.shape[1]
+    if b is None:
+        b = np.ones((k,)) / k
+    if weights is None:
+        weights = np.ones((N,)) / N
+
+    X = X_init
+
+    log_dict = {}
+    displacement_square_norms = []
+
+    displacement_square_norm = stopThr + 1.
+
+    while displacement_square_norm > stopThr and iter_count < numItermax:
+
+        T_sum = np.zeros((k, d))
+
+        for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights, weights.tolist()):
+
+            M_i = dist(X, measure_locations_i)
+            T_i = emd(b, measure_weights_i, M_i)
+            T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i)
+
+        displacement_square_norm = np.sum(np.square(T_sum - X))
+        if log:
+            displacement_square_norms.append(displacement_square_norm)
+
+        X = T_sum
+
+        if verbose:
+            print('iteration %d, displacement_square_norm=%f' % (iter_count, displacement_square_norm))
+
+        iter_count += 1
+
+    if log:
+        log_dict['displacement_square_norms'] = displacement_square_norms
+        return X, log_dict
+    else:
+        return X
\ No newline at end of file
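A usage sketch for the new solver (hypothetical data, not part of the patch): the barycenter of two 2-D point clouds supported on k = 10 free atoms.

import numpy as np
import ot

rng = np.random.RandomState(42)
measures_locations = [rng.randn(30, 2), rng.randn(40, 2) + 2.]
measures_weights = [ot.unif(30), ot.unif(40)]
X_init = rng.randn(10, 2)   # k = 10 atoms; only their locations are optimized

X = ot.lp.free_support_barycenter(measures_locations, measures_weights, X_init)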
diff --git a/ot/lp/cvx.py b/ot/lp/cvx.py
index c8c75bc..8e763be 100644
--- a/ot/lp/cvx.py
+++ b/ot/lp/cvx.py
@@ -11,6 +11,7 @@ import numpy as np
 import scipy as sp
 import scipy.sparse as sps

+
 try:
     import cvxopt
     from cvxopt import solvers, matrix, spmatrix
@@ -26,7 +27,7 @@ def scipy_sparse_to_spmatrix(A):

 def barycenter(A, M, weights=None, verbose=False, log=False, solver='interior-point'):
-    """Compute the entropic regularized wasserstein barycenter of distributions A
+    """Compute the Wasserstein barycenter of distributions A

     The function solves the following optimization problem [16]:
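The LP barycenter itself is unchanged; a usage sketch on two 1-D histograms (hypothetical data; depending on the solver argument, cvxopt may be required, per the try/import above):

import numpy as np
import ot

n = 50
x = np.arange(n, dtype=np.float64).reshape((n, 1))
M = ot.dist(x, metric='sqeuclidean')     # cost between histogram bins
M /= M.max()

a1 = np.exp(-(x[:, 0] - 15) ** 2 / 50.)  # two Gaussian-like histograms
a1 /= a1.sum()
a2 = np.exp(-(x[:, 0] - 35) ** 2 / 50.)
a2 /= a2.sum()
A = np.vstack((a1, a2)).T                # one distribution per column of A

bary = ot.lp.barycenter(A, M, weights=np.array([0.5, 0.5]))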
diff --git a/ot/stochastic.py b/ot/stochastic.py
new file mode 100644
index 0000000..85c4230
--- /dev/null
+++ b/ot/stochastic.py
@@ -0,0 +1,736 @@
+# Author: Kilian Fatras <kilian.fatras@gmail.com>
+#
+# License: MIT License
+
+import numpy as np
+
+
+##############################################################################
+# Optimization toolbox for SEMI - DUAL problems
+##############################################################################
+
+
+def coordinate_grad_semi_dual(b, M, reg, beta, i):
+    '''
+    Compute the coordinate gradient update for regularized discrete distributions for (i, :)
+
+    The function computes the gradient of the semi dual problem:
+
+    .. math::
+        \max_v \sum_i (\sum_j v_j * b_j - reg * log(\sum_j exp((v_j - M_{i,j})/reg) * b_j)) * a_i
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - v is a dual variable in R^J
+    - reg is the regularization term
+    - a and b are source and target weights (sum to 1)
+
+    The algorithms used for solving the problem are the ASGD and SAG
+    algorithms as proposed in [18]_ [alg.1 & alg.2]
+
+
+    Parameters
+    ----------
+
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        Regularization term > 0
+    beta : np.ndarray(nt,)
+        dual variable
+    i : int
+        index of the picked row
+
+    Returns
+    -------
+
+    coordinate gradient : np.ndarray(nt,)
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
+    ...                                               method, numItermax)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+        Stochastic Optimization for Large-scale Optimal Transport,
+        Advances in Neural Information Processing Systems (2016),
+        arXiv preprint arxiv:1605.08527.
+
+    '''
+
+    r = M[i, :] - beta
+    exp_beta = np.exp(-r / reg) * b
+    khi = exp_beta / (np.sum(exp_beta))
+    return b - khi
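For very small reg the exponential above can overflow; the min-shift used by c_transform_entropic further down removes the risk without changing the result, since the shift cancels in the normalized ratio. A sketch with a hypothetical helper name, not part of the patch:

import numpy as np


def coordinate_grad_semi_dual_stable(b, M, reg, beta, i):
    # same gradient as above, with a shift before exponentiating
    r = M[i, :] - beta
    r_min = np.min(r)                          # largest exponent becomes 0
    exp_beta = np.exp(-(r - r_min) / reg) * b
    khi = exp_beta / np.sum(exp_beta)          # the shift cancels here
    return b - khi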
+def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
+    '''
+    Compute the SAG algorithm to solve the regularized discrete measures
+    optimal transport max problem
+
+    The function solves the following optimization problem:
+
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+
+        s.t. \gamma 1 = a
+
+             \gamma^T 1 = b
+
+             \gamma \geq 0
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+    The algorithm used for solving the problem is the SAG algorithm
+    as proposed in [18]_ [alg.1]
+
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,)
+        source measure
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        Regularization term > 0
+    numItermax : int
+        number of iterations
+    lr : float
+        learning rate
+
+    Returns
+    -------
+
+    v : np.ndarray(nt,)
+        dual variable
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "SAG"
+    >>> sag_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
+    ...                                              method, numItermax)
+    >>> print(sag_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+        Stochastic Optimization for Large-scale Optimal Transport,
+        Advances in Neural Information Processing Systems (2016),
+        arXiv preprint arxiv:1605.08527.
+    '''
+
+    if lr is None:
+        lr = 1. / max(a / reg)
+    n_source = np.shape(M)[0]
+    n_target = np.shape(M)[1]
+    cur_beta = np.zeros(n_target)
+    stored_gradient = np.zeros((n_source, n_target))
+    sum_stored_gradient = np.zeros(n_target)
+    for _ in range(numItermax):
+        i = np.random.randint(n_source)
+        cur_coord_grad = a[i] * coordinate_grad_semi_dual(b, M, reg,
+                                                          cur_beta, i)
+        sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
+        stored_gradient[i] = cur_coord_grad
+        cur_beta += lr * (1. / n_source) * sum_stored_gradient
+    return cur_beta
+
+
+def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
+    '''
+    Compute the ASGD algorithm to solve the regularized semi-continuous
+    measures optimal transport max problem
+
+    The function solves the following optimization problem:
+
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+
+        s.t. \gamma 1 = a
+
+             \gamma^T 1 = b
+
+             \gamma \geq 0
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+    The algorithm used for solving the problem is the ASGD algorithm
+    as proposed in [18]_ [alg.2]
+
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,)
+        source measure (used to set the default learning rate)
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        Regularization term > 0
+    numItermax : int
+        number of iterations
+    lr : float
+        learning rate
+
+
+    Returns
+    -------
+
+    ave_v : np.ndarray(nt,)
+        dual variable
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
+    ...                                               method, numItermax)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+        Stochastic Optimization for Large-scale Optimal Transport,
+        Advances in Neural Information Processing Systems (2016),
+        arXiv preprint arxiv:1605.08527.
+    '''
+
+    if lr is None:
+        lr = 1. / max(a / reg)
+    n_source = np.shape(M)[0]
+    n_target = np.shape(M)[1]
+    cur_beta = np.zeros(n_target)
+    ave_beta = np.zeros(n_target)
+    for cur_iter in range(numItermax):
+        k = cur_iter + 1
+        i = np.random.randint(n_source)
+        cur_coord_grad = coordinate_grad_semi_dual(b, M, reg, cur_beta, i)
+        cur_beta += (lr / np.sqrt(k)) * cur_coord_grad
+        ave_beta = (1. / k) * cur_beta + (1 - 1. / k) * ave_beta
+    return ave_beta
+def c_transform_entropic(b, M, reg, beta):
+    '''
+    The goal is to recover u from the c-transform.
+
+    The function computes the c-transform of a dual variable from the other
+    dual variable:
+
+    .. math::
+        u_i = v^{c,reg}_i = -reg \log \sum_j exp((v_j - M_{i,j})/reg) b_j
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - u, v are dual variables in R^I x R^J
+    - reg is the regularization term
+
+    It is used to recover an optimal u from an optimal v solving the semi
+    dual problem, see Proposition 2.1 of [18]_
+
+
+    Parameters
+    ----------
+
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        regularization term > 0
+    beta : np.ndarray(nt,)
+        dual variable
+
+    Returns
+    -------
+
+    u : np.ndarray(ns,)
+        dual variable
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
+    ...                                               method, numItermax)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+        Stochastic Optimization for Large-scale Optimal Transport,
+        Advances in Neural Information Processing Systems (2016),
+        arXiv preprint arxiv:1605.08527.
+
+    '''
+
+    n_source = np.shape(M)[0]
+    alpha = np.zeros(n_source)
+    for i in range(n_source):
+        r = M[i, :] - beta
+        min_r = np.min(r)
+        exp_beta = np.exp(-(r - min_r) / reg) * b
+        alpha[i] = min_r - reg * np.log(np.sum(exp_beta))
+    return alpha
+
+
+def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None,
+                             log=False):
+    '''
+    Compute the transportation matrix to solve the regularized discrete
+    measures optimal transport max problem
+
+    The function solves the following optimization problem:
+
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+
+        s.t. \gamma 1 = a
+
+             \gamma^T 1 = b
+
+             \gamma \geq 0
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+    The algorithms used for solving the problem are the SAG and ASGD
+    algorithms as proposed in [18]_
+
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,)
+        source measure
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        Regularization term > 0
+    method : str
+        used method (SAG or ASGD)
+    numItermax : int
+        number of iterations
+    lr : float
+        learning rate
+    log : bool, optional
+        record log if True
+
+    Returns
+    -------
+
+    pi : np.ndarray(ns, nt)
+        transportation matrix
+    log : dict
+        log dictionary returned only if log==True in parameters
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
+    ...                                               method, numItermax)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+        Stochastic Optimization for Large-scale Optimal Transport,
+        Advances in Neural Information Processing Systems (2016),
+        arXiv preprint arxiv:1605.08527.
+    '''
+
+    if method.lower() == "sag":
+        opt_beta = sag_entropic_transport(a, b, M, reg, numItermax, lr)
+    elif method.lower() == "asgd":
+        opt_beta = averaged_sgd_entropic_transport(a, b, M, reg, numItermax, lr)
+    else:
+        print("Please, select your method between SAG and ASGD")
+        return None
+
+    opt_alpha = c_transform_entropic(b, M, reg, opt_beta)
+    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :]) / reg) *
+          a[:, None] * b[None, :])
+
+    if log:
+        log = {}
+        log['alpha'] = opt_alpha
+        log['beta'] = opt_beta
+        return pi, log
+    else:
+        return pi
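A quick sanity check for the semi-dual solvers (a sketch, assuming ot and numpy are installed): the coupling returned by solve_semi_dual_entropic should have marginals close to a and b.

import numpy as np
import ot
from ot import stochastic

n_source, n_target, reg = 7, 4, 1.
a, b = ot.unif(n_source), ot.unif(n_target)
rng = np.random.RandomState(0)
M = ot.dist(rng.randn(n_source, 2), rng.randn(n_target, 2))

pi = stochastic.solve_semi_dual_entropic(a, b, M, reg, "SAG",
                                         numItermax=100000)
print(np.allclose(pi.sum(1), a, atol=1e-3))  # row marginals ~ a
print(np.allclose(pi.sum(0), b, atol=1e-3))  # column marginals ~ b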
+##############################################################################
+# Optimization toolbox for DUAL problems
+##############################################################################
+
+
+def batch_grad_dual(a, b, M, reg, alpha, beta, batch_size, batch_alpha,
+                    batch_beta):
+    '''
+    Computes the partial gradient of the dual optimal transport problem.
+
+    For each (i,j) in a batch of coordinates, the partial gradients are :
+
+    .. math::
+        \partial_{u_i} F = a_i * l_v / n_t - \sum_{j \in B_v} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j
+
+        \partial_{v_j} F = b_j * l_u / n_s - \sum_{i \in B_u} exp((u_i + v_j - M_{i,j})/reg) * a_i * b_j
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - u, v are dual variables in R^{ns} x R^{nt}
+    - reg is the regularization term
+    - :math:`B_u` and :math:`B_v` are lists of indices
+    - :math:`b_s` is the size of the batches :math:`B_u` and :math:`B_v`
+    - :math:`l_u` and :math:`l_v` are the lengths of :math:`B_u` and :math:`B_v`
+    - :math:`n_s` and :math:`n_t` are the numbers of source and target samples
+    - a and b are source and target weights (sum to 1)
+
+
+    The algorithm used for solving the dual problem is the SGD algorithm
+    as proposed in [19]_ [alg.1]
+
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,)
+        source measure
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        Regularization term > 0
+    alpha : np.ndarray(ns,)
+        dual variable
+    beta : np.ndarray(nt,)
+        dual variable
+    batch_size : int
+        size of the batch
+    batch_alpha : np.ndarray(bs,)
+        batch of indices of alpha
+    batch_beta : np.ndarray(bs,)
+        batch of indices of beta
+
+    Returns
+    -------
+
+    grad_alpha : np.ndarray(ns,)
+        partial grad F with respect to u
+    grad_beta : np.ndarray(nt,)
+        partial grad F with respect to v
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 20000
+    >>> lr = 0.1
+    >>> batch_size = 3
+    >>> log = True
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg,
+    ...                                                   batch_size,
+    ...                                                   numItermax, lr, log)
+    >>> print(log['alpha'], log['beta'])
+    >>> print(sgd_dual_pi)
+
+    References
+    ----------
+
+    [Seguy et al., 2018] :
+        Large-Scale Optimal Transport and Mapping Estimation,
+        International Conference on Learning Representations (2018),
+        arXiv preprint arxiv:1711.02283.
+    '''
+
+    G = - (np.exp((alpha[batch_alpha, None] + beta[None, batch_beta] -
+                   M[batch_alpha, :][:, batch_beta]) / reg) *
+           a[batch_alpha, None] * b[None, batch_beta])
+    grad_beta = np.zeros(np.shape(M)[1])
+    grad_alpha = np.zeros(np.shape(M)[0])
+    grad_beta[batch_beta] = (b[batch_beta] * len(batch_alpha) / np.shape(M)[0]
+                             + G.sum(0))
+    grad_alpha[batch_alpha] = (a[batch_alpha] * len(batch_beta)
+                               / np.shape(M)[1] + G.sum(1))
+
+    return grad_alpha, grad_beta
+def sgd_entropic_regularization(a, b, M, reg, batch_size, numItermax, lr):
+    '''
+    Compute the SGD algorithm to solve the regularized discrete measures
+    optimal transport dual problem
+
+    The function solves the following optimization problem:
+
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+
+        s.t. \gamma 1 = a
+
+             \gamma^T 1 = b
+
+             \gamma \geq 0
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term with :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,)
+        source measure
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        Regularization term > 0
+    batch_size : int
+        size of the batch
+    numItermax : int
+        number of iterations
+    lr : float
+        learning rate
+
+    Returns
+    -------
+
+    alpha : np.ndarray(ns,)
+        dual variable
+    beta : np.ndarray(nt,)
+        dual variable
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 20000
+    >>> lr = 0.1
+    >>> batch_size = 3
+    >>> log = True
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg,
+    ...                                                   batch_size,
+    ...                                                   numItermax, lr, log)
+    >>> print(log['alpha'], log['beta'])
+    >>> print(sgd_dual_pi)
+
+    References
+    ----------
+
+    [Seguy et al., 2018] :
+        Large-Scale Optimal Transport and Mapping Estimation,
+        International Conference on Learning Representations (2018),
+        arXiv preprint arxiv:1711.02283.
+    '''
+
+    n_source = np.shape(M)[0]
+    n_target = np.shape(M)[1]
+    cur_alpha = np.zeros(n_source)
+    cur_beta = np.zeros(n_target)
+    for cur_iter in range(numItermax):
+        k = np.sqrt(cur_iter + 1)
+        batch_alpha = np.random.choice(n_source, batch_size, replace=False)
+        batch_beta = np.random.choice(n_target, batch_size, replace=False)
+        update_alpha, update_beta = batch_grad_dual(a, b, M, reg, cur_alpha,
+                                                    cur_beta, batch_size,
+                                                    batch_alpha, batch_beta)
+        cur_alpha[batch_alpha] += (lr / k) * update_alpha[batch_alpha]
+        cur_beta[batch_beta] += (lr / k) * update_beta[batch_beta]
+
+    return cur_alpha, cur_beta
+def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
+                        log=False):
+    '''
+    Compute the transportation matrix to solve the regularized discrete
+    measures optimal transport dual problem
+
+    The function solves the following optimization problem:
+
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+
+        s.t. \gamma 1 = a
+
+             \gamma^T 1 = b
+
+             \gamma \geq 0
+
+    Where :
+
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,)
+        source measure
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        Regularization term > 0
+    batch_size : int
+        size of the batch
+    numItermax : int
+        number of iterations
+    lr : float
+        learning rate
+    log : bool, optional
+        record log if True
+
+    Returns
+    -------
+
+    pi : np.ndarray(ns, nt)
+        transportation matrix
+    log : dict
+        log dictionary returned only if log==True in parameters
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 20000
+    >>> lr = 0.1
+    >>> batch_size = 3
+    >>> log = True
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg,
+    ...                                                   batch_size,
+    ...                                                   numItermax, lr, log)
+    >>> print(log['alpha'], log['beta'])
+    >>> print(sgd_dual_pi)
+
+    References
+    ----------
+
+    [Seguy et al., 2018] :
+        Large-Scale Optimal Transport and Mapping Estimation,
+        International Conference on Learning Representations (2018),
+        arXiv preprint arxiv:1711.02283.
+    '''
+
+    opt_alpha, opt_beta = sgd_entropic_regularization(a, b, M, reg, batch_size,
+                                                      numItermax, lr)
+    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :]) / reg) *
+          a[:, None] * b[None, :])
+    if log:
+        log = {}
+        log['alpha'] = opt_alpha
+        log['beta'] = opt_beta
+        return pi, log
+    else:
+        return pi
diff --git a/ot/utils.py b/ot/utils.py
index 7dac283..bb21b38 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -77,6 +77,34 @@ def clean_zeros(a, b, M):
     return a2, b2, M2

+def euclidean_distances(X, Y, squared=False):
+    """
+    Considering the rows of X (and Y=X) as vectors, compute the
+    distance matrix between each pair of vectors.
+
+    Parameters
+    ----------
+    X : {array-like}, shape (n_samples_1, n_features)
+    Y : {array-like}, shape (n_samples_2, n_features)
+    squared : boolean, optional
+        Return squared Euclidean distances.
+
+    Returns
+    -------
+    distances : {array}, shape (n_samples_1, n_samples_2)
+    """
+    XX = np.einsum('ij,ij->i', X, X)[:, np.newaxis]
+    YY = np.einsum('ij,ij->i', Y, Y)[np.newaxis, :]
+    distances = np.dot(X, Y.T)
+    distances *= -2
+    distances += XX
+    distances += YY
+    np.maximum(distances, 0, out=distances)
+    if X is Y:
+        # Ensure that distances between vectors and themselves are set to 0.0.
+        # This may not be the case due to floating point rounding errors.
+        distances.flat[::distances.shape[0] + 1] = 0.0
+    return distances if squared else np.sqrt(distances, out=distances)
+
+
 def dist(x1, x2=None, metric='sqeuclidean'):
     """Compute distance between samples in x1 and x2 using function scipy.spatial.distance.cdist
@@ -104,7 +132,8 @@ def dist(x1, x2=None, metric='sqeuclidean'):
     """
     if x2 is None:
         x2 = x1
-
+    if metric == "sqeuclidean":
+        return euclidean_distances(x1, x2, squared=True)
     return cdist(x1, x2, metric=metric)
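The new fast path only intercepts metric='sqeuclidean'; other metrics still go through cdist, which gives an easy consistency check. A quick sketch:

import numpy as np
import ot

x1 = np.random.randn(100, 5)
x2 = np.random.randn(80, 5)

D_fast = ot.dist(x1, x2)                           # einsum-based fast path
D_ref = ot.dist(x1, x2, metric='euclidean') ** 2   # cdist route, squared
print(np.allclose(D_fast, D_ref))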