12 files changed, 1275 insertions, 518 deletions
diff --git a/ot/__init__.py b/ot/__init__.py
index 1dde390..b74b924 100644
--- a/ot/__init__.py
+++ b/ot/__init__.py
@@ -29,7 +29,7 @@ from .da import sinkhorn_lpl1_mm
 # utils functions
 from .utils import dist, unif, tic, toc, toq
 
-__version__ = "0.4.0"
+__version__ = "0.5.1"
 
 __all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets',
            'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov',
diff --git a/ot/bregman.py b/ot/bregman.py
index b017c1a..d1057ff 100644
--- a/ot/bregman.py
+++ b/ot/bregman.py
@@ -47,7 +47,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000,
     reg : float
         Regularization term >0
     method : str
-        method used for the solver either 'sinkhorn',  'sinkhorn_stabilized' or
+        method used for the solver either 'sinkhorn', 'greenkhorn', 'sinkhorn_stabilized' or
         'sinkhorn_epsilon_scaling', see those function for specific parameters
     numItermax : int, optional
         Max number of iterations
@@ -103,6 +103,10 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000,
         def sink():
             return sinkhorn_knopp(a, b, M, reg, numItermax=numItermax,
                                   stopThr=stopThr, verbose=verbose, log=log, **kwargs)
+    elif method.lower() == 'greenkhorn':
+        def sink():
+            return greenkhorn(a, b, M, reg, numItermax=numItermax,
+                              stopThr=stopThr, verbose=verbose, log=log)
     elif method.lower() == 'sinkhorn_stabilized':
         def sink():
             return sinkhorn_stabilized(a, b, M, reg, numItermax=numItermax,
@@ -197,6 +201,8 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000,
 
     .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816.
 
+       [21] Altschuler J., Weed J., Rigollet P. : Near-linear time approximation algorithms for optimal transport via Sinkhorn iteration, Advances in Neural Information Processing Systems (NIPS) 31, 2017
+
 
 
     See Also
@@ -204,6 +210,7 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000,
     ot.lp.emd : Unregularized OT
     ot.optim.cg : General regularized OT
     ot.bregman.sinkhorn_knopp : Classic Sinkhorn [2]
+    ot.bregman.greenkhorn : Greenkhorn [21]
     ot.bregman.sinkhorn_stabilized: Stabilized sinkhorn [9][10]
     ot.bregman.sinkhorn_epsilon_scaling: Sinkhorn with epslilon scaling [9][10]
 
@@ -344,8 +351,13 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
 
     # print(reg)
 
-    K = np.exp(-M / reg)
+    # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute
+    K = np.empty(M.shape, dtype=M.dtype)
+    np.divide(M, -reg, out=K)
+    np.exp(K, out=K)
+
     # print(np.min(K))
+    tmp2 = np.empty(b.shape, dtype=M.dtype)
 
     Kp = (1 / a).reshape(-1, 1) * K
     cpt = 0
@@ -353,6 +365,7 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
     while (err > stopThr and cpt < numItermax):
         uprev = u
         vprev = v
+
         KtransposeU = np.dot(K.T, u)
         v = np.divide(b, KtransposeU)
         u = 1. / np.dot(Kp, v)
@@ -373,8 +386,9 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
                 err = np.sum((u - uprev)**2) / np.sum((u)**2) + \
                     np.sum((v - vprev)**2) / np.sum((v)**2)
             else:
-                transp = u.reshape(-1, 1) * (K * v)
-                err = np.linalg.norm((np.sum(transp, axis=0) - b))**2
+                # compute right marginal tmp2= (diag(u)Kdiag(v))^T1
+                np.einsum('i,ij,j->j', u, K, v, out=tmp2)
+                err = np.linalg.norm(tmp2 - b)**2  # violation of marginal
             if log:
                 log['err'].append(err)
 
@@ -389,10 +403,7 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
         log['v'] = v
 
     if nbb:  # return only loss
-        res = np.zeros((nbb))
-        for i in range(nbb):
-            res[i] = np.sum(
-                u[:, i].reshape((-1, 1)) * K * v[:, i].reshape((1, -1)) * M)
+        res = np.einsum('ik,ij,jk,ij->k', u, K, v, M)
         if log:
             return res, log
         else:
@@ -406,6 +417,148 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
             return u.reshape((-1, 1)) * K * v.reshape((1, -1))
 
 
+def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log=False):
+    """
+    Solve the entropic regularization optimal transport problem and return the OT matrix
+
+    The algorithm used is based on the paper
+
+    Near-linear time approximation algorithms for optimal transport via Sinkhorn iteration
+        by Jason Altschuler, Jonathan Weed, Philippe Rigollet
+        appeared at NIPS 2017
+
+    which is a stochastic version of the Sinkhorn-Knopp algorithm [2].
+
+    The function solves the following optimization problem:
+
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+
+        s.t. \gamma 1 = a
+
+             \gamma^T 1= b
+
+             \gamma\geq 0
+    where :
+
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+
+
+    Parameters
+    ----------
+    a : np.ndarray (ns,)
+        samples weights in the source domain
+    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+        samples in the target domain, compute sinkhorn with multiple targets
+        and fixed M if b is a matrix (return OT loss + dual variables in log)
+    M : np.ndarray (ns,nt)
+        loss matrix
+    reg : float
+        Regularization term >0
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshol on error (>0)
+    log : bool, optional
+        record log if True
+
+
+    Returns
+    -------
+    gamma : (ns x nt) ndarray
+        Optimal transportation matrix for the given parameters
+    log : dict
+        log dictionary return only if log==True in parameters
+
+    Examples
+    --------
+
+    >>> import ot
+    >>> a=[.5,.5]
+    >>> b=[.5,.5]
+    >>> M=[[0.,1.],[1.,0.]]
+    >>> ot.bregman.greenkhorn(a,b,M,1)
+    array([[ 0.36552929,  0.13447071],
+           [ 0.13447071,  0.36552929]])
+
+
+    References
+    ----------
+
+    .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013
+       [22] J. Altschuler, J.Weed, P. Rigollet : Near-linear time approximation algorithms for optimal transport via Sinkhorn iteration, Advances in Neural Information Processing Systems (NIPS) 31, 2017
+
+
+    See Also
+    --------
+    ot.lp.emd : Unregularized OT
+    ot.optim.cg : General regularized OT
+
+    """
+
+    n = a.shape[0]
+    m = b.shape[0]
+
+    # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute
+    K = np.empty_like(M)
+    np.divide(M, -reg, out=K)
+    np.exp(K, out=K)
+
+    u = np.full(n, 1. / n)
+    v = np.full(m, 1. / m)
+    G = u[:, np.newaxis] * K * v[np.newaxis, :]
+
+    viol = G.sum(1) - a
+    viol_2 = G.sum(0) - b
+    stopThr_val = 1
+    if log:
+        log['u'] = u
+        log['v'] = v
+
+    for i in range(numItermax):
+        i_1 = np.argmax(np.abs(viol))
+        i_2 = np.argmax(np.abs(viol_2))
+        m_viol_1 = np.abs(viol[i_1])
+        m_viol_2 = np.abs(viol_2[i_2])
+        stopThr_val = np.maximum(m_viol_1, m_viol_2)
+
+        if m_viol_1 > m_viol_2:
+            old_u = u[i_1]
+            u[i_1] = a[i_1] / (K[i_1, :].dot(v))
+            G[i_1, :] = u[i_1] * K[i_1, :] * v
+
+            viol[i_1] = u[i_1] * K[i_1, :].dot(v) - a[i_1]
+            viol_2 += (K[i_1, :].T * (u[i_1] - old_u) * v)
+
+        else:
+            old_v = v[i_2]
+            v[i_2] = b[i_2] / (K[:, i_2].T.dot(u))
+            G[:, i_2] = u * K[:, i_2] * v[i_2]
+            #aviol = (G@one_m - a)
+            #aviol_2 = (G.T@one_n - b)
+            viol += (-old_v + v[i_2]) * K[:, i_2] * u
+            viol_2[i_2] = v[i_2] * K[:, i_2].dot(u) - b[i_2]
+
+            #print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2)))
+
+        if stopThr_val <= stopThr:
+            break
+    else:
+        print('Warning: Algorithm did not converge')
+
+    if log:
+        log['u'] = u
+        log['v'] = v
+
+    if log:
+        return G, log
+    else:
+        return G
+
+
 def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9,
                         warmstart=None, verbose=False, print_period=20, log=False, **kwargs):
     """
@@ -915,6 +1068,116 @@ def barycenter(A, M, reg, weights=None, numItermax=1000,
         return geometricBar(weights, UKv)
 
 
+def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1e-9, stabThr=1e-30, verbose=False, log=False):
+    """Compute the entropic regularized wasserstein barycenter of distributions A
+    where A is a collection of 2D images.
+
+     The function solves the following optimization problem:
+
+    .. math::
+       \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
+
+    where :
+
+    - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance (see ot.bregman.sinkhorn)
+    - :math:`\mathbf{a}_i` are training distributions (2D images) in the mast two dimensions of matrix :math:`\mathbf{A}`
+    - reg is the regularization strength scalar value
+
+    The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [21]_
+
+    Parameters
+    ----------
+    A : np.ndarray (n,w,h)
+        n distributions (2D images) of size w x h
+    reg : float
+        Regularization term >0
+    weights : np.ndarray (n,)
+        Weights of each image on the simplex (barycentric coodinates)
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshol on error (>0)
+    stabThr : float, optional
+        Stabilization threshold to avoid numerical precision issue
+    verbose : bool, optional
+        Print information along iterations
+    log : bool, optional
+        record log if True
+
+
+    Returns
+    -------
+    a : (w,h) ndarray
+        2D Wasserstein barycenter
+    log : dict
+        log dictionary return only if log==True in parameters
+
+
+    References
+    ----------
+
+    .. [21] Solomon, J., De Goes, F., Peyré, G., Cuturi, M., Butscher, A., Nguyen, A. & Guibas, L. (2015).
+    Convolutional wasserstein distances: Efficient optimal transportation on geometric domains
+    ACM Transactions on Graphics (TOG), 34(4), 66
+
+
+    """
+
+    if weights is None:
+        weights = np.ones(A.shape[0]) / A.shape[0]
+    else:
+        assert(len(weights) == A.shape[0])
+
+    if log:
+        log = {'err': []}
+
+    b = np.zeros_like(A[0, :, :])
+    U = np.ones_like(A)
+    KV = np.ones_like(A)
+
+    cpt = 0
+    err = 1
+
+    # build the convolution operator
+    t = np.linspace(0, 1, A.shape[1])
+    [Y, X] = np.meshgrid(t, t)
+    xi1 = np.exp(-(X - Y)**2 / reg)
+
+    def K(x):
+        return np.dot(np.dot(xi1, x), xi1)
+
+    while (err > stopThr and cpt < numItermax):
+
+        bold = b
+        cpt = cpt + 1
+
+        b = np.zeros_like(A[0, :, :])
+        for r in range(A.shape[0]):
+            KV[r, :, :] = K(A[r, :, :] / np.maximum(stabThr, K(U[r, :, :])))
+            b += weights[r] * np.log(np.maximum(stabThr, U[r, :, :] * KV[r, :, :]))
+        b = np.exp(b)
+        for r in range(A.shape[0]):
+            U[r, :, :] = b / np.maximum(stabThr, KV[r, :, :])
+
+        if cpt % 10 == 1:
+            err = np.sum(np.abs(bold - b))
+            # log and verbose print
+            if log:
+                log['err'].append(err)
+
+            if verbose:
+                if cpt % 200 == 0:
+                    print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
+                print('{:5d}|{:8e}|'.format(cpt, err))
+
+    if log:
+        log['niter'] = cpt
+        log['U'] = U
+        return b, log
+    else:
+        return b
+
+
 def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000,
           stopThr=1e-3, verbose=False, log=False):
     """
diff --git a/ot/da.py b/ot/da.py
index 48b418f..bc09e3c 100644
--- a/ot/da.py
+++ b/ot/da.py
@@ -15,7 +15,7 @@ import scipy.linalg as linalg
 from .bregman import sinkhorn
 from .lp import emd
 from .utils import unif, dist, kernel, cost_normalization
-from .utils import check_params, deprecated, BaseEstimator
+from .utils import check_params, BaseEstimator
 from .optim import cg
 from .optim import gcg
 
@@ -740,288 +740,6 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None,
         return A, b
 
 
-@deprecated("The class OTDA is deprecated in 0.3.1 and will be "
-            "removed in 0.5"
-            "\n\tfor standard transport use class EMDTransport instead.")
-class OTDA(object):
-
-    """Class for domain adaptation with optimal transport as proposed in [5]
-
-
-    References
-    ----------
-
-    .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
-       "Optimal Transport for Domain Adaptation," in IEEE Transactions on
-       Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
-
-    """
-
-    def __init__(self, metric='sqeuclidean', norm=None):
-        """ Class initialization"""
-        self.xs = 0
-        self.xt = 0
-        self.G = 0
-        self.metric = metric
-        self.norm = norm
-        self.computed = False
-
-    def fit(self, xs, xt, ws=None, wt=None, max_iter=100000):
-        """Fit domain adaptation between samples is xs and xt
-        (with optional weights)"""
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M = dist(xs, xt, metric=self.metric)
-        self.M = cost_normalization(self.M, self.norm)
-        self.G = emd(ws, wt, self.M, max_iter)
-        self.computed = True
-
-    def interp(self, direction=1):
-        """Barycentric interpolation for the source (1) or target (-1) samples
-
-        This Barycentric interpolation solves for each source (resp target)
-        sample xs (resp xt) the following optimization problem:
-
-        .. math::
-            arg\min_x \sum_i \gamma_{k,i} c(x,x_i^t)
-
-        where k is the index of the sample in xs
-
-        For the moment only squared euclidean distance is provided but more
-        metric  could be used in the future.
-
-        """
-        if direction > 0:  # >0 then source to target
-            G = self.G
-            w = self.ws.reshape((self.xs.shape[0], 1))
-            x = self.xt
-        else:
-            G = self.G.T
-            w = self.wt.reshape((self.xt.shape[0], 1))
-            x = self.xs
-
-        if self.computed:
-            if self.metric == 'sqeuclidean':
-                return np.dot(G / w, x)  # weighted mean
-            else:
-                print(
-                    "Warning, metric not handled yet, using weighted average")
-                return np.dot(G / w, x)  # weighted mean
-                return None
-        else:
-            print("Warning, model not fitted yet, returning None")
-            return None
-
-    def predict(self, x, direction=1):
-        """ Out of sample mapping using the formulation from [6]
-
-        For each sample x to map, it finds the nearest source sample xs and
-        map the samle x to the position xst+(x-xs) wher xst is the barycentric
-        interpolation of source sample xs.
-
-        References
-        ----------
-
-        .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014).
-          Regularized discrete optimal transport. SIAM Journal on Imaging
-          Sciences, 7(3), 1853-1882.
-
-        """
-        if direction > 0:  # >0 then source to target
-            xf = self.xt
-            x0 = self.xs
-        else:
-            xf = self.xs
-            x0 = self.xt
-
-        D0 = dist(x, x0)  # dist netween new samples an source
-        idx = np.argmin(D0, 1)  # closest one
-        xf = self.interp(direction)  # interp the source samples
-        # aply the delta to the interpolation
-        return xf[idx, :] + x - x0[idx, :]
-
-
-@deprecated("The class OTDA_sinkhorn is deprecated in 0.3.1 and will be"
-            " removed in 0.5 \nUse class SinkhornTransport instead.")
-class OTDA_sinkhorn(OTDA):
-
-    """Class for domain adaptation with optimal transport with entropic
-    regularization
-
-
-    """
-
-    def fit(self, xs, xt, reg=1, ws=None, wt=None, **kwargs):
-        """Fit regularized domain adaptation between samples is xs and xt
-        (with optional weights)"""
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M = dist(xs, xt, metric=self.metric)
-        self.M = cost_normalization(self.M, self.norm)
-        self.G = sinkhorn(ws, wt, self.M, reg, **kwargs)
-        self.computed = True
-
-
-@deprecated("The class OTDA_lpl1 is deprecated in 0.3.1 and will be"
-            " removed in 0.5 \nUse class SinkhornLpl1Transport instead.")
-class OTDA_lpl1(OTDA):
-
-    """Class for domain adaptation with optimal transport with entropic and
-    group regularization"""
-
-    def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, **kwargs):
-        """Fit regularized domain adaptation between samples is xs and xt
-        (with optional weights),  See ot.da.sinkhorn_lpl1_mm for fit
-        parameters"""
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M = dist(xs, xt, metric=self.metric)
-        self.M = cost_normalization(self.M, self.norm)
-        self.G = sinkhorn_lpl1_mm(ws, ys, wt, self.M, reg, eta, **kwargs)
-        self.computed = True
-
-
-@deprecated("The class OTDA_l1L2 is deprecated in 0.3.1 and will be"
-            " removed in 0.5 \nUse class SinkhornL1l2Transport instead.")
-class OTDA_l1l2(OTDA):
-
-    """Class for domain adaptation with optimal transport with entropic
-    and group lasso regularization"""
-
-    def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, **kwargs):
-        """Fit regularized domain adaptation between samples is xs and xt
-           (with optional weights),  See ot.da.sinkhorn_lpl1_gl for fit
-           parameters"""
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M = dist(xs, xt, metric=self.metric)
-        self.M = cost_normalization(self.M, self.norm)
-        self.G = sinkhorn_l1l2_gl(ws, ys, wt, self.M, reg, eta, **kwargs)
-        self.computed = True
-
-
-@deprecated("The class OTDA_mapping_linear is deprecated in 0.3.1 and will be"
-            " removed in 0.5 \nUse class MappingTransport instead.")
-class OTDA_mapping_linear(OTDA):
-
-    """Class for optimal transport with joint linear mapping estimation as in
-    [8]
-    """
-
-    def __init__(self):
-        """ Class initialization"""
-
-        self.xs = 0
-        self.xt = 0
-        self.G = 0
-        self.L = 0
-        self.bias = False
-        self.computed = False
-        self.metric = 'sqeuclidean'
-
-    def fit(self, xs, xt, mu=1, eta=1, bias=False, **kwargs):
-        """ Fit domain adaptation between samples is xs and xt (with optional
-            weights)"""
-        self.xs = xs
-        self.xt = xt
-        self.bias = bias
-
-        self.ws = unif(xs.shape[0])
-        self.wt = unif(xt.shape[0])
-
-        self.G, self.L = joint_OT_mapping_linear(
-            xs, xt, mu=mu, eta=eta, bias=bias, **kwargs)
-        self.computed = True
-
-    def mapping(self):
-        return lambda x: self.predict(x)
-
-    def predict(self, x):
-        """ Out of sample mapping estimated during the call to fit"""
-        if self.computed:
-            if self.bias:
-                x = np.hstack((x, np.ones((x.shape[0], 1))))
-            return x.dot(self.L)  # aply the delta to the interpolation
-        else:
-            print("Warning, model not fitted yet, returning None")
-            return None
-
-
-@deprecated("The class OTDA_mapping_kernel is deprecated in 0.3.1 and will be"
-            " removed in 0.5 \nUse class MappingTransport instead.")
-class OTDA_mapping_kernel(OTDA_mapping_linear):
-
-    """Class for optimal transport with joint nonlinear mapping
-    estimation as in [8]"""
-
-    def fit(self, xs, xt, mu=1, eta=1, bias=False, kerneltype='gaussian',
-            sigma=1, **kwargs):
-        """ Fit domain adaptation between samples is xs and xt """
-        self.xs = xs
-        self.xt = xt
-        self.bias = bias
-
-        self.ws = unif(xs.shape[0])
-        self.wt = unif(xt.shape[0])
-        self.kernel = kerneltype
-        self.sigma = sigma
-        self.kwargs = kwargs
-
-        self.G, self.L = joint_OT_mapping_kernel(
-            xs, xt, mu=mu, eta=eta, bias=bias, **kwargs)
-        self.computed = True
-
-    def predict(self, x):
-        """ Out of sample mapping estimated during the call to fit"""
-
-        if self.computed:
-            K = kernel(
-                x, self.xs, method=self.kernel, sigma=self.sigma,
-                **self.kwargs)
-            if self.bias:
-                K = np.hstack((K, np.ones((x.shape[0], 1))))
-            return K.dot(self.L)
-        else:
-            print("Warning, model not fitted yet, returning None")
-            return None
-
-
 def distribution_estimation_uniform(X):
     """estimates a uniform distribution from an array of samples X
 
diff --git a/ot/datasets.py b/ot/datasets.py
index 362a89b..e76e75d 100644
--- a/ot/datasets.py
+++ b/ot/datasets.py
@@ -38,9 +38,9 @@ def make_1D_gauss(n, m, s):
 
 
 @deprecated()
-def get_1D_gauss(n, m, sigma, random_state=None):
+def get_1D_gauss(n, m, sigma):
     """ Deprecated see  make_1D_gauss   """
-    return make_1D_gauss(n, m, sigma, random_state=None)
+    return make_1D_gauss(n, m, sigma)
 
 
 def make_2D_samples_gauss(n, m, sigma, random_state=None):
diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py
index a2fdd3d..deda6b1 100644
--- a/ot/gpu/__init__.py
+++ b/ot/gpu/__init__.py
@@ -1,12 +1,37 @@
 # -*- coding: utf-8 -*-
+"""
 
-from . import bregman
-from . import da
-from .bregman import sinkhorn
+This module provides GPU implementation for several OT solvers and utility 
+functions. The GPU backend in handled by `cupy 
+<https://cupy.chainer.org/>`_.
+
+By default, the functions in this module accept and return numpy arrays 
+in order to proide drop-in replacement for the other POT function but
+the transfer between CPU en GPU comes with a significant overhead.
+
+In order to get the best erformances, we recommend to give only cupy 
+arrays to the functions and desactivate the conversion to numpy of the 
+result of the function with parameter ``to_numpy=False``.
+
+"""
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
 #         Leo Gautheron <https://github.com/aje>
 #
 # License: MIT License
 
-__all__ = ["bregman", "da", "sinkhorn"]
+from . import bregman
+from . import da
+from .bregman import sinkhorn
+from .da import sinkhorn_lpl1_mm
+
+from . import utils
+from .utils import dist, to_gpu, to_np
+
+
+
+
+
+__all__ = ["utils", "dist", "sinkhorn",
+           "sinkhorn_lpl1_mm", 'bregman', 'da', 'to_gpu', 'to_np']
+
diff --git a/ot/gpu/bregman.py b/ot/gpu/bregman.py
index 47939c4..978b307 100644
--- a/ot/gpu/bregman.py
+++ b/ot/gpu/bregman.py
@@ -8,14 +8,18 @@ Bregman projections for regularized OT with GPU
 #
 # License: MIT License
 
-import numpy as np
-import cudamat
+import cupy as np  # np used for matrix computation
+import cupy as cp  # cp used for cupy specific operations
+from . import utils
 
 
-def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
-                log=False, returnAsGPU=False):
-    r"""
-    Solve the entropic regularization optimal transport problem on GPU
+def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9,
+                   verbose=False, log=False, to_numpy=True, **kwargs):
+    """
+    Solve the entropic regularization optimal transport on GPU
+
+    If the input matrix are in numpy format, they will be uploaded to the
+    GPU first which can incur significant time overhead.
 
     The function solves the following optimization problem:
 
@@ -40,9 +44,10 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
     ----------
     a : np.ndarray (ns,)
         samples weights in the source domain
-    b : np.ndarray (nt,)
-        samples in the target domain
-    M_GPU : cudamat.CUDAMatrix (ns,nt)
+    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+        samples in the target domain, compute sinkhorn with multiple targets
+        and fixed M if b is a matrix (return OT loss + dual variables in log)
+    M : np.ndarray (ns,nt)
         loss matrix
     reg : float
         Regularization term >0
@@ -54,8 +59,9 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
         Print information along iterations
     log : bool, optional
         record log if True
-    returnAsGPU : bool, optional
-        return the OT matrix as a cudamat.CUDAMatrix
+    to_numpy : boolean, optional (default True)
+        If true convert back the GPU array result to numpy format.
+
 
     Returns
     -------
@@ -88,60 +94,78 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
     ot.optim.cg : General regularized OT
 
     """
+
+    a = cp.asarray(a)
+    b = cp.asarray(b)
+    M = cp.asarray(M)
+
+    if len(a) == 0:
+        a = np.ones((M.shape[0],)) / M.shape[0]
+    if len(b) == 0:
+        b = np.ones((M.shape[1],)) / M.shape[1]
+
     # init data
     Nini = len(a)
     Nfin = len(b)
 
+    if len(b.shape) > 1:
+        nbb = b.shape[1]
+    else:
+        nbb = 0
+
     if log:
         log = {'err': []}
 
     # we assume that no distances are null except those of the diagonal of
     # distances
-    u = (np.ones(Nini) / Nini).reshape((Nini, 1))
-    u_GPU = cudamat.CUDAMatrix(u)
-    a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1)))
-    ones_GPU = cudamat.empty(u_GPU.shape).assign(1)
-    v = (np.ones(Nfin) / Nfin).reshape((Nfin, 1))
-    v_GPU = cudamat.CUDAMatrix(v)
-    b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1)))
-
-    M_GPU.divide(-reg)
+    if nbb:
+        u = np.ones((Nini, nbb)) / Nini
+        v = np.ones((Nfin, nbb)) / Nfin
+    else:
+        u = np.ones(Nini) / Nini
+        v = np.ones(Nfin) / Nfin
 
-    K_GPU = cudamat.exp(M_GPU)
+    # print(reg)
 
-    ones_GPU.divide(a_GPU, target=a_GPU)
-    Kp_GPU = cudamat.empty(K_GPU.shape)
-    K_GPU.mult_by_col(a_GPU, target=Kp_GPU)
+    # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute
+    K = np.empty(M.shape, dtype=M.dtype)
+    np.divide(M, -reg, out=K)
+    np.exp(K, out=K)
 
-    tmp_GPU = cudamat.empty(K_GPU.shape)
+    # print(np.min(K))
+    tmp2 = np.empty(b.shape, dtype=M.dtype)
 
+    Kp = (1 / a).reshape(-1, 1) * K
     cpt = 0
     err = 1
     while (err > stopThr and cpt < numItermax):
-        uprev_GPU = u_GPU.copy()
-        vprev_GPU = v_GPU.copy()
+        uprev = u
+        vprev = v
 
-        KtransposeU_GPU = K_GPU.transpose().dot(u_GPU)
-        b_GPU.divide(KtransposeU_GPU, target=v_GPU)
-        ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU)
+        KtransposeU = np.dot(K.T, u)
+        v = np.divide(b, KtransposeU)
+        u = 1. / np.dot(Kp, v)
 
-        if (np.any(KtransposeU_GPU.asarray() == 0) or
-                not u_GPU.allfinite() or not v_GPU.allfinite()):
+        if (np.any(KtransposeU == 0) or
+                np.any(np.isnan(u)) or np.any(np.isnan(v)) or
+                np.any(np.isinf(u)) or np.any(np.isinf(v))):
             # we have reached the machine precision
             # come back to previous solution and quit loop
             print('Warning: numerical errors at iteration', cpt)
-            u_GPU = uprev_GPU.copy()
-            v_GPU = vprev_GPU.copy()
+            u = uprev
+            v = vprev
             break
         if cpt % 10 == 0:
             # we can speed up the process by checking for the error only all
             # the 10th iterations
-            K_GPU.mult_by_col(u_GPU, target=tmp_GPU)
-            tmp_GPU.mult_by_row(v_GPU.transpose(), target=tmp_GPU)
-
-            bcopy_GPU = b_GPU.copy().transpose()
-            bcopy_GPU.add_sums(tmp_GPU, axis=0, beta=-1)
-            err = bcopy_GPU.euclid_norm()**2
+            if nbb:
+                err = np.sum((u - uprev)**2) / np.sum((u)**2) + \
+                    np.sum((v - vprev)**2) / np.sum((v)**2)
+            else:
+                # compute right marginal tmp2= (diag(u)Kdiag(v))^T1
+                tmp2 = np.sum(u[:, None] * K * v[None, :], 0)
+                #tmp2=np.einsum('i,ij,j->j', u, K, v)
+                err = np.linalg.norm(tmp2 - b)**2  # violation of marginal
             if log:
                 log['err'].append(err)
 
@@ -150,20 +174,32 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
                     print(
                         '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
                 print('{:5d}|{:8e}|'.format(cpt, err))
-        cpt += 1
-    if log:
-        log['u'] = u_GPU.asarray()
-        log['v'] = v_GPU.asarray()
-
-    K_GPU.mult_by_col(u_GPU, target=K_GPU)
-    K_GPU.mult_by_row(v_GPU.transpose(), target=K_GPU)
-
-    if returnAsGPU:
-        res = K_GPU
-    else:
-        res = K_GPU.asarray()
-
+        cpt = cpt + 1
     if log:
-        return res, log
-    else:
-        return res
+        log['u'] = u
+        log['v'] = v
+
+    if nbb:  # return only loss
+        #res = np.einsum('ik,ij,jk,ij->k', u, K, v, M) (explodes cupy memory)
+        res = np.empty(nbb)
+        for i in range(nbb):
+            res[i] = np.sum(u[:, None, i] * (K * M) * v[None, :, i])
+        if to_numpy:
+            res = utils.to_np(res)
+        if log:
+            return res, log
+        else:
+            return res
+
+    else:  # return OT matrix
+        res = u.reshape((-1, 1)) * K * v.reshape((1, -1))
+        if to_numpy:
+            res = utils.to_np(res)
+        if log:
+            return res, log
+        else:
+            return res
+
+
+# define sinkhorn as sinkhorn_knopp
+sinkhorn = sinkhorn_knopp
diff --git a/ot/gpu/da.py b/ot/gpu/da.py
index 71a485a..4a98038 100644
--- a/ot/gpu/da.py
+++ b/ot/gpu/da.py
@@ -11,80 +11,30 @@ Domain adaptation with optimal transport with GPU implementation
 # License: MIT License
 
 
-import numpy as np
-from ..utils import unif
-from ..da import OTDA
+import cupy as np  # np used for matrix computation
+import cupy as cp  # cp used for cupy specific operations
+import numpy as npp
+from . import utils
+
 from .bregman import sinkhorn
-import cudamat
 
 
-def pairwiseEuclideanGPU(a, b, returnAsGPU=False, squared=False):
+def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10,
+                     numInnerItermax=200, stopInnerThr=1e-9, verbose=False,
+                     log=False, to_numpy=True):
     """
-    Compute the pairwise euclidean distance between matrices a and b.
-
-
-    Parameters
-    ----------
-    a : np.ndarray (n, f)
-        first matrice
-    b : np.ndarray (m, f)
-        second matrice
-    returnAsGPU : boolean, optional (default False)
-        if True, returns cudamat matrix still on GPU, else return np.ndarray
-    squared : boolean, optional (default False)
-        if True, return squared euclidean distance matrice
+    Solve the entropic regularization optimal transport problem with nonconvex
+    group lasso regularization on GPU
 
+    If the input matrix are in numpy format, they will be uploaded to the
+    GPU first which can incur significant time overhead.
 
-    Returns
-    -------
-    c : (n x m) np.ndarray or cudamat.CUDAMatrix
-        pairwise euclidean distance distance matrix
-    """
-    # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m).
-    # First compute in c_GPU the squared euclidean distance. And return its
-    # square root. At each cell [i,j] of c, we want to have
-    # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that
-    # (a-b)^2 = a^2 -2ab +b^2. Thus we want to have in each cell of c:
-    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2).
-
-    a_GPU = cudamat.CUDAMatrix(a)
-    b_GPU = cudamat.CUDAMatrix(b)
-
-    # Multiply a by b transpose to obtain in each cell [i,j] of c the
-    # value sum{k in range(f)} ( a[i,k]b[j,k] )
-    c_GPU = cudamat.dot(a_GPU, b_GPU.transpose())
-    # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] )
-    c_GPU.mult(-2)
-
-    # Compute the vectors of the sum of squared elements.
-    a_GPU = cudamat.pow(a_GPU, 2).sum(axis=1)
-    b_GPU = cudamat.pow(b_GPU, 2).sum(axis=1)
-
-    # Add the vectors in each columns (respectivly rows) of c.
-    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] )
-    c_GPU.add_col_vec(a_GPU)
-    # sum{k in range(f)} ( a[i,k]^2 -2a[i,k]b[j,k] +b[j,k]^2)
-    c_GPU.add_row_vec(b_GPU.transpose())
-
-    if not squared:
-        c_GPU = cudamat.sqrt(c_GPU)
-
-    if returnAsGPU:
-        return c_GPU
-    else:
-        return c_GPU.asarray()
-
-
-def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
-                     numInnerItermax=200, stopInnerThr=1e-9,
-                     verbose=False, log=False):
-    """
-    Solve the entropic regularization optimal transport problem with nonconvex group lasso regularization
 
     The function solves the following optimization problem:
 
     .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)+ \eta \Omega_g(\gamma)
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma)
+        + \eta \Omega_g(\gamma)
 
         s.t. \gamma 1 = a
 
@@ -94,11 +44,16 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
     where :
 
     - M is the (ns,nt) metric cost matrix
-    - :math:`\Omega_e` is the entropic regularization term :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-    - :math:`\Omega_g` is the group lasso  regulaization term :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1`   where  :math:`\mathcal{I}_c` are the index of samples from class c in the source domain.
+    - :math:`\Omega_e` is the entropic regularization term
+        :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - :math:`\Omega_g` is the group lasso  regulaization term
+      :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1`
+      where  :math:`\mathcal{I}_c` are the index of samples from class c
+      in the source domain.
     - a and b are source and target weights (sum to 1)
 
-    The algorithm used for solving the problem is the generalised conditional gradient as proposed in  [5]_ [7]_
+    The algorithm used for solving the problem is the generalised conditional
+    gradient as proposed in  [5]_ [7]_
 
 
     Parameters
@@ -109,7 +64,7 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
         labels of samples in the source domain
     b : np.ndarray (nt,)
         samples weights in the target domain
-    M_GPU : cudamat.CUDAMatrix (ns,nt)
+    M : np.ndarray (ns,nt)
         loss matrix
     reg : float
         Regularization term for entropic regularization >0
@@ -125,6 +80,8 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
         Print information along iterations
     log : bool, optional
         record log if True
+    to_numpy : boolean, optional (default True)
+        If true convert back the GPU array result to numpy format.
 
 
     Returns
@@ -138,8 +95,13 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
     References
     ----------
 
-    .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1
-    .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567.
+    .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy,
+       "Optimal Transport for Domain Adaptation," in IEEE
+       Transactions on Pattern Analysis and Machine Intelligence ,
+       vol.PP, no.99, pp.1-1
+    .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015).
+       Generalized conditional gradient: analysis of convergence
+       and applications. arXiv preprint arXiv:1510.06567.
 
     See Also
     --------
@@ -148,108 +110,35 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
     ot.optim.cg : General regularized OT
 
     """
+
+    a, labels_a, b, M = utils.to_gpu(a, labels_a, b, M)
+
     p = 0.5
     epsilon = 1e-3
-    Nfin = len(b)
 
     indices_labels = []
-    classes = np.unique(labels_a)
+    labels_a2 = cp.asnumpy(labels_a)
+    classes = npp.unique(labels_a2)
     for c in classes:
-        idxc, = np.where(labels_a == c)
-        indices_labels.append(cudamat.CUDAMatrix(idxc.reshape(1, -1)))
+        idxc, = utils.to_gpu(npp.where(labels_a2 == c))
+        indices_labels.append(idxc)
 
-    Mreg_GPU = cudamat.empty(M_GPU.shape)
-    W_GPU = cudamat.empty(M_GPU.shape).assign(0)
+    W = np.zeros(M.shape)
 
     for cpt in range(numItermax):
-        Mreg_GPU.assign(M_GPU)
-        Mreg_GPU.add_mult(W_GPU, eta)
-        transp_GPU = sinkhorn(a, b, Mreg_GPU, reg, numItermax=numInnerItermax,
-                              stopThr=stopInnerThr, returnAsGPU=True)
+        Mreg = M + eta * W
+        transp = sinkhorn(a, b, Mreg, reg, numItermax=numInnerItermax,
+                          stopThr=stopInnerThr, to_numpy=False)
         # the transport has been computed. Check if classes are really
         # separated
-        W_GPU.assign(1)
-        W_GPU = W_GPU.transpose()
+        W = np.ones(M.shape)
         for (i, c) in enumerate(classes):
-            (_, nbRow) = indices_labels[i].shape
-            tmpC_GPU = cudamat.empty((Nfin, nbRow)).assign(0)
-            transp_GPU.transpose().select_columns(indices_labels[i], tmpC_GPU)
-            majs_GPU = tmpC_GPU.sum(axis=1).add(epsilon)
-            cudamat.pow(majs_GPU, (p - 1))
-            majs_GPU.mult(p)
-
-            tmpC_GPU.assign(0)
-            tmpC_GPU.add_col_vec(majs_GPU)
-            W_GPU.set_selected_columns(indices_labels[i], tmpC_GPU)
-
-        W_GPU = W_GPU.transpose()
-
-    return transp_GPU.asarray()
 
+            majs = np.sum(transp[indices_labels[i]], axis=0)
+            majs = p * ((majs + epsilon)**(p - 1))
+            W[indices_labels[i]] = majs
 
-class OTDA_GPU(OTDA):
-
-    def normalizeM(self, norm):
-        if norm == "median":
-            self.M_GPU.divide(float(np.median(self.M_GPU.asarray())))
-        elif norm == "max":
-            self.M_GPU.divide(float(np.max(self.M_GPU.asarray())))
-        elif norm == "log":
-            self.M_GPU.add(1)
-            cudamat.log(self.M_GPU)
-        elif norm == "loglog":
-            self.M_GPU.add(1)
-            cudamat.log(self.M_GPU)
-            self.M_GPU.add(1)
-            cudamat.log(self.M_GPU)
-
-
-class OTDA_sinkhorn(OTDA_GPU):
-
-    def fit(self, xs, xt, reg=1, ws=None, wt=None, norm=None, **kwargs):
-        cudamat.init()
-        xs = np.asarray(xs, dtype=np.float64)
-        xt = np.asarray(xt, dtype=np.float64)
-
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M_GPU = pairwiseEuclideanGPU(xs, xt, returnAsGPU=True,
-                                          squared=True)
-        self.normalizeM(norm)
-        self.G = sinkhorn(ws, wt, self.M_GPU, reg, **kwargs)
-        self.computed = True
-
-
-class OTDA_lpl1(OTDA_GPU):
-
-    def fit(self, xs, ys, xt, reg=1, eta=1, ws=None, wt=None, norm=None,
-            **kwargs):
-        cudamat.init()
-        xs = np.asarray(xs, dtype=np.float64)
-        xt = np.asarray(xt, dtype=np.float64)
-
-        self.xs = xs
-        self.xt = xt
-
-        if wt is None:
-            wt = unif(xt.shape[0])
-        if ws is None:
-            ws = unif(xs.shape[0])
-
-        self.ws = ws
-        self.wt = wt
-
-        self.M_GPU = pairwiseEuclideanGPU(xs, xt, returnAsGPU=True,
-                                          squared=True)
-        self.normalizeM(norm)
-        self.G = sinkhorn_lpl1_mm(ws, ys, wt, self.M_GPU, reg, eta, **kwargs)
-        self.computed = True
+    if to_numpy:
+        return utils.to_np(transp)
+    else:
+        return transp
diff --git a/ot/gpu/utils.py b/ot/gpu/utils.py
new file mode 100644
index 0000000..41e168a
--- /dev/null
+++ b/ot/gpu/utils.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+"""
+Utility functions for GPU
+"""
+
+# Author: Remi Flamary <remi.flamary@unice.fr>
+#         Nicolas Courty <ncourty@irisa.fr>
+#         Leo Gautheron <https://github.com/aje>
+#
+# License: MIT License
+
+import cupy as np  # np used for matrix computation
+import cupy as cp  # cp used for cupy specific operations
+
+
+def euclidean_distances(a, b, squared=False, to_numpy=True):
+    """
+    Compute the pairwise euclidean distance between matrices a and b.
+
+    If the input matrix are in numpy format, they will be uploaded to the
+    GPU first which can incur significant time overhead.
+
+    Parameters
+    ----------
+    a : np.ndarray (n, f)
+        first matrix
+    b : np.ndarray (m, f)
+        second matrix
+    to_numpy : boolean, optional (default True)
+        If true convert back the GPU array result to numpy format.
+    squared : boolean, optional (default False)
+        if True, return squared euclidean distance matrix
+
+    Returns
+    -------
+    c : (n x m) np.ndarray or cupy.ndarray
+        pairwise euclidean distance distance matrix
+    """
+
+    a, b = to_gpu(a, b)
+
+    a2 = np.sum(np.square(a), 1)
+    b2 = np.sum(np.square(b), 1)
+
+    c = -2 * np.dot(a, b.T)
+    c += a2[:, None]
+    c += b2[None, :]
+
+    if not squared:
+        np.sqrt(c, out=c)
+    if to_numpy:
+        return to_np(c)
+    else:
+        return c
+
+
+def dist(x1, x2=None, metric='sqeuclidean', to_numpy=True):
+    """Compute distance between samples in x1 and x2 on gpu
+
+    Parameters
+    ----------
+
+    x1 : np.array (n1,d)
+        matrix with n1 samples of size d
+    x2 : np.array (n2,d), optional
+        matrix with n2 samples of size d (if None then x2=x1)
+    metric : str
+        Metric from 'sqeuclidean', 'euclidean',
+
+
+    Returns
+    -------
+
+    M : np.array (n1,n2)
+        distance matrix computed with given metric
+
+    """
+    if x2 is None:
+        x2 = x1
+    if metric == "sqeuclidean":
+        return euclidean_distances(x1, x2, squared=True, to_numpy=to_numpy)
+    elif metric == "euclidean":
+        return euclidean_distances(x1, x2, squared=False, to_numpy=to_numpy)
+    else:
+        raise NotImplementedError
+
+
+def to_gpu(*args):
+    """ Upload numpy arrays to GPU and return them"""
+    if len(args) > 1:
+        return (cp.asarray(x) for x in args)
+    else:
+        return cp.asarray(args[0])
+
+
+def to_np(*args):
+    """ convert GPU arras to numpy and return them"""
+    if len(args) > 1:
+        return (cp.asnumpy(x) for x in args)
+    else:
+        return cp.asnumpy(args[0])
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 5dda82a..02cbd8c 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -17,6 +17,9 @@ from .import cvx
 from .emd_wrap import emd_c, check_result
 from ..utils import parmap
 from .cvx import barycenter
+from ..utils import dist
+
+__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx']
 
 
 def emd(a, b, M, numItermax=100000, log=False):
@@ -214,3 +217,95 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
 
     res = parmap(f, [b[:, i] for i in range(nb)], processes)
     return res
+
+
+
+def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None):
+    """
+    Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance)
+
+    The function solves the Wasserstein barycenter problem when the barycenter measure is constrained to be supported on k atoms.
+    This problem is considered in [1] (Algorithm 2). There are two differences with the following codes:
+    - we do not optimize over the weights
+    - we do not do line search for the locations updates, we use i.e. theta = 1 in [1] (Algorithm 2). This can be seen as a discrete implementation of the fixed-point algorithm of [2] proposed in the continuous setting.
+
+    Parameters
+    ----------
+    measures_locations : list of (k_i,d) np.ndarray
+        The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list)
+    measures_weights : list of (k_i,) np.ndarray
+        Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure
+
+    X_init : (k,d) np.ndarray
+        Initialization of the support locations (on k atoms) of the barycenter
+    b : (k,) np.ndarray
+        Initialization of the weights of the barycenter (non-negatives, sum to 1)
+    weights : (k,) np.ndarray
+        Initialization of the coefficients of the barycenter (non-negatives, sum to 1)
+
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshol on error (>0)
+    verbose : bool, optional
+        Print information along iterations
+    log : bool, optional
+        record log if True
+
+    Returns
+    -------
+    X : (k,d) np.ndarray
+        Support locations (on k atoms) of the barycenter
+
+    References
+    ----------
+
+    .. [1] Cuturi, Marco, and Arnaud Doucet. "Fast computation of Wasserstein barycenters." International Conference on Machine Learning. 2014.
+
+    .. [2]  Álvarez-Esteban, Pedro C., et al. "A fixed-point approach to barycenters in Wasserstein space." Journal of Mathematical Analysis and Applications 441.2 (2016): 744-762.
+
+    """
+
+    iter_count = 0
+
+    N = len(measures_locations)
+    k = X_init.shape[0]
+    d = X_init.shape[1]
+    if b is None:
+        b = np.ones((k,))/k
+    if weights is None:
+        weights = np.ones((N,)) / N
+
+    X = X_init
+
+    log_dict = {}
+    displacement_square_norms = []
+
+    displacement_square_norm = stopThr + 1.
+
+    while ( displacement_square_norm > stopThr and iter_count < numItermax ):
+
+        T_sum = np.zeros((k, d))
+
+        for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights, weights.tolist()):
+
+            M_i = dist(X, measure_locations_i)
+            T_i = emd(b, measure_weights_i, M_i)
+            T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i)
+
+        displacement_square_norm = np.sum(np.square(T_sum-X))
+        if log:
+            displacement_square_norms.append(displacement_square_norm)
+
+        X = T_sum
+
+        if verbose:
+            print('iteration %d, displacement_square_norm=%f\n', iter_count, displacement_square_norm)
+
+        iter_count += 1
+
+    if log:
+        log_dict['displacement_square_norms'] = displacement_square_norms
+        return X, log_dict
+    else:
+        return X
+\ No newline at end of file
diff --git a/ot/lp/cvx.py b/ot/lp/cvx.py
index c8c75bc..8e763be 100644
--- a/ot/lp/cvx.py
+++ b/ot/lp/cvx.py
@@ -11,6 +11,7 @@ import numpy as np
 import scipy as sp
 import scipy.sparse as sps
 
+
 try:
     import cvxopt
     from cvxopt import solvers, matrix, spmatrix
@@ -26,7 +27,7 @@ def scipy_sparse_to_spmatrix(A):
 
 
 def barycenter(A, M, weights=None, verbose=False, log=False, solver='interior-point'):
-    """Compute the entropic regularized wasserstein barycenter of distributions A
+    """Compute the Wasserstein barycenter of distributions A
 
      The function solves the following optimization problem [16]:
 
diff --git a/ot/smooth.py b/ot/smooth.py
new file mode 100644
index 0000000..5a8e4b5
--- /dev/null
+++ b/ot/smooth.py
@@ -0,0 +1,600 @@
+#Copyright (c) 2018, Mathieu Blondel
+#All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions are met:
+#
+#1. Redistributions of source code must retain the above copyright notice, this
+#list of conditions and the following disclaimer.
+#
+#2. Redistributions in binary form must reproduce the above copyright notice,
+#this list of conditions and the following disclaimer in the documentation and/or
+#other materials provided with the distribution.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+#IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+#INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+#NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+#OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+#LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+#OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+#THE POSSIBILITY OF SUCH DAMAGE.
+
+# Author: Mathieu Blondel
+#         Remi Flamary <remi.flamary@unice.fr>
+
+"""
+Implementation of
+Smooth and Sparse Optimal Transport.
+Mathieu Blondel, Vivien Seguy, Antoine Rolet.
+In Proc. of AISTATS 2018.
+https://arxiv.org/abs/1710.06276
+
+[17] Blondel, M., Seguy, V., & Rolet, A. (2018). Smooth and Sparse Optimal
+Transport. Proceedings of the Twenty-First International Conference on
+Artificial Intelligence and Statistics (AISTATS).
+
+Original code from https://github.com/mblondel/smooth-ot/
+
+"""
+
+import numpy as np
+from scipy.optimize import minimize
+
+
+def projection_simplex(V, z=1, axis=None):
+    """ Projection of x onto the simplex, scaled by z
+
+        P(x; z) = argmin_{y >= 0, sum(y) = z} ||y - x||^2
+    z: float or array
+        If array, len(z) must be compatible with V
+    axis: None or int
+        - axis=None: project V by P(V.ravel(); z)
+        - axis=1: project each V[i] by P(V[i]; z[i])
+        - axis=0: project each V[:, j] by P(V[:, j]; z[j])
+    """
+    if axis == 1:
+        n_features = V.shape[1]
+        U = np.sort(V, axis=1)[:, ::-1]
+        z = np.ones(len(V)) * z
+        cssv = np.cumsum(U, axis=1) - z[:, np.newaxis]
+        ind = np.arange(n_features) + 1
+        cond = U - cssv / ind > 0
+        rho = np.count_nonzero(cond, axis=1)
+        theta = cssv[np.arange(len(V)), rho - 1] / rho
+        return np.maximum(V - theta[:, np.newaxis], 0)
+
+    elif axis == 0:
+        return projection_simplex(V.T, z, axis=1).T
+
+    else:
+        V = V.ravel().reshape(1, -1)
+        return projection_simplex(V, z, axis=1).ravel()
+
+
+class Regularization(object):
+    """Base class for Regularization objects
+
+        Notes
+        -----
+        This class is not intended for direct use but as aparent for true
+        regularizatiojn implementation.
+    """
+
+    def __init__(self, gamma=1.0):
+        """
+
+        Parameters
+        ----------
+        gamma: float
+            Regularization parameter.
+            We recover unregularized OT when gamma -> 0.
+
+        """
+        self.gamma = gamma
+
+    def delta_Omega(X):
+        """
+        Compute delta_Omega(X[:, j]) for each X[:, j].
+        delta_Omega(x) = sup_{y >= 0} y^T x - Omega(y).
+
+        Parameters
+        ----------
+        X: array, shape = len(a) x len(b)
+            Input array.
+
+        Returns
+        -------
+        v: array, len(b)
+            Values: v[j] = delta_Omega(X[:, j])
+        G: array, len(a) x len(b)
+            Gradients: G[:, j] = nabla delta_Omega(X[:, j])
+        """
+        raise NotImplementedError
+
+    def max_Omega(X, b):
+        """
+        Compute max_Omega_j(X[:, j]) for each X[:, j].
+        max_Omega_j(x) = sup_{y >= 0, sum(y) = 1} y^T x - Omega(b[j] y) / b[j].
+
+        Parameters
+        ----------
+        X: array, shape = len(a) x len(b)
+            Input array.
+
+        Returns
+        -------
+        v: array, len(b)
+            Values: v[j] = max_Omega_j(X[:, j])
+        G: array, len(a) x len(b)
+            Gradients: G[:, j] = nabla max_Omega_j(X[:, j])
+        """
+        raise NotImplementedError
+
+    def Omega(T):
+        """
+        Compute regularization term.
+
+        Parameters
+        ----------
+        T: array, shape = len(a) x len(b)
+            Input array.
+
+        Returns
+        -------
+        value: float
+            Regularization term.
+        """
+        raise NotImplementedError
+
+
+class NegEntropy(Regularization):
+    """ NegEntropy regularization """
+
+    def delta_Omega(self, X):
+        G = np.exp(X / self.gamma - 1)
+        val = self.gamma * np.sum(G, axis=0)
+        return val, G
+
+    def max_Omega(self, X, b):
+        max_X = np.max(X, axis=0) / self.gamma
+        exp_X = np.exp(X / self.gamma - max_X)
+        val = self.gamma * (np.log(np.sum(exp_X, axis=0)) + max_X)
+        val -= self.gamma * np.log(b)
+        G = exp_X / np.sum(exp_X, axis=0)
+        return val, G
+
+    def Omega(self, T):
+        return self.gamma * np.sum(T * np.log(T))
+
+
+class SquaredL2(Regularization):
+    """ Squared L2 regularization """
+
+    def delta_Omega(self, X):
+        max_X = np.maximum(X, 0)
+        val = np.sum(max_X ** 2, axis=0) / (2 * self.gamma)
+        G = max_X / self.gamma
+        return val, G
+
+    def max_Omega(self, X, b):
+        G = projection_simplex(X / (b * self.gamma), axis=0)
+        val = np.sum(X * G, axis=0)
+        val -= 0.5 * self.gamma * b * np.sum(G * G, axis=0)
+        return val, G
+
+    def Omega(self, T):
+        return 0.5 * self.gamma * np.sum(T ** 2)
+
+
+def dual_obj_grad(alpha, beta, a, b, C, regul):
+    """
+    Compute objective value and gradients of dual objective.
+
+    Parameters
+    ----------
+    alpha: array, shape = len(a)
+    beta: array, shape = len(b)
+        Current iterate of dual potentials.
+    a: array, shape = len(a)
+    b: array, shape = len(b)
+        Input histograms (should be non-negative and sum to 1).
+    C: array, shape = len(a) x len(b)
+        Ground cost matrix.
+    regul: Regularization object
+        Should implement a delta_Omega(X) method.
+
+    Returns
+    -------
+    obj: float
+        Objective value (higher is better).
+    grad_alpha: array, shape = len(a)
+        Gradient w.r.t. alpha.
+    grad_beta: array, shape = len(b)
+        Gradient w.r.t. beta.
+    """
+    obj = np.dot(alpha, a) + np.dot(beta, b)
+    grad_alpha = a.copy()
+    grad_beta = b.copy()
+
+    # X[:, j] = alpha + beta[j] - C[:, j]
+    X = alpha[:, np.newaxis] + beta - C
+
+    # val.shape = len(b)
+    # G.shape = len(a) x len(b)
+    val, G = regul.delta_Omega(X)
+
+    obj -= np.sum(val)
+    grad_alpha -= G.sum(axis=1)
+    grad_beta -= G.sum(axis=0)
+
+    return obj, grad_alpha, grad_beta
+
+
+def solve_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500,
+               verbose=False):
+    """
+    Solve the "smoothed" dual objective.
+
+    Parameters
+    ----------
+    a: array, shape = len(a)
+    b: array, shape = len(b)
+        Input histograms (should be non-negative and sum to 1).
+    C: array, shape = len(a) x len(b)
+        Ground cost matrix.
+    regul: Regularization object
+        Should implement a delta_Omega(X) method.
+    method: str
+        Solver to be used (passed to `scipy.optimize.minimize`).
+    tol: float
+        Tolerance parameter.
+    max_iter: int
+        Maximum number of iterations.
+
+    Returns
+    -------
+    alpha: array, shape = len(a)
+    beta: array, shape = len(b)
+        Dual potentials.
+    """
+
+    def _func(params):
+        # Unpack alpha and beta.
+        alpha = params[:len(a)]
+        beta = params[len(a):]
+
+        obj, grad_alpha, grad_beta = dual_obj_grad(alpha, beta, a, b, C, regul)
+
+        # Pack grad_alpha and grad_beta.
+        grad = np.concatenate((grad_alpha, grad_beta))
+
+        # We need to maximize the dual.
+        return -obj, -grad
+
+    # Unfortunately, `minimize` only supports functions whose argument is a
+    # vector. So, we need to concatenate alpha and beta.
+    alpha_init = np.zeros(len(a))
+    beta_init = np.zeros(len(b))
+    params_init = np.concatenate((alpha_init, beta_init))
+
+    res = minimize(_func, params_init, method=method, jac=True,
+                   tol=tol, options=dict(maxiter=max_iter, disp=verbose))
+
+    alpha = res.x[:len(a)]
+    beta = res.x[len(a):]
+
+    return alpha, beta, res
+
+
+def semi_dual_obj_grad(alpha, a, b, C, regul):
+    """
+    Compute objective value and gradient of semi-dual objective.
+
+    Parameters
+    ----------
+    alpha: array, shape = len(a)
+        Current iterate of semi-dual potentials.
+    a: array, shape = len(a)
+    b: array, shape = len(b)
+        Input histograms (should be non-negative and sum to 1).
+    C: array, shape = len(a) x len(b)
+        Ground cost matrix.
+    regul: Regularization object
+        Should implement a max_Omega(X) method.
+
+    Returns
+    -------
+    obj: float
+        Objective value (higher is better).
+    grad: array, shape = len(a)
+        Gradient w.r.t. alpha.
+    """
+    obj = np.dot(alpha, a)
+    grad = a.copy()
+
+    # X[:, j] = alpha - C[:, j]
+    X = alpha[:, np.newaxis] - C
+
+    # val.shape = len(b)
+    # G.shape = len(a) x len(b)
+    val, G = regul.max_Omega(X, b)
+
+    obj -= np.dot(b, val)
+    grad -= np.dot(G, b)
+
+    return obj, grad
+
+
+def solve_semi_dual(a, b, C, regul, method="L-BFGS-B", tol=1e-3, max_iter=500,
+                    verbose=False):
+    """
+    Solve the "smoothed" semi-dual objective.
+
+    Parameters
+    ----------
+    a: array, shape = len(a)
+    b: array, shape = len(b)
+        Input histograms (should be non-negative and sum to 1).
+    C: array, shape = len(a) x len(b)
+        Ground cost matrix.
+    regul: Regularization object
+        Should implement a max_Omega(X) method.
+    method: str
+        Solver to be used (passed to `scipy.optimize.minimize`).
+    tol: float
+        Tolerance parameter.
+    max_iter: int
+        Maximum number of iterations.
+
+    Returns
+    -------
+    alpha: array, shape = len(a)
+        Semi-dual potentials.
+    """
+
+    def _func(alpha):
+        obj, grad = semi_dual_obj_grad(alpha, a, b, C, regul)
+        # We need to maximize the semi-dual.
+        return -obj, -grad
+
+    alpha_init = np.zeros(len(a))
+
+    res = minimize(_func, alpha_init, method=method, jac=True,
+                   tol=tol, options=dict(maxiter=max_iter, disp=verbose))
+
+    return res.x, res
+
+
+def get_plan_from_dual(alpha, beta, C, regul):
+    """
+    Retrieve optimal transportation plan from optimal dual potentials.
+
+    Parameters
+    ----------
+    alpha: array, shape = len(a)
+    beta: array, shape = len(b)
+        Optimal dual potentials.
+    C: array, shape = len(a) x len(b)
+        Ground cost matrix.
+    regul: Regularization object
+        Should implement a delta_Omega(X) method.
+
+    Returns
+    -------
+    T: array, shape = len(a) x len(b)
+        Optimal transportation plan.
+    """
+    X = alpha[:, np.newaxis] + beta - C
+    return regul.delta_Omega(X)[1]
+
+
+def get_plan_from_semi_dual(alpha, b, C, regul):
+    """
+    Retrieve optimal transportation plan from optimal semi-dual potentials.
+
+    Parameters
+    ----------
+    alpha: array, shape = len(a)
+        Optimal semi-dual potentials.
+    b: array, shape = len(b)
+        Second input histogram (should be non-negative and sum to 1).
+    C: array, shape = len(a) x len(b)
+        Ground cost matrix.
+    regul: Regularization object
+        Should implement a delta_Omega(X) method.
+
+    Returns
+    -------
+    T: array, shape = len(a) x len(b)
+        Optimal transportation plan.
+    """
+    X = alpha[:, np.newaxis] - C
+    return regul.max_Omega(X, b)[1] * b
+
+
+def smooth_ot_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9,
+                   numItermax=500, verbose=False, log=False):
+    r"""
+    Solve the regularized OT problem in the dual and return the OT matrix
+
+    The function solves the smooth relaxed dual formulation (7) in [17]_ :
+
+    .. math::
+        \max_{\alpha,\beta}\quad a^T\alpha+b^T\beta-\sum_j\delta_\Omega(\alpha+\beta_j-\mathbf{m}_j)
+
+    where :
+
+    - :math:`\mathbf{m}_j` is the jth column of the cost matrix
+    - :math:`\delta_\Omega` is the convex conjugate of the regularization term :math:`\Omega`
+    - a and b are source and target weights (sum to 1)
+
+    The OT matrix can is reconstructed from the gradient of :math:`\delta_\Omega`
+    (See [17]_ Proposition 1).
+    The optimization algorithm is using gradient decent (L-BFGS by default).
+
+
+    Parameters
+    ----------
+    a : np.ndarray (ns,)
+        samples weights in the source domain
+    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+        samples in the target domain, compute sinkhorn with multiple targets
+        and fixed M if b is a matrix (return OT loss + dual variables in log)
+    M : np.ndarray (ns,nt)
+        loss matrix
+    reg : float
+        Regularization term >0
+    reg_type : str
+        Regularization type,  can be the following (default ='l2'):
+        - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn [2]_)
+        - 'l2' : Squared Euclidean regularization
+    method : str
+        Solver to use for scipy.optimize.minimize
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshol on error (>0)
+    verbose : bool, optional
+        Print information along iterations
+    log : bool, optional
+        record log if True
+
+
+    Returns
+    -------
+    gamma : (ns x nt) ndarray
+        Optimal transportation matrix for the given parameters
+    log : dict
+        log dictionary return only if log==True in parameters
+
+
+    References
+    ----------
+
+    .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013
+
+    .. [17] Blondel, M., Seguy, V., & Rolet, A. (2018). Smooth and Sparse Optimal Transport. Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AISTATS).
+
+    See Also
+    --------
+    ot.lp.emd : Unregularized OT
+    ot.sinhorn : Entropic regularized OT
+    ot.optim.cg : General regularized OT
+
+    """
+
+    if reg_type.lower() in ['l2', 'squaredl2']:
+        regul = SquaredL2(gamma=reg)
+    elif reg_type.lower() in ['entropic', 'negentropy', 'kl']:
+        regul = NegEntropy(gamma=reg)
+    else:
+        raise NotImplementedError('Unknown regularization')
+
+    # solve dual
+    alpha, beta, res = solve_dual(a, b, M, regul, max_iter=numItermax,
+                                  tol=stopThr, verbose=verbose)
+
+    # reconstruct transport matrix
+    G = get_plan_from_dual(alpha, beta, M, regul)
+
+    if log:
+        log = {'alpha': alpha, 'beta': beta, 'res': res}
+        return G, log
+    else:
+        return G
+
+
+def smooth_ot_semi_dual(a, b, M, reg, reg_type='l2', method="L-BFGS-B", stopThr=1e-9,
+                        numItermax=500, verbose=False, log=False):
+    r"""
+    Solve the regularized OT problem in the semi-dual and return the OT matrix
+
+    The function solves the smooth relaxed dual formulation (10) in [17]_ :
+
+    .. math::
+        \max_{\alpha}\quad a^T\alpha-OT_\Omega^*(\alpha,b)
+
+    where :
+
+    .. math::
+        OT_\Omega^*(\alpha,b)=\sum_j b_j
+
+    - :math:`\mathbf{m}_j` is the jth column of the cost matrix
+    - :math:`OT_\Omega^*(\alpha,b)` is defined in Eq. (9) in [17]
+    - a and b are source and target weights (sum to 1)
+
+    The OT matrix can is reconstructed using [17]_ Proposition 2.
+    The optimization algorithm is using gradient decent (L-BFGS by default).
+
+
+    Parameters
+    ----------
+    a : np.ndarray (ns,)
+        samples weights in the source domain
+    b : np.ndarray (nt,) or np.ndarray (nt,nbb)
+        samples in the target domain, compute sinkhorn with multiple targets
+        and fixed M if b is a matrix (return OT loss + dual variables in log)
+    M : np.ndarray (ns,nt)
+        loss matrix
+    reg : float
+        Regularization term >0
+    reg_type : str
+        Regularization type,  can be the following (default ='l2'):
+        - 'kl' : Kullback Leibler (~ Neg-entropy used in sinkhorn [2]_)
+        - 'l2' : Squared Euclidean regularization
+    method : str
+        Solver to use for scipy.optimize.minimize
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshol on error (>0)
+    verbose : bool, optional
+        Print information along iterations
+    log : bool, optional
+        record log if True
+
+
+    Returns
+    -------
+    gamma : (ns x nt) ndarray
+        Optimal transportation matrix for the given parameters
+    log : dict
+        log dictionary return only if log==True in parameters
+
+
+    References
+    ----------
+
+    .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013
+
+    .. [17] Blondel, M., Seguy, V., & Rolet, A. (2018). Smooth and Sparse Optimal Transport. Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AISTATS).
+
+    See Also
+    --------
+    ot.lp.emd : Unregularized OT
+    ot.sinhorn : Entropic regularized OT
+    ot.optim.cg : General regularized OT
+
+    """
+    if reg_type.lower() in ['l2', 'squaredl2']:
+        regul = SquaredL2(gamma=reg)
+    elif reg_type.lower() in ['entropic', 'negentropy', 'kl']:
+        regul = NegEntropy(gamma=reg)
+    else:
+        raise NotImplementedError('Unknown regularization')
+
+    # solve dual
+    alpha, res = solve_semi_dual(a, b, M, regul, max_iter=numItermax,
+                                 tol=stopThr, verbose=verbose)
+
+    # reconstruct transport matrix
+    G = get_plan_from_semi_dual(alpha, b, M, regul)
+
+    if log:
+        log = {'alpha': alpha, 'res': res}
+        return G, log
+    else:
+        return G
diff --git a/ot/utils.py b/ot/utils.py
index 7dac283..bb21b38 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -77,6 +77,34 @@ def clean_zeros(a, b, M):
     return a2, b2, M2
 
 
+def euclidean_distances(X, Y, squared=False):
+    """
+    Considering the rows of X (and Y=X) as vectors, compute the
+    distance matrix between each pair of vectors.
+    Parameters
+    ----------
+    X : {array-like}, shape (n_samples_1, n_features)
+    Y : {array-like}, shape (n_samples_2, n_features)
+    squared : boolean, optional
+        Return squared Euclidean distances.
+    Returns
+    -------
+    distances : {array}, shape (n_samples_1, n_samples_2)
+    """
+    XX = np.einsum('ij,ij->i', X, X)[:, np.newaxis]
+    YY = np.einsum('ij,ij->i', Y, Y)[np.newaxis, :]
+    distances = np.dot(X, Y.T)
+    distances *= -2
+    distances += XX
+    distances += YY
+    np.maximum(distances, 0, out=distances)
+    if X is Y:
+        # Ensure that distances between vectors and themselves are set to 0.0.
+        # This may not be the case due to floating point rounding errors.
+        distances.flat[::distances.shape[0] + 1] = 0.0
+    return distances if squared else np.sqrt(distances, out=distances)
+
+
 def dist(x1, x2=None, metric='sqeuclidean'):
     """Compute distance between samples in x1 and x2 using function scipy.spatial.distance.cdist
 
@@ -104,7 +132,8 @@ def dist(x1, x2=None, metric='sqeuclidean'):
     """
     if x2 is None:
         x2 = x1
-
+    if metric == "sqeuclidean":
+        return euclidean_distances(x1, x2, squared=True)
     return cdist(x1, x2, metric=metric)