Diffstat (limited to 'ot/gromov.py')
 -rw-r--r--  ot/gromov.py  1312
 1 file changed, 952 insertions, 360 deletions
diff --git a/ot/gromov.py b/ot/gromov.py
index 4427a96..ea667e4 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -14,63 +14,85 @@ import numpy as np
from .bregman import sinkhorn
-from .utils import dist, UndefinedParameter
+from .utils import dist, UndefinedParameter, list_to_array
from .optim import cg
+from .lp import emd_1d, emd
+from .utils import check_random_state
+from .backend import get_backend
def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
- """Return loss matrices and tensors for Gromov-Wasserstein fast computation
+ r"""Return loss matrices and tensors for Gromov-Wasserstein fast computation
- Returns the value of \mathcal{L}(C1,C2) \otimes T with the selected loss
- function as the loss function of Gromow-Wasserstein discrepancy.
+ Returns the value of :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` with the
+ selected loss function as the loss function of Gromov-Wasserstein discrepancy.
- The matrices are computed as described in Proposition 1 in [12]
+ The matrices are computed as described in Proposition 1 in :ref:`[12] <references-init-matrix>`
Where :
- * C1 : Metric cost matrix in the source space
- * C2 : Metric cost matrix in the target space
- * T : A coupling between those two spaces
-
- The square-loss function L(a,b)=|a-b|^2 is read as :
- L(a,b) = f1(a)+f2(b)-h1(a)*h2(b) with :
- * f1(a)=(a^2)
- * f2(b)=(b^2)
- * h1(a)=a
- * h2(b)=2*b
-
- The kl-loss function L(a,b)=a*log(a/b)-a+b is read as :
- L(a,b) = f1(a)+f2(b)-h1(a)*h2(b) with :
- * f1(a)=a*log(a)-a
- * f2(b)=b
- * h1(a)=a
- * h2(b)=log(b)
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{T}`: A coupling between those two spaces
+
+ The square-loss function :math:`L(a, b) = |a - b|^2` is read as :
+
+ .. math::
+
+ L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)
+
+ \mathrm{with} \ f_1(a) &= a^2
+
+ f_2(b) &= b^2
+
+ h_1(a) &= a
+
+ h_2(b) &= 2b
+
+ The kl-loss function :math:`L(a, b) = a \log\left(\frac{a}{b}\right) - a + b` is read as :
+
+ .. math::
+
+ L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)
+
+ \mathrm{with} \ f_1(a) &= a \log(a) - a
+
+ f_2(b) &= b
+
+ h_1(a) &= a
+
+ h_2(b) &= \log(b)
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- T : ndarray, shape (ns, nt)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ T : array-like, shape (ns, nt)
Coupling between source and target spaces
- p : ndarray, shape (ns,)
+ p : array-like, shape (ns,)
Returns
-------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
+
+ .. _references-init-matrix:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
if loss_fun == 'square_loss':
def f1(a):
@@ -86,7 +108,7 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
return 2 * b
elif loss_fun == 'kl_loss':
def f1(a):
- return a * np.log(a + 1e-15) - a
+ return a * nx.log(a + 1e-15) - a
def f2(b):
return b
@@ -95,12 +117,16 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
return a
def h2(b):
- return np.log(b + 1e-15)
-
- constC1 = np.dot(np.dot(f1(C1), p.reshape(-1, 1)),
- np.ones(len(q)).reshape(1, -1))
- constC2 = np.dot(np.ones(len(p)).reshape(-1, 1),
- np.dot(q.reshape(1, -1), f2(C2).T))
+ return nx.log(b + 1e-15)
+
+ constC1 = nx.dot(
+ nx.dot(f1(C1), nx.reshape(p, (-1, 1))),
+ nx.ones((1, len(q)), type_as=q)
+ )
+ constC2 = nx.dot(
+ nx.ones((len(p), 1), type_as=p),
+ nx.dot(nx.reshape(q, (1, -1)), f2(C2).T)
+ )
constC = constC1 + constC2
hC1 = h1(C1)
hC2 = h2(C2)
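For readers following Proposition 1, a quick sanity check of this decomposition (a sketch with toy data; the sizes, seed and variable names are illustrative and not part of this patch)::

    import numpy as np
    from ot.gromov import init_matrix, gwloss

    rng = np.random.RandomState(0)
    ns, nt = 4, 5
    C1 = rng.rand(ns, ns)
    C1 = (C1 + C1.T) / 2   # symmetric toy cost matrix in the source space
    C2 = rng.rand(nt, nt)
    C2 = (C2 + C2.T) / 2   # symmetric toy cost matrix in the target space
    p = np.ones(ns) / ns   # uniform weights
    q = np.ones(nt) / nt

    constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun='square_loss')

    # For any coupling T with marginals p and q, the fast evaluation
    # constC - hC1 T hC2^T contracted with T equals the quadruple sum
    # of (C1_ik - C2_jl)^2 T_ij T_kl (Proposition 1 in [12]).
    T = np.outer(p, q)
    fast = gwloss(constC, hC1, hC2, T)
    brute = sum(
        (C1[i, k] - C2[j, l]) ** 2 * T[i, j] * T[k, l]
        for i in range(ns) for j in range(nt)
        for k in range(ns) for l in range(nt)
    )
    np.testing.assert_allclose(fast, brute)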
@@ -109,61 +135,70 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
def tensor_product(constC, hC1, hC2, T):
- """Return the tensor for Gromov-Wasserstein fast computation
+ r"""Return the tensor for Gromov-Wasserstein fast computation
- The tensor is computed as described in Proposition 1 Eq. (6) in [12].
+ The tensor is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-tensor-product>`
Parameters
----------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
Returns
-------
- tens : ndarray, shape (ns, nt)
- \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
+ tens : array-like, shape (`ns`, `nt`)
+ :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` tensor-matrix multiplication result
+
+ .. _references-tensor-product:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
- A = -np.dot(hC1, T).dot(hC2.T)
+ constC, hC1, hC2, T = list_to_array(constC, hC1, hC2, T)
+ nx = get_backend(constC, hC1, hC2, T)
+
+ A = - nx.dot(
+ nx.dot(hC1, T), hC2.T
+ )
tens = constC + A
# tens -= tens.min()
return tens
def gwloss(constC, hC1, hC2, T):
- """Return the Loss for Gromov-Wasserstein
+ r"""Return the Loss for Gromov-Wasserstein
- The loss is computed as described in Proposition 1 Eq. (6) in [12].
+ The loss is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-gwloss>`
Parameters
----------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
- T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
+ T : array-like, shape (ns, nt)
+ Current value of transport matrix :math:`\mathbf{T}`
Returns
-------
loss : float
Gromov Wasserstein loss
+
+ .. _references-gwloss:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -171,33 +206,38 @@ def gwloss(constC, hC1, hC2, T):
tens = tensor_product(constC, hC1, hC2, T)
- return np.sum(tens * T)
+ tens, T = list_to_array(tens, T)
+ nx = get_backend(tens, T)
+
+ return nx.sum(tens * T)
def gwggrad(constC, hC1, hC2, T):
- """Return the gradient for Gromov-Wasserstein
+ r"""Return the gradient for Gromov-Wasserstein
- The gradient is computed as described in Proposition 2 in [12].
+ The gradient is computed as described in Proposition 2 in :ref:`[12] <references-gwggrad>`
Parameters
----------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
- T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
+ T : array-like, shape (ns, nt)
+ Current value of transport matrix :math:`\mathbf{T}`
Returns
-------
- grad : ndarray, shape (ns, nt)
+ grad : array-like, shape (`ns`, `nt`)
Gromov Wasserstein gradient
+
+ .. _references-gwggrad:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -207,89 +247,109 @@ def gwggrad(constC, hC1, hC2, T):
def update_square_loss(p, lambdas, T, Cs):
- """
- Updates C according to the L2 Loss kernel with the S Ts couplings
- calculated at each iteration
+ r"""
+ Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s`
+ couplings calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
Masses in the targeted barycenter.
lambdas : list of float
- List of the S spaces' weights.
- T : list of S np.ndarray of shape (ns,N)
- The S Ts couplings calculated at each iteration.
- Cs : list of S ndarray, shape(ns,ns)
+ List of the `S` spaces' weights.
+ T : list of S array-like of shape (ns,N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
+ Cs : list of S array-like, shape(ns,ns)
Metric cost matrices.
Returns
----------
- C : ndarray, shape (nt, nt)
- Updated C matrix.
+ C : array-like, shape (`nt`, `nt`)
+ Updated :math:`\mathbf{C}` matrix.
"""
- tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s])
- for s in range(len(T))])
- ppt = np.outer(p, p)
+ T = list_to_array(*T)
+ Cs = list_to_array(*Cs)
+ p = list_to_array(p)
+ nx = get_backend(p, *T, *Cs)
- return np.divide(tmpsum, ppt)
+ tmpsum = sum([
+ lambdas[s] * nx.dot(
+ nx.dot(T[s].T, Cs[s]),
+ T[s]
+ ) for s in range(len(T))
+ ])
+ ppt = nx.outer(p, p)
+
+ return tmpsum / ppt
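A small sanity check of this closed-form update (toy data; sizes and weights are illustrative, not part of this patch)::

    import numpy as np
    from ot.gromov import update_square_loss

    n = 4
    rng = np.random.RandomState(1)
    C = rng.rand(n, n)
    C = (C + C.T) / 2          # toy structure matrix
    p = np.ones(n) / n

    # With two identical spaces and the "diagonal" couplings diag(p), the
    # update sum_s lambda_s T_s^T C_s T_s / (p p^T) gives back C itself,
    # since each entry reads (p_i C_ij p_j) / (p_i p_j).
    T = [np.diag(p), np.diag(p)]
    C_new = update_square_loss(p, [0.5, 0.5], T, [C, C])
    np.testing.assert_allclose(C_new, C)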
def update_kl_loss(p, lambdas, T, Cs):
- """
- Updates C according to the KL Loss kernel with the S Ts couplings calculated at each iteration
+ r"""
+ Updates :math:`\mathbf{C}` according to the KL Loss kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
Weights in the targeted barycenter.
- lambdas : list of the S spaces' weights
- T : list of S np.ndarray of shape (ns,N)
- The S Ts couplings calculated at each iteration.
- Cs : list of S ndarray, shape(ns,ns)
+ lambdas : list of float
+ List of the `S` spaces' weights
+ T : list of S array-like of shape (ns,N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
+ Cs : list of S array-like, shape(ns,ns)
Metric cost matrices.
Returns
----------
- C : ndarray, shape (ns,ns)
- updated C matrix
+ C : array-like, shape (`ns`, `ns`)
+ updated :math:`\mathbf{C}` matrix
"""
- tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s])
- for s in range(len(T))])
- ppt = np.outer(p, p)
+ Cs = list_to_array(*Cs)
+ T = list_to_array(*T)
+ p = list_to_array(p)
+ nx = get_backend(p, *T, *Cs)
- return np.exp(np.divide(tmpsum, ppt))
+ tmpsum = sum([
+ lambdas[s] * nx.dot(
+ nx.dot(T[s].T, Cs[s]),
+ T[s]
+ ) for s in range(len(T))
+ ])
+ ppt = nx.outer(p, p)
+ return nx.exp(tmpsum / ppt)
-def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
- """
- Returns the gromov-wasserstein transport between (C1,p) and (C2,q)
+
+def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=False, **kwargs):
+ r"""
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
+ L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
-
max_iter : int, optional
Max number of iterations
tol : float, optional
@@ -299,22 +359,23 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
log : bool, optional
record log if True
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
+ If True, the step of the line-search is found via an Armijo line search. Else the closed form is used.
+ If there are convergence issues, use False.
**kwargs : dict
parameters can be directly passed to the ot.optim.cg solver
Returns
-------
- T : ndarray, shape (ns, nt)
- Doupling between the two spaces that minimizes:
- \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ T : array-like, shape (`ns`, `nt`)
+ Coupling between the two spaces that minimizes:
+
+ :math:`\sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}`
log : dict
Convergence information and loss.
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -323,6 +384,15 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
mathematics 11.4 (2011): 417-487.
"""
+ p, q = list_to_array(p, q)
+
+ p0, q0, C10, C20 = p, q, C1, C2
+ nx = get_backend(p0, q0, C10, C20)
+
+ p = nx.to_numpy(p)
+ q = nx.to_numpy(q)
+ C1 = nx.to_numpy(C10)
+ C2 = nx.to_numpy(C20)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -336,37 +406,45 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
if log:
res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
- log['gw_dist'] = gwloss(constC, hC1, hC2, res)
- return res, log
+ log['gw_dist'] = nx.from_numpy(gwloss(constC, hC1, hC2, res), type_as=C10)
+ log['u'] = nx.from_numpy(log['u'], type_as=C10)
+ log['v'] = nx.from_numpy(log['v'], type_as=C10)
+ return nx.from_numpy(res, type_as=C10), log
else:
- return cg(p, q, 0, 1, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+ return nx.from_numpy(cg(p, q, 0, 1, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=False, **kwargs), type_as=C10)
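A minimal usage sketch for the solver above (toy point clouds; names and sizes are illustrative, not part of this patch)::

    import numpy as np
    import ot

    rng = np.random.RandomState(2)
    xs = rng.randn(30, 2)          # source samples
    xt = rng.randn(40, 3)          # target samples, possibly in another dimension
    C1 = ot.dist(xs, xs)
    C2 = ot.dist(xt, xt)
    C1 /= C1.max()                 # normalizing the structure matrices helps the solver
    C2 /= C2.max()
    p = ot.unif(30)
    q = ot.unif(40)

    T, log = ot.gromov.gromov_wasserstein(
        C1, C2, p, q, loss_fun='square_loss', log=True)
    print(log['gw_dist'])          # value of the GW objective at the returned coupling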
-def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
- """
- Returns the gromov-wasserstein discrepancy between (C1,p) and (C2,q)
+def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=False, **kwargs):
+ r"""
+ Returns the gromov-wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ GW = \min_\mathbf{T} \quad \sum_{i,j,k,l}
+ L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity
+ matrices
+
+ Note that when using backends, this loss function is differentiable wrt the
+ matrices and weights for quadratic loss using the gradients from [38]_.
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
+ C2 : array-like, shape (nt, nt)
Metric cost matrix in the target space
- p : ndarray, shape (ns,)
+ p : array-like, shape (ns,)
Distribution in the source space.
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space.
loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
@@ -379,8 +457,8 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
log : bool, optional
record log if True
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
+ If True, the step of the line-search is found via an Armijo line search. Else the closed form is used.
+ If there are convergence issues, use False.
Returns
-------
@@ -391,7 +469,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -399,7 +477,20 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
metric approach to object matching. Foundations of computational
mathematics 11.4 (2011): 417-487.
+ .. [38] C. Vincent-Cuaz, T. Vayer, R. Flamary, M. Corneli, N. Courty, Online
+ Graph Dictionary Learning, International Conference on Machine Learning
+ (ICML), 2021.
+
"""
+ p, q = list_to_array(p, q)
+
+ p0, q0, C10, C20 = p, q, C1, C2
+ nx = get_backend(p0, q0, C10, C20)
+
+ p = nx.to_numpy(p)
+ q = nx.to_numpy(q)
+ C1 = nx.to_numpy(C10)
+ C2 = nx.to_numpy(C20)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -410,53 +501,71 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
def df(G):
return gwggrad(constC, hC1, hC2, G)
- res, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
- log_gw['gw_dist'] = gwloss(constC, hC1, hC2, res)
- log_gw['T'] = res
+
+ T, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+
+ T0 = nx.from_numpy(T, type_as=C10)
+
+ log_gw['gw_dist'] = nx.from_numpy(gwloss(constC, hC1, hC2, T), type_as=C10)
+ log_gw['u'] = nx.from_numpy(log_gw['u'], type_as=C10)
+ log_gw['v'] = nx.from_numpy(log_gw['v'], type_as=C10)
+ log_gw['T'] = T0
+
+ gw = log_gw['gw_dist']
+
+ if loss_fun == 'square_loss':
+ gC1 = nx.from_numpy(2 * C1 * (p[:, None] * p[None, :]) - 2 * T.dot(C2).dot(T.T))
+ gC2 = nx.from_numpy(2 * C2 * (q[:, None] * q[None, :]) - 2 * T.T.dot(C1).dot(T))
+ gw = nx.set_gradients(gw, (p0, q0, C10, C20),
+ (log_gw['u'], log_gw['v'], gC1, gC2))
+
if log:
- return log_gw['gw_dist'], log_gw
+ return gw, log_gw
else:
- return log_gw['gw_dist']
+ return gw
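The gradients registered in the hunk above make the returned scalar differentiable through the backend. A sketch with the torch backend (assumes torch is installed; data and names are illustrative, not part of this patch)::

    import torch
    import ot

    torch.manual_seed(0)
    xs = torch.randn(20, 2, dtype=torch.float64)
    xt = torch.randn(20, 2, dtype=torch.float64)
    C1 = torch.cdist(xs, xs)
    C2 = torch.cdist(xt, xt)
    C1 = C1 / C1.max()
    C2 = C2 / C2.max()
    C1.requires_grad_(True)
    C2.requires_grad_(True)
    p = torch.ones(20, dtype=torch.float64) / 20
    q = torch.ones(20, dtype=torch.float64) / 20

    gw = ot.gromov.gromov_wasserstein2(C1, C2, p, q)  # square loss by default
    gw.backward()                                     # gradients wrt C1 and C2 (see [38])
    print(C1.grad.shape, C2.grad.shape)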
def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
- """
- Computes the FGW transport between two graphs see [24]
+ r"""
+ Computes the FGW transport between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein>`)
.. math::
- \gamma = arg\min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
- L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \gamma = \mathop{\arg \min}_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F +
+ \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
- s.t. \gamma 1 = p
- \gamma^T 1= q
- \gamma\geq 0
+ s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p}
+
+ \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q}
+
+ \mathbf{\gamma} &\geq 0
where :
- - M is the (ns,nt) metric cost matrix
- - p and q are source and target weights (sum to 1)
- - L is a loss function to account for the misfit between the similarity matrices
- The algorithm used for solving the problem is conditional gradient as discussed in [24]_
+ - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+ - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
+ - `L` is a loss function to account for the misfit between the similarity matrices
+
+ The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein>`
Parameters
----------
- M : ndarray, shape (ns, nt)
+ M : array-like, shape (ns, nt)
Metric cost matrix between features across domains
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix representative of the structure in the source space
- C2 : ndarray, shape (nt, nt)
+ C2 : array-like, shape (nt, nt)
Metric cost matrix representative of the structure in the target space
- p : ndarray, shape (ns,)
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : str, optional
Loss function used for the solver
alpha : float, optional
Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
+ If True, the step of the line-search is found via an Armijo line search. Else the closed form is used.
+ If there are convergence issues, use False.
log : bool, optional
record log if True
**kwargs : dict
@@ -464,18 +573,30 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
Returns
-------
- gamma : ndarray, shape (ns, nt)
+ gamma : array-like, shape (`ns`, `nt`)
Optimal transportation matrix for the given parameters.
log : dict
Log dictionary return only if log==True in parameters.
+
+ .. _references-fused-gromov-wasserstein:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas "Optimal Transport for structured data with
application on graphs", International Conference on Machine Learning
(ICML). 2019.
"""
+ p, q = list_to_array(p, q)
+
+ p0, q0, C10, C20, M0 = p, q, C1, C2, M
+ nx = get_backend(p0, q0, C10, C20, M0)
+
+ p = nx.to_numpy(p)
+ q = nx.to_numpy(q)
+ C1 = nx.to_numpy(C10)
+ C2 = nx.to_numpy(C20)
+ M = nx.to_numpy(M0)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -489,69 +610,98 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
if log:
res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
- log['fgw_dist'] = log['loss'][::-1][0]
- return res, log
+
+ fgw_dist = nx.from_numpy(log['loss'][-1], type_as=C10)
+
+ log['fgw_dist'] = fgw_dist
+ log['u'] = nx.from_numpy(log['u'], type_as=C10)
+ log['v'] = nx.from_numpy(log['v'], type_as=C10)
+ return nx.from_numpy(res, type_as=C10), log
+
else:
- return cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+ return nx.from_numpy(cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs), type_as=C10)
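A minimal usage sketch for FGW between two attributed toy graphs (feature matrices and names are illustrative, not part of this patch)::

    import numpy as np
    import ot

    rng = np.random.RandomState(3)
    ys = rng.randn(15, 5)               # source node features
    yt = rng.randn(20, 5)               # target node features
    C1 = ot.dist(ys, ys)                # structure matrices (feature distances here, for illustration)
    C2 = ot.dist(yt, yt)
    M = ot.dist(ys, yt)                 # feature cost across domains
    p, q = ot.unif(15), ot.unif(20)

    T, log = ot.gromov.fused_gromov_wasserstein(
        M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, log=True)
    print(log['fgw_dist'])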
def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
- """
- Computes the FGW distance between two graphs see [24]
+ r"""
+ Computes the FGW distance between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein2>`)
.. math::
- \min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
- L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \min_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l}
+ L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+
+ s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p}
+ \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q}
- s.t. \gamma 1 = p
- \gamma^T 1= q
- \gamma\geq 0
+ \mathbf{\gamma} &\geq 0
where :
- - M is the (ns,nt) metric cost matrix
- - p and q are source and target weights (sum to 1)
- - L is a loss function to account for the misfit between the similarity matrices
- The algorithm used for solving the problem is conditional gradient as discussed in [1]_
+
+ - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+ - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
+ - `L` is a loss function to account for the misfit between the similarity matrices
+
+ The algorithm used for solving the problem is conditional gradient as
+ discussed in :ref:`[24] <references-fused-gromov-wasserstein2>`
+
+ Note that when using backends, this loss function is differentiable wrt the
+ matrices and weights for quadratic loss using the gradients from [38]_.
Parameters
----------
- M : ndarray, shape (ns, nt)
+ M : array-like, shape (ns, nt)
Metric cost matrix between features across domains
- C1 : ndarray, shape (ns, ns)
- Metric cost matrix respresentative of the structure in the source space.
- C2 : ndarray, shape (nt, nt)
- Metric cost matrix espresentative of the structure in the target space.
- p : ndarray, shape (ns,)
+ C1 : array-like, shape (ns, ns)
+ Metric cost matrix representative of the structure in the source space.
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix representative of the structure in the target space.
+ p : array-like, shape (ns,)
Distribution in the source space.
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space.
loss_fun : str, optional
Loss function used for the solver.
alpha : float, optional
Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research.
- Else closed form is used. If there is convergence issues use False.
+ If True, the step of the line-search is found via an Armijo line search.
+ Else the closed form is used. If there are convergence issues, use False.
log : bool, optional
Record log if True.
**kwargs : dict
- Parameters can be directly pased to the ot.optim.cg solver.
+ Parameters can be directly passed to the ot.optim.cg solver.
Returns
-------
- gamma : ndarray, shape (ns, nt)
- Optimal transportation matrix for the given parameters.
+ fgw-distance : float
+ Fused Gromov-Wasserstein distance for the given parameters.
log : dict
Log dictionary return only if log==True in parameters.
+
+ .. _references-fused-gromov-wasserstein2:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
+
+ .. [38] C. Vincent-Cuaz, T. Vayer, R. Flamary, M. Corneli, N. Courty, Online
+ Graph Dictionary Learning, International Conference on Machine Learning
+ (ICML), 2021.
"""
+ p, q = list_to_array(p, q)
+
+ p0, q0, C10, C20, M0 = p, q, C1, C2, M
+ nx = get_backend(p0, q0, C10, C20, M0)
+
+ p = nx.to_numpy(p)
+ q = nx.to_numpy(q)
+ C1 = nx.to_numpy(C10)
+ C2 = nx.to_numpy(C20)
+ M = nx.to_numpy(M0)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -563,50 +713,462 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5
def df(G):
return gwggrad(constC, hC1, hC2, G)
- res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+ T, log_fgw = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+
+ fgw_dist = nx.from_numpy(log_fgw['loss'][-1], type_as=C10)
+
+ T0 = nx.from_numpy(T, type_as=C10)
+
+ log_fgw['fgw_dist'] = fgw_dist
+ log_fgw['u'] = nx.from_numpy(log_fgw['u'], type_as=C10)
+ log_fgw['v'] = nx.from_numpy(log_fgw['v'], type_as=C10)
+ log_fgw['T'] = T0
+
+ if loss_fun == 'square_loss':
+ gC1 = nx.from_numpy(2 * C1 * (p[:, None] * p[None, :]) - 2 * T.dot(C2).dot(T.T))
+ gC2 = nx.from_numpy(2 * C2 * (q[:, None] * q[None, :]) - 2 * T.T.dot(C1).dot(T))
+ fgw_dist = nx.set_gradients(fgw_dist, (p0, q0, C10, C20, M0),
+ (log_fgw['u'], log_fgw['v'], alpha * gC1, alpha * gC2, (1 - alpha) * T0))
+
if log:
- log['fgw_dist'] = log['loss'][::-1][0]
- log['T'] = res
- return log['fgw_dist'], log
+ return fgw_dist, log_fgw
else:
- return log['fgw_dist']
+ return fgw_dist
-def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
- max_iter=1000, tol=1e-9, verbose=False, log=False):
+def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
+ nb_samples_p=None, nb_samples_q=None, std=True, random_state=None):
+ r"""
+ Returns an approximation of the gromov-wasserstein cost between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+ with a fixed transport plan :math:`\mathbf{T}`.
+
+ The function gives an unbiased approximation of the following equation:
+
+ .. math::
+
+ GW = \sum_{i,j,k,l} L(\mathbf{C_{1}}_{i,k}, \mathbf{C_{2}}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+
+ Where :
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - `L` : Loss function to account for the misfit between the similarity matrices
+ - :math:`\mathbf{T}`: Matrix with marginal :math:`\mathbf{p}` and :math:`\mathbf{q}`
+
+ Parameters
+ ----------
+ C1 : array-like, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
+ Distribution in the source space
+ q : array-like, shape (nt,)
+ Distribution in the target space
+ loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
+ Loss function used for the distance, the transport plan does not depend on the loss function
+ T : csr or array-like, shape (ns, nt)
+ Transport plan matrix, either a sparse csr or a dense matrix
+ nb_samples_p : int, optional
+ `nb_samples_p` is the number of samples (without replacement) along the first dimension of :math:`\mathbf{T}`
+ nb_samples_q : int, optional
+ `nb_samples_q` is the number of samples along the second dimension of :math:`\mathbf{T}`, for each sample along the first
+ std : bool, optional
+ If True, also return the standard deviation associated with the prediction of the gromov-wasserstein cost
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
+
+ Returns
+ -------
+ : float
+ Gromov-wasserstein cost
+
+ References
+ ----------
+ .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
+ "Sampled Gromov Wasserstein."
+ Machine Learning Journal (MLJ). 2021.
+
+ """
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
+
+ generator = check_random_state(random_state)
+
+ len_p = p.shape[0]
+ len_q = q.shape[0]
+
+ # It is always better to sample from the biggest distribution first.
+ if len_p < len_q:
+ p, q = q, p
+ len_p, len_q = len_q, len_p
+ C1, C2 = C2, C1
+ T = T.T
+
+ if nb_samples_p is None:
+ if nx.issparse(T):
+ # If T is sparse, it probably means that PoGroW was used, thus the number of samples is reduced
+ nb_samples_p = min(int(5 * (len_p * np.log(len_p)) ** 0.5), len_p)
+ else:
+ nb_samples_p = len_p
+ else:
+ # Sampling along the first dimension is done without replacement.
+ nb_samples_p = min(nb_samples_p, len_p)
+ if nb_samples_q is None:
+ nb_samples_q = 1
+ if std:
+ nb_samples_q = max(2, nb_samples_q)
+
+ index_k = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
+ index_l = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
+
+ index_i = generator.choice(len_p, size=nb_samples_p, p=p, replace=False)
+ index_j = generator.choice(len_p, size=nb_samples_p, p=p, replace=False)
+
+ for i in range(nb_samples_p):
+ if nx.issparse(T):
+ T_indexi = nx.reshape(nx.todense(T[index_i[i], :]), (-1,))
+ T_indexj = nx.reshape(nx.todense(T[index_j[i], :]), (-1,))
+ else:
+ T_indexi = T[index_i[i], :]
+ T_indexj = T[index_j[i], :]
+ # For each of the rows sampled, a column is sampled.
+ index_k[i] = generator.choice(
+ len_q,
+ size=nb_samples_q,
+ p=T_indexi / nx.sum(T_indexi),
+ replace=True
+ )
+ index_l[i] = generator.choice(
+ len_q,
+ size=nb_samples_q,
+ p=T_indexj / nx.sum(T_indexj),
+ replace=True
+ )
+
+ list_value_sample = nx.stack([
+ loss_fun(
+ C1[np.ix_(index_i, index_j)],
+ C2[np.ix_(index_k[:, n], index_l[:, n])]
+ ) for n in range(nb_samples_q)
+ ], axis=2)
+
+ if std:
+ std_value = nx.sum(nx.std(list_value_sample, axis=2) ** 2) ** 0.5
+ return nx.mean(list_value_sample), std_value / (nb_samples_p * nb_samples_p)
+ else:
+ return nx.mean(list_value_sample)
+
+
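A usage sketch for the estimator above (the elementwise squared loss, toy data and sample sizes are illustrative choices, not part of this patch)::

    import numpy as np
    import ot
    from ot.gromov import GW_distance_estimation

    rng = np.random.RandomState(4)
    xs, xt = rng.randn(50, 2), rng.randn(60, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    p, q = ot.unif(50), ot.unif(60)
    T = np.outer(p, q)                  # any plan with marginals p and q can be evaluated

    gw_est, gw_std = GW_distance_estimation(
        C1, C2, p, q, loss_fun=lambda a, b: (a - b) ** 2, T=T,
        nb_samples_p=20, nb_samples_q=5, std=True, random_state=0)
    print(gw_est, gw_std)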
+def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun,
+ alpha=1, max_iter=100, threshold_plan=0, log=False, verbose=False, random_state=None):
+ r"""
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a stochastic Frank-Wolfe.
+ This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times PN^2)` time complexity with `P` the number of Sinkhorn iterations.
+
+ The function solves the following optimization problem:
+
+ .. math::
+ \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
+ L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+
+ s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
+
+ \mathbf{T}^T \mathbf{1} &= \mathbf{q}
+
+ \mathbf{T} &\geq 0
+
+ Where :
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
+
+ Parameters
+ ----------
+ C1 : array-like, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
+ Distribution in the source space
+ q : array-like, shape (nt,)
+ Distribution in the target space
+ loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
+ Loss function used for the distance, the transport plan does not depend on the loss function
+ alpha : float
+ Step of the Frank-Wolfe algorithm, should be between 0 and 1
+ max_iter : int, optional
+ Max number of iterations
+ threshold_plan : float, optional
+ Threshold for deleting very small values in the transport plan. If above zero, the marginal constraints are violated.
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ If True, also return a log dictionary with the estimated distance and its standard deviation
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
+
+ Returns
+ -------
+ T : array-like, shape (`ns`, `nt`)
+ Optimal coupling between the two spaces
+
+ References
+ ----------
+ .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
+ "Sampled Gromov Wasserstein."
+ Machine Learning Journal (MLJ). 2021.
+
"""
- Returns the gromov-wasserstein transport between (C1,p) and (C2,q)
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
+
+ len_p = p.shape[0]
+ len_q = q.shape[0]
+
+ generator = check_random_state(random_state)
+
+ index = np.zeros(2, dtype=int)
- (C1,p) and (C2,q)
+ # Initialize with default marginal
+ index[0] = generator.choice(len_p, size=1, p=p)
+ index[1] = generator.choice(len_q, size=1, p=q)
+ T = nx.tocsr(emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False))
+
+ best_gw_dist_estimated = np.inf
+ for cpt in range(max_iter):
+ index[0] = generator.choice(len_p, size=1, p=p)
+ T_index0 = nx.reshape(nx.todense(T[index[0], :]), (-1,))
+ index[1] = generator.choice(len_q, size=1, p=T_index0 / T_index0.sum())
+
+ if alpha == 1:
+ T = nx.tocsr(
+ emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False)
+ )
+ else:
+ new_T = nx.tocsr(
+ emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False)
+ )
+ T = (1 - alpha) * T + alpha * new_T
+ # To limit the number of non-zero entries, the values below the threshold are set to 0.
+ T = nx.eliminate_zeros(T, threshold=threshold_plan)
+
+ if cpt % 10 == 0 or cpt == (max_iter - 1):
+ gw_dist_estimated = GW_distance_estimation(
+ C1=C1, C2=C2, loss_fun=loss_fun,
+ p=p, q=q, T=T, std=False, random_state=generator
+ )
+
+ if gw_dist_estimated < best_gw_dist_estimated:
+ best_gw_dist_estimated = gw_dist_estimated
+ best_T = nx.copy(T)
+
+ if verbose:
+ if cpt % 200 == 0:
+ print('{:5s}|{:12s}'.format('It.', 'Best gw estimated') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, best_gw_dist_estimated))
+
+ if log:
+ log = {}
+ log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(
+ C1=C1, C2=C2, loss_fun=loss_fun,
+ p=p, q=q, T=best_T, random_state=generator
+ )
+ return best_T, log
+ return best_T
+
+
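A usage sketch (the absolute-value loss and the toy data are illustrative choices, not part of this patch)::

    import numpy as np
    import ot
    from ot.gromov import pointwise_gromov_wasserstein

    rng = np.random.RandomState(5)
    xs, xt = rng.randn(30, 2), rng.randn(30, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    p, q = ot.unif(30), ot.unif(30)

    loss = lambda a, b: np.abs(a - b)   # elementwise loss applied to sampled entries
    T, log = pointwise_gromov_wasserstein(
        C1, C2, p, q, loss, max_iter=100, log=True, random_state=42)
    print(log['gw_dist_estimated'], log['gw_dist_std'])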
+def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
+ nb_samples_grad=100, epsilon=1, max_iter=500, log=False, verbose=False,
+ random_state=None):
+ r"""
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a 1-stochastic Frank-Wolfe.
+ This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times N \log(N))` time complexity by relying on the 1D Optimal Transport solver.
The function solves the following optimization problem:
.. math::
- GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
+ L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
- s.t. T 1 = p
+ s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
- T^T 1= q
+ \mathbf{T}^T \mathbf{1} &= \mathbf{q}
- T\geq 0
+ \mathbf{T} &\geq 0
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
- - H : entropy
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
+ Distribution in the target space
+ loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
+ Loss function used for the distance, the transport plan does not depend on the loss function
+ nb_samples_grad : int
+ Number of samples to approximate the gradient
+ epsilon : float
+ Weight of the Kullback-Leibler regularization
+ max_iter : int, optional
+ Max number of iterations
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ If True, also return a log dictionary with the estimated distance and its standard deviation
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
+
+ Returns
+ -------
+ T : array-like, shape (`ns`, `nt`)
+ Optimal coupling between the two spaces
+
+ References
+ ----------
+ .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
+ "Sampled Gromov Wasserstein."
+ Machine Learning Journal (MLJ). 2021.
+
+ """
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
+
+ len_p = p.shape[0]
+ len_q = q.shape[0]
+
+ generator = check_random_state(random_state)
+
+ # The most natural way to define nb_samples_grad is with a simple integer.
+ if isinstance(nb_samples_grad, int):
+ if nb_samples_grad > len_p:
+ # As the sampling along the first dimension is done without replacement, the remainder is carried over to the
+ # second dimension.
+ nb_samples_grad_p, nb_samples_grad_q = len_p, nb_samples_grad // len_p
+ else:
+ nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad, 1
+ else:
+ nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad
+ T = nx.outer(p, q)
+ # continue_loop allows stopping the loop if there are several successive small modifications of T.
+ continue_loop = 0
+
+ # The gradient of GW is more complex if the two matrices are not symmetric.
+ C_are_symmetric = nx.allclose(C1, C1.T, rtol=1e-10, atol=1e-10) and nx.allclose(C2, C2.T, rtol=1e-10, atol=1e-10)
+
+ for cpt in range(max_iter):
+ index0 = generator.choice(len_p, size=nb_samples_grad_p, p=p, replace=False)
+ Lik = 0
+ for i, index0_i in enumerate(index0):
+ index1 = generator.choice(len_q,
+ size=nb_samples_grad_q,
+ p=T[index0_i, :] / nx.sum(T[index0_i, :]),
+ replace=False)
+ # If the matrices C are not symmetric, the gradient has 2 terms, thus one of them is chosen randomly.
+ if (not C_are_symmetric) and generator.rand(1) > 0.5:
+ Lik += nx.mean(loss_fun(
+ C1[:, [index0[i]] * nb_samples_grad_q][:, None, :],
+ C2[:, index1][None, :, :]
+ ), axis=2)
+ else:
+ Lik += nx.mean(loss_fun(
+ C1[[index0[i]] * nb_samples_grad_q, :][:, :, None],
+ C2[index1, :][:, None, :]
+ ), axis=0)
+
+ max_Lik = nx.max(Lik)
+ if max_Lik == 0:
+ continue
+ # This division by the max is here to facilitate the choice of epsilon.
+ Lik /= max_Lik
+
+ if epsilon > 0:
+ # Set the log of all entries below exp(-200) to -inf, to avoid taking the log of 0.
+ log_T = nx.log(nx.clip(T, np.exp(-200), 1))
+ log_T = nx.where(log_T == -200, -np.inf, log_T)
+ Lik = Lik - epsilon * log_T
+
+ try:
+ new_T = sinkhorn(a=p, b=q, M=Lik, reg=epsilon)
+ except (RuntimeWarning, UserWarning):
+ print("Warning caught in Sinkhorn: Return last stable T")
+ break
+ else:
+ new_T = emd(a=p, b=q, M=Lik)
+
+ change_T = nx.mean((T - new_T) ** 2)
+ if change_T <= 10e-20:
+ continue_loop += 1
+ if continue_loop > 100: # Max number of small modifications of T
+ T = nx.copy(new_T)
+ break
+ else:
+ continue_loop = 0
+
+ if verbose and cpt % 10 == 0:
+ if cpt % 200 == 0:
+ print('{:5s}|{:12s}'.format('It.', '||T_n - T_{n+1}||') + '\n' + '-' * 19)
+ print('{:5d}|{:8e}|'.format(cpt, change_T))
+ T = nx.copy(new_T)
+
+ if log:
+ log = {}
+ log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(
+ C1=C1, C2=C2, loss_fun=loss_fun,
+ p=p, q=q, T=T, random_state=generator
+ )
+ return T, log
+ return T
+
+
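A usage sketch (the loss, `nb_samples_grad` and `epsilon` values are illustrative choices, not part of this patch)::

    import numpy as np
    import ot
    from ot.gromov import sampled_gromov_wasserstein

    rng = np.random.RandomState(6)
    xs, xt = rng.randn(30, 2), rng.randn(30, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    p, q = ot.unif(30), ot.unif(30)

    loss = lambda a, b: np.abs(a - b)   # elementwise loss, broadcast over sampled blocks
    T, log = sampled_gromov_wasserstein(
        C1, C2, p, q, loss, nb_samples_grad=100, epsilon=1, log=True, random_state=42)
    print(log['gw_dist_estimated'], log['gw_dist_std'])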
+def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
+ max_iter=1000, tol=1e-9, verbose=False, log=False):
+ r"""
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+
+ The function solves the following optimization problem:
+
+ .. math::
+ \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T}))
+
+ s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
+
+ \mathbf{T}^T \mathbf{1} &= \mathbf{q}
+
+ \mathbf{T} &\geq 0
+
+ Where :
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
+ - `H`: entropy
+
+ Parameters
+ ----------
+ C1 : array-like, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
+ Distribution in the source space
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : string
Loss function used for the solver either 'square_loss' or 'kl_loss'
@@ -623,21 +1185,20 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
Returns
-------
- T : ndarray, shape (ns, nt)
+ T : array-like, shape (`ns`, `nt`)
Optimal coupling between the two spaces
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
- C1 = np.asarray(C1, dtype=np.float64)
- C2 = np.asarray(C2, dtype=np.float64)
-
- T = np.outer(p, q) # Initialization
+ T = nx.outer(p, q)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -654,12 +1215,12 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
# compute the gradient
tens = gwggrad(constC, hC1, hC2, T)
- T = sinkhorn(p, q, tens, epsilon)
+ T = sinkhorn(p, q, tens, epsilon, method='sinkhorn')
if cpt % 10 == 0:
# we can speed up the process by checking for the error only all
# the 10th iterations
- err = np.linalg.norm(T - Tprev)
+ err = nx.norm(T - Tprev)
if log:
log['err'].append(err)
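A usage sketch for the entropic solver being modified in this hunk (the regularization value and the toy data are illustrative choices, not part of this patch)::

    import numpy as np
    import ot

    rng = np.random.RandomState(7)
    xs, xt = rng.randn(25, 2), rng.randn(25, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1 /= C1.max()
    C2 /= C2.max()
    p, q = ot.unif(25), ot.unif(25)

    T = ot.gromov.entropic_gromov_wasserstein(
        C1, C2, p, q, 'square_loss', epsilon=5e-2, max_iter=1000, tol=1e-9)
    print(T.sum(axis=1))   # row marginals should be close to p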
@@ -681,33 +1242,33 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
max_iter=1000, tol=1e-9, verbose=False, log=False):
- """
- Returns the entropic gromov-wasserstein discrepancy between the two measured similarity matrices
-
- (C1,p) and (C2,q)
+ r"""
+ Returns the entropic gromov-wasserstein discrepancy between the two measured similarity matrices :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l})
+ \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T}))
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
- - H : entropy
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
+ - `H`: entropy
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : str
Loss function used for the solver either 'square_loss' or 'kl_loss'
@@ -729,7 +1290,7 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -746,76 +1307,79 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
- max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None):
- """
- Returns the gromov-wasserstein barycenters of S measured similarity matrices
-
- (Cs)_{s=1}^{s=S}
+ max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None):
+ r"""
+ Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
The function solves the following optimization problem:
.. math::
- C = argmin_{C\in R^{NxN}} \sum_s \lambda_s GW(C,C_s,p,p_s)
+ \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
Where :
- - :math:`C_s` : metric cost matrix
- - :math:`p_s` : distribution
+ - :math:`\mathbf{C}_s`: metric cost matrix
+ - :math:`\mathbf{p}_s`: distribution
Parameters
----------
N : int
Size of the targeted barycenter
- Cs : list of S np.ndarray of shape (ns,ns)
+ Cs : list of S array-like of shape (ns,ns)
Metric cost matrices
- ps : list of S np.ndarray of shape (ns,)
- Sample weights in the S spaces
- p : ndarray, shape(N,)
+ ps : list of S array-like of shape (ns,)
+ Sample weights in the `S` spaces
+ p : array-like, shape(N,)
Weights in the targeted barycenter
lambdas : list of float
- List of the S spaces' weights.
+ List of the `S` spaces' weights.
loss_fun : callable
Tensor-matrix multiplication function based on specific loss function.
update : callable
- function(p,lambdas,T,Cs) that updates C according to a specific Kernel
- with the S Ts couplings calculated at each iteration
+ function(:math:`\mathbf{p}`, lambdas, :math:`\mathbf{T}`, :math:`\mathbf{Cs}`) that updates
+ :math:`\mathbf{C}` according to a specific Kernel with the `S` :math:`\mathbf{T}_s` couplings
+ calculated at each iteration
epsilon : float
Regularization term >0
max_iter : int, optional
Max number of iterations
tol : float, optional
- Stop threshol on error (>0)
+ Stop threshold on error (>0)
verbose : bool, optional
Print information along iterations.
log : bool, optional
Record log if True.
- init_C : bool | ndarray, shape (N, N)
- Random initial value for the C matrix provided by user.
+ init_C : bool | array-like, shape (N, N)
+ Random initial value for the :math:`\mathbf{C}` matrix provided by user.
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
Returns
-------
- C : ndarray, shape (N, N)
+ C : array-like, shape (`N`, `N`)
Similarity matrix in the barycenter space (permutated arbitrarily)
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
+ Cs = list_to_array(*Cs)
+ ps = list_to_array(*ps)
+ p = list_to_array(p)
+ nx = get_backend(*Cs, *ps, p)
S = len(Cs)
- Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
- lambdas = np.asarray(lambdas, dtype=np.float64)
-
# Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
- # XXX use random state
- xalea = np.random.randn(N, 2)
+ generator = check_random_state(random_state)
+ xalea = generator.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
+ C = nx.from_numpy(C, type_as=p)
else:
C = init_C
@@ -828,7 +1392,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
Cprev = C
T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon,
- max_iter, 1e-5, verbose, log) for s in range(S)]
+ max_iter, 1e-4, verbose, log) for s in range(S)]
if loss_fun == 'square_loss':
C = update_square_loss(p, lambdas, T, Cs)
@@ -838,7 +1402,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
if cpt % 10 == 0:
# we can speed up the process by checking for the error only all
# the 10th iterations
- err = np.linalg.norm(C - Cprev)
+ err = nx.norm(C - Cprev)
error.append(err)
if log:
@@ -856,72 +1420,78 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
- max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None):
- """
- Returns the gromov-wasserstein barycenters of S measured similarity matrices
-
- (Cs)_{s=1}^{s=S}
+ max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None):
+ r"""
+ Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
- The function solves the following optimization problem with block
- coordinate descent:
+ The function solves the following optimization problem with block coordinate descent:
.. math::
- C = argmin_C\in R^NxN \sum_s \lambda_s GW(C,Cs,p,ps)
+
+ \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
Where :
- - Cs : metric cost matrix
- - ps : distribution
+ - :math:`\mathbf{C}_s`: metric cost matrix
+ - :math:`\mathbf{p}_s`: distribution
Parameters
----------
N : int
Size of the targeted barycenter
- Cs : list of S np.ndarray of shape (ns, ns)
+ Cs : list of S array-like of shape (ns, ns)
Metric cost matrices
- ps : list of S np.ndarray of shape (ns,)
- Sample weights in the S spaces
- p : ndarray, shape (N,)
+ ps : list of S array-like of shape (ns,)
+ Sample weights in the `S` spaces
+ p : array-like, shape (N,)
Weights in the targeted barycenter
lambdas : list of float
- List of the S spaces' weights
- loss_fun : tensor-matrix multiplication function based on specific loss function
- update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
- with the S Ts couplings calculated at each iteration
+ List of the `S` spaces' weights
+ loss_fun : callable
+ tensor-matrix multiplication function based on specific loss function
+ update : callable
+ function(:math:`\mathbf{p}`, lambdas, :math:`\mathbf{T}`, :math:`\mathbf{Cs}`) that updates
+ :math:`\mathbf{C}` according to a specific Kernel with the `S` :math:`\mathbf{T}_s` couplings
+ calculated at each iteration
max_iter : int, optional
Max number of iterations
tol : float, optional
- Stop threshol on error (>0).
+ Stop threshold on error (>0).
verbose : bool, optional
Print information along iterations.
log : bool, optional
Record log if True.
- init_C : bool | ndarray, shape(N,N)
- Random initial value for the C matrix provided by user.
+ init_C : bool | array-like, shape(N,N)
+ Random initial value for the :math:`\mathbf{C}` matrix provided by user.
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
Returns
-------
- C : ndarray, shape (N, N)
+ C : array-like, shape (`N`, `N`)
Similarity matrix in the barycenter space (permutated arbitrarily)
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
- S = len(Cs)
+ Cs = list_to_array(*Cs)
+ ps = list_to_array(*ps)
+ p = list_to_array(p)
+ nx = get_backend(*Cs, *ps, p)
- Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
- lambdas = np.asarray(lambdas, dtype=np.float64)
+ S = len(Cs)
# Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
- # XXX : should use a random state and not use the global seed
- xalea = np.random.randn(N, 2)
+ generator = check_random_state(random_state)
+ xalea = generator.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
+ C = nx.from_numpy(C, type_as=p)
else:
C = init_C
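A usage sketch for this barycenter solver (two toy input spaces; sizes, weights and the seed are illustrative, not part of this patch)::

    import numpy as np
    import ot

    rng = np.random.RandomState(8)
    x1, x2 = rng.randn(20, 2), rng.randn(30, 2)
    Cs = [ot.dist(x1, x1), ot.dist(x2, x2)]
    Cs = [C / C.max() for C in Cs]
    ps = [ot.unif(20), ot.unif(30)]
    N = 15
    p = ot.unif(N)

    C_bary = ot.gromov.gromov_barycenters(
        N, Cs, ps, p, lambdas=[0.5, 0.5], loss_fun='square_loss',
        max_iter=100, tol=1e-5, random_state=0)
    print(C_bary.shape)    # (N, N), defined up to a permutation of the barycenter points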
@@ -944,7 +1514,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
if cpt % 10 == 0:
# we can speed up the process by checking for the error only all
# the 10th iterations
- err = np.linalg.norm(C - Cprev)
+ err = nx.norm(C - Cprev)
error.append(err)
if log:
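A minimal usage sketch for the signature documented above (not part of the patch; it assumes the usual ot.dist and ot.unif helpers, and the toy data and parameter values are illustrative only):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    sizes = [10, 15]
    Cs = []
    for n in sizes:
        x = rng.randn(n, 2)            # random 2-D point cloud
        C = ot.dist(x, x)              # intra-space distance matrix
        Cs.append(C / C.max())
    ps = [ot.unif(n) for n in sizes]   # uniform weights in each space
    N, p, lambdas = 8, ot.unif(8), [0.5, 0.5]

    # barycenter similarity matrix, reproducible via the new random_state argument
    C_bary = ot.gromov.gromov_barycenters(
        N, Cs, ps, p, lambdas, 'square_loss',
        max_iter=100, tol=1e-9, random_state=42)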
@@ -963,21 +1533,21 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False,
p=None, loss_fun='square_loss', max_iter=100, tol=1e-9,
- verbose=False, log=False, init_C=None, init_X=None):
- """Compute the fgw barycenter as presented eq (5) in [24].
+ verbose=False, log=False, init_C=None, init_X=None, random_state=None):
+ r"""Compute the fgw barycenter as presented eq (5) in :ref:`[24] <references-fgw-barycenters>`
Parameters
----------
- N : integer
+ N : int
Desired number of samples of the target barycenter
- Ys: list of ndarray, each element has shape (ns,d)
+ Ys: list of array-like, each element has shape (ns,d)
Features of all samples
- Cs : list of ndarray, each element has shape (ns,ns)
+ Cs : list of array-like, each element has shape (ns,ns)
Structure matrices of all samples
- ps : list of ndarray, each element has shape (ns,)
+ ps : list of array-like, each element has shape (ns,)
Masses of all samples.
lambdas : list of float
- List of the S spaces' weights
+ List of the `S` spaces' weights
alpha : float
Alpha parameter for the fgw distance
fixed_structure : bool
@@ -989,46 +1559,51 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
max_iter : int, optional
Max number of iterations
tol : float, optional
- Stop threshol on error (>0).
+ Stop threshold on error (>0).
verbose : bool, optional
Print information along iterations.
log : bool, optional
Record log if True.
- init_C : ndarray, shape (N,N), optional
+ init_C : array-like, shape (N,N), optional
Initialization for the barycenters' structure matrix. If not set
a random init is used.
- init_X : ndarray, shape (N,d), optional
+ init_X : array-like, shape (N,d), optional
Initialization for the barycenters' features. If not set a
random init is used.
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
Returns
-------
- X : ndarray, shape (N, d)
+ X : array-like, shape (`N`, `d`)
Barycenters' features
- C : ndarray, shape (N, N)
+ C : array-like, shape (`N`, `N`)
Barycenters' structure matrix
- log_: dict
+ log : dict
Only returned when log=True. It contains the keys:
- T : list of (N,ns) transport matrices
- Ms : all distance matrices between the feature of the barycenter and the
- other features dist(X,Ys) shape (N,ns)
+ - :math:`\mathbf{T}`: list of (`N`, `ns`) transport matrices
+ - :math:`(\mathbf{M}_s)_s`: all distance matrices between the features of the barycenter and the other features :math:`(dist(\mathbf{X}, \mathbf{Y}_s))_s` of shape (`N`, `ns`)
+
+
+ .. _references-fgw-barycenters:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
+ Cs = list_to_array(*Cs)
+ ps = list_to_array(*ps)
+ Ys = list_to_array(*Ys)
+ p = list_to_array(p)
+ nx = get_backend(*Cs, *Ys, *ps)
+
S = len(Cs)
d = Ys[0].shape[1] # dimension of the node features
if p is None:
- p = np.ones(N) / N
-
- Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
- Ys = [np.asarray(Ys[s], dtype=np.float64) for s in range(S)]
-
- lambdas = np.asarray(lambdas, dtype=np.float64)
+ p = nx.ones(N, type_as=Cs[0]) / N
if fixed_structure:
if init_C is None:
@@ -1037,8 +1612,10 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
C = init_C
else:
if init_C is None:
- xalea = np.random.randn(N, 2)
+ generator = check_random_state(random_state)
+ xalea = generator.randn(N, 2)
C = dist(xalea, xalea)
+ C = nx.from_numpy(C, type_as=ps[0])
else:
C = init_C
@@ -1049,13 +1626,13 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
X = init_X
else:
if init_X is None:
- X = np.zeros((N, d))
+ X = nx.zeros((N, d), type_as=ps[0])
else:
X = init_X
- T = [np.outer(p, q) for q in ps]
+ T = [nx.outer(p, q) for q in ps]
- Ms = [np.asarray(dist(X, Ys[s]), dtype=np.float64) for s in range(len(Ys))] # Ms is N,ns
+ Ms = [dist(X, Ys[s]) for s in range(len(Ys))]
cpt = 0
err_feature = 1
@@ -1075,20 +1652,19 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
Ys_temp = [y.T for y in Ys]
X = update_feature_matrix(lambdas, Ys_temp, T, p).T
- Ms = [np.asarray(dist(X, Ys[s]), dtype=np.float64) for s in range(len(Ys))]
+ Ms = [dist(X, Ys[s]) for s in range(len(Ys))]
if not fixed_structure:
if loss_fun == 'square_loss':
T_temp = [t.T for t in T]
- C = update_sructure_matrix(p, lambdas, T_temp, Cs)
+ C = update_structure_matrix(p, lambdas, T_temp, Cs)
T = [fused_gromov_wasserstein(Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
# T is N,ns
- err_feature = np.linalg.norm(X - Xprev.reshape(N, d))
- err_structure = np.linalg.norm(C - Cprev)
-
+ err_feature = nx.norm(X - nx.reshape(Xprev, (N, d)))
+ err_structure = nx.norm(C - Cprev)
if log:
log_['err_feature'].append(err_feature)
log_['err_structure'].append(err_structure)
@@ -1114,64 +1690,80 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
return X, C
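A minimal usage sketch for the keyword arguments documented above (not part of the patch; it assumes the usual ot.dist and ot.unif helpers, and the toy attributed graphs are illustrative only):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    sizes = [12, 18]
    Ys = [rng.randn(n, 3) for n in sizes]   # node features, shape (ns, d)
    Cs = []
    for n in sizes:
        x = rng.randn(n, 2)
        C = ot.dist(x, x)
        Cs.append(C / C.max())               # structure matrices, shape (ns, ns)
    ps = [ot.unif(n) for n in sizes]
    lambdas = [0.5, 0.5]

    # barycenter features X (N, d) and structure matrix C (N, N)
    X, C = ot.gromov.fgw_barycenters(
        10, Ys, Cs, ps, lambdas, alpha=0.5,
        max_iter=100, tol=1e-9, random_state=0)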
-def update_sructure_matrix(p, lambdas, T, Cs):
- """Updates C according to the L2 Loss kernel with the S Ts couplings.
+def update_structure_matrix(p, lambdas, T, Cs):
+ r"""Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s` couplings.
It is calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
Masses in the targeted barycenter.
lambdas : list of float
- List of the S spaces' weights.
- T : list of S ndarray of shape (ns, N)
- The S Ts couplings calculated at each iteration.
- Cs : list of S ndarray, shape (ns, ns)
- Metric cost matrices.
+ List of the `S` spaces' weights.
+ T : list of S array-like of shape (ns, N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
+ Cs : list of S array-like, shape (ns, ns)
+ Metric cost matrices.
Returns
-------
- C : ndarray, shape (nt, nt)
- Updated C matrix.
+ C : array-like, shape (`nt`, `nt`)
+ Updated :math:`\mathbf{C}` matrix.
"""
- tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s]) for s in range(len(T))])
- ppt = np.outer(p, p)
+ p = list_to_array(p)
+ T = list_to_array(*T)
+ Cs = list_to_array(*Cs)
+ nx = get_backend(*Cs, *T, p)
- return np.divide(tmpsum, ppt)
+ tmpsum = sum([
+ lambdas[s] * nx.dot(
+ nx.dot(T[s].T, Cs[s]),
+ T[s]
+ ) for s in range(len(T))
+ ])
+ ppt = nx.outer(p, p)
+ return tmpsum / ppt
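For the square loss, the update above has the closed form :math:`\mathbf{C} \leftarrow \left(\sum_s \lambda_s \mathbf{T}_s^\top \mathbf{C}_s \mathbf{T}_s\right) \oslash (\mathbf{p} \mathbf{p}^\top)` (elementwise division). A NumPy-only sketch of that same computation, independent of the backend layer (the function name is illustrative):

    import numpy as np

    def structure_update_numpy(p, lambdas, T, Cs):
        # C = (sum_s lambda_s * T_s^T C_s T_s) / (p p^T), elementwise division,
        # with each T_s of shape (ns, N) and each C_s of shape (ns, ns).
        tmpsum = sum(lambdas[s] * T[s].T @ Cs[s] @ T[s] for s in range(len(T)))
        return tmpsum / np.outer(p, p)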
def update_feature_matrix(lambdas, Ys, Ts, p):
- """Updates the feature with respect to the S Ts couplings.
+ r"""Updates the feature with respect to the `S` :math:`\mathbf{T}_s` couplings.
See "Solving the barycenter problem with Block Coordinate Descent (BCD)"
- in [24] calculated at each iteration
+ in :ref:`[24] <references-update-feature-matrix>` calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
masses in the targeted barycenter
lambdas : list of float
- List of the S spaces' weights
- Ts : list of S np.ndarray(ns,N)
- the S Ts couplings calculated at each iteration
- Ys : list of S ndarray, shape(d,ns)
+ List of the `S` spaces' weights
+ Ts : list of S array-like, shape (ns,N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration
+ Ys : list of S array-like, shape (d,ns)
The features.
Returns
-------
- X : ndarray, shape (d, N)
+ X : array-like, shape (`d`, `N`)
+
+ .. _references-update-feature-matrix:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
- and Courty Nicolas
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
- p = np.array(1. / p).reshape(-1,)
-
- tmpsum = sum([lambdas[s] * np.dot(Ys[s], Ts[s].T) * p[None, :] for s in range(len(Ts))])
-
+ p = list_to_array(p)
+ Ts = list_to_array(*Ts)
+ Ys = list_to_array(*Ys)
+ nx = get_backend(*Ys, *Ts, p)
+
+ p = 1. / p
+ tmpsum = sum([
+ lambdas[s] * nx.dot(Ys[s], Ts[s].T) * p[None, :]
+ for s in range(len(Ts))
+ ])
return tmpsum
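The feature update above computes :math:`\mathbf{X} = \sum_s \lambda_s \mathbf{Y}_s \mathbf{T}_s^\top \mathrm{diag}(1 / \mathbf{p})`. A NumPy-only sketch of the same computation (the function name is illustrative):

    import numpy as np

    def feature_update_numpy(lambdas, Ys, Ts, p):
        # X = sum_s lambda_s * Y_s T_s^T diag(1 / p), with each Y_s of shape
        # (d, ns) and each T_s of shape (ns, N); the result has shape (d, N).
        inv_p = 1.0 / p
        return sum(lambdas[s] * (Ys[s] @ Ts[s].T) * inv_p[None, :]
                   for s in range(len(Ts)))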