author    ncassereau-idris <84033440+ncassereau-idris@users.noreply.github.com>  2021-11-02 13:42:02 +0100
committer GitHub <noreply@github.com>  2021-11-02 13:42:02 +0100
commit    a335324d008e8982be61d7ace937815a2bfa98f9 (patch)
tree      83c7f637597f10f6f3d20b15532e53fc65b51f22
parent    0cb2b2efe901ed74c614046d250518769f870313 (diff)
[MRG] Backend for gromov (#294)
* bregman: small correction
* gromov backend first draft
* Removing decorators
* Reworked casting method
* Bug solve
* Removing casting
* Bug solve
* toarray renamed todense ; expand_dims removed
* Warning (jax not supporting sparse matrix) moved
* Mistake corrected
* test backend
* Sparsity test for older versions of pytorch
* Trying pytorch/1.10
* Attempt to correct torch sparse bug
* Backend version of gromov tests
* Random state introduced for remaining gromov functions
* review changes
* code coverage
* Docs (first draft, to be continued)
* Gromov docs
* Prettified docs
* mistake corrected in the docs
* little change

Co-authored-by: Rémi Flamary <remi.flamary@gmail.com>
-rw-r--r--  ot/backend.py          214
-rw-r--r--  ot/bregman.py          184
-rw-r--r--  ot/gromov.py          1220
-rw-r--r--  ot/lp/__init__.py       58
-rw-r--r--  ot/optim.py             22
-rw-r--r--  test/test_backend.py    56
-rw-r--r--  test/test_bregman.py     4
-rw-r--r--  test/test_gromov.py    297
8 files changed, 1289 insertions, 766 deletions
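
For orientation, a minimal sketch of how the sparse helpers introduced below in ot/backend.py are meant to be used. It is not part of the patch and only assumes the NumpyBackend class can be instantiated directly, following the method signatures added in this diff:

    # Illustrative only: exercising the new sparse helpers through the NumPy backend
    import numpy as np
    from ot.backend import NumpyBackend

    nx = NumpyBackend()
    data = np.array([1.0, 5e-7, 2.0])
    rows = np.array([0, 1, 2])
    cols = np.array([2, 0, 1])

    S = nx.coo_matrix(data, rows, cols, shape=(3, 3))   # scipy.sparse COO matrix
    assert nx.issparse(S)
    S = nx.eliminate_zeros(S, threshold=1e-5)           # drop near-zero entries
    D = nx.todense(S)                                    # back to a dense ndarray
    assert nx.allclose(D.sum(), 3.0)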
diff --git a/ot/backend.py b/ot/backend.py
index 876b96a..358297c 100644
--- a/ot/backend.py
+++ b/ot/backend.py
@@ -26,6 +26,7 @@ Examples
import numpy as np
import scipy.special as scipy
+from scipy.sparse import issparse, coo_matrix, csr_matrix
try:
import torch
@@ -539,6 +540,86 @@ class Backend():
"""
raise NotImplementedError()
+ def coo_matrix(self, data, rows, cols, shape=None, type_as=None):
+ r"""
+ Creates a sparse tensor in COOrdinate format.
+
+ This function follows the api from :any:`scipy.sparse.coo_matrix`
+
+ See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html
+ """
+ raise NotImplementedError()
+
+ def issparse(self, a):
+ r"""
+ Checks whether or not the input tensor is a sparse tensor.
+
+ This function follows the api from :any:`scipy.sparse.issparse`
+
+ See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.issparse.html
+ """
+ raise NotImplementedError()
+
+ def tocsr(self, a):
+ r"""
+ Converts this matrix to Compressed Sparse Row format.
+
+ This function follows the api from :any:`scipy.sparse.coo_matrix.tocsr`
+
+ See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.tocsr.html
+ """
+ raise NotImplementedError()
+
+ def eliminate_zeros(self, a, threshold=0.):
+ r"""
+ Removes entries smaller than the given threshold from the sparse tensor.
+
+ This function follows the api from :any:`scipy.sparse.csr_matrix.eliminate_zeros`
+
+ See: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.eliminate_zeros.html
+ """
+ raise NotImplementedError()
+
+ def todense(self, a):
+ r"""
+ Converts a sparse tensor to a dense tensor.
+
+ This function follows the api from :any:`scipy.sparse.csr_matrix.toarray`
+
+ See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.toarray.html
+ """
+ raise NotImplementedError()
+
+ def where(self, condition, x, y):
+ r"""
+ Returns elements chosen from x or y depending on condition.
+
+ This function follows the api from :any:`numpy.where`
+
+ See: https://numpy.org/doc/stable/reference/generated/numpy.where.html
+ """
+ raise NotImplementedError()
+
+ def copy(self, a):
+ r"""
+ Returns a copy of the given tensor.
+
+ This function follows the api from :any:`numpy.copy`
+
+ See: https://numpy.org/doc/stable/reference/generated/numpy.copy.html
+ """
+ raise NotImplementedError()
+
+ def allclose(self, a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
+ r"""
+ Returns True if two arrays are element-wise equal within a tolerance.
+
+ This function follows the api from :any:`numpy.allclose`
+
+ See: https://numpy.org/doc/stable/reference/generated/numpy.allclose.html
+ """
+ raise NotImplementedError()
+
class NumpyBackend(Backend):
"""
@@ -712,6 +793,46 @@ class NumpyBackend(Backend):
def reshape(self, a, shape):
return np.reshape(a, shape)
+ def coo_matrix(self, data, rows, cols, shape=None, type_as=None):
+ if type_as is None:
+ return coo_matrix((data, (rows, cols)), shape=shape)
+ else:
+ return coo_matrix((data, (rows, cols)), shape=shape, dtype=type_as.dtype)
+
+ def issparse(self, a):
+ return issparse(a)
+
+ def tocsr(self, a):
+ if self.issparse(a):
+ return a.tocsr()
+ else:
+ return csr_matrix(a)
+
+ def eliminate_zeros(self, a, threshold=0.):
+ if threshold > 0:
+ if self.issparse(a):
+ a.data[self.abs(a.data) <= threshold] = 0
+ else:
+ a[self.abs(a) <= threshold] = 0
+ if self.issparse(a):
+ a.eliminate_zeros()
+ return a
+
+ def todense(self, a):
+ if self.issparse(a):
+ return a.toarray()
+ else:
+ return a
+
+ def where(self, condition, x, y):
+ return np.where(condition, x, y)
+
+ def copy(self, a):
+ return a.copy()
+
+ def allclose(self, a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
+ return np.allclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)
+
class JaxBackend(Backend):
"""
@@ -889,6 +1010,48 @@ class JaxBackend(Backend):
def reshape(self, a, shape):
return jnp.reshape(a, shape)
+ def coo_matrix(self, data, rows, cols, shape=None, type_as=None):
+ # Currently, JAX does not support sparse matrices
+ data = self.to_numpy(data)
+ rows = self.to_numpy(rows)
+ cols = self.to_numpy(cols)
+ nx = NumpyBackend()
+ coo_matrix = nx.coo_matrix(data, rows, cols, shape=shape, type_as=type_as)
+ matrix = nx.todense(coo_matrix)
+ return self.from_numpy(matrix)
+
+ def issparse(self, a):
+ # Currently, JAX does not support sparse matrices
+ return False
+
+ def tocsr(self, a):
+ # Currently, JAX does not support sparse matrices
+ return a
+
+ def eliminate_zeros(self, a, threshold=0.):
+ # Currently, JAX does not support sparse matrices
+ if threshold > 0:
+ return self.where(
+ self.abs(a) <= threshold,
+ self.zeros((1,), type_as=a),
+ a
+ )
+ return a
+
+ def todense(self, a):
+ # Currently, JAX does not support sparse matrices
+ return a
+
+ def where(self, condition, x, y):
+ return jnp.where(condition, x, y)
+
+ def copy(self, a):
+ # No need to copy, JAX arrays are immutable
+ return a
+
+ def allclose(self, a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
+ return jnp.allclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)
+
class TorchBackend(Backend):
"""
@@ -999,7 +1162,7 @@ class TorchBackend(Backend):
a = torch.tensor([float(a)], dtype=b.dtype, device=b.device)
if isinstance(b, int) or isinstance(b, float):
b = torch.tensor([float(b)], dtype=a.dtype, device=a.device)
- if torch.__version__ >= '1.7.0':
+ if hasattr(torch, "maximum"):
return torch.maximum(a, b)
else:
return torch.max(torch.stack(torch.broadcast_tensors(a, b)), axis=0)[0]
@@ -1009,7 +1172,7 @@ class TorchBackend(Backend):
a = torch.tensor([float(a)], dtype=b.dtype, device=b.device)
if isinstance(b, int) or isinstance(b, float):
b = torch.tensor([float(b)], dtype=a.dtype, device=a.device)
- if torch.__version__ >= '1.7.0':
+ if hasattr(torch, "minimum"):
return torch.minimum(a, b)
else:
return torch.min(torch.stack(torch.broadcast_tensors(a, b)), axis=0)[0]
@@ -1129,3 +1292,50 @@ class TorchBackend(Backend):
def reshape(self, a, shape):
return torch.reshape(a, shape)
+
+ def coo_matrix(self, data, rows, cols, shape=None, type_as=None):
+ if type_as is None:
+ return torch.sparse_coo_tensor(torch.stack([rows, cols]), data, size=shape)
+ else:
+ return torch.sparse_coo_tensor(
+ torch.stack([rows, cols]), data, size=shape,
+ dtype=type_as.dtype, device=type_as.device
+ )
+
+ def issparse(self, a):
+ return getattr(a, "is_sparse", False) or getattr(a, "is_sparse_csr", False)
+
+ def tocsr(self, a):
+ # Versions older than 1.9 do not support CSR tensors. PyTorch 1.9 and 1.10 offer a very limited support
+ return self.todense(a)
+
+ def eliminate_zeros(self, a, threshold=0.):
+ if self.issparse(a):
+ if threshold > 0:
+ mask = self.abs(a) <= threshold
+ mask = ~mask
+ mask = mask.nonzero()
+ else:
+ mask = a._values().nonzero()
+ nv = a._values().index_select(0, mask.view(-1))
+ ni = a._indices().index_select(1, mask.view(-1))
+ return self.coo_matrix(nv, ni[0], ni[1], shape=a.shape, type_as=a)
+ else:
+ if threshold > 0:
+ a[self.abs(a) <= threshold] = 0
+ return a
+
+ def todense(self, a):
+ if self.issparse(a):
+ return a.to_dense()
+ else:
+ return a
+
+ def where(self, condition, x, y):
+ return torch.where(condition, x, y)
+
+ def copy(self, a):
+ return torch.clone(a)
+
+ def allclose(self, a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
+ return torch.allclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)
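
A note on the two one-line changes to maximum/minimum above: comparing torch.__version__ as a string misorders releases once two-digit minor versions appear (e.g. 1.10), which is why the check becomes a feature test. A tiny illustration, not part of the patch:

    # Lexicographic string comparison gets PyTorch 1.10 wrong:
    assert not ('1.10.0' >= '1.7.0')   # '1' < '7' at the third character
    # hasattr(torch, "maximum") tests for the function itself rather than the version.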
diff --git a/ot/bregman.py b/ot/bregman.py
index 2aa76ff..0499b8e 100644
--- a/ot/bregman.py
+++ b/ot/bregman.py
@@ -32,13 +32,14 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
+
+ \gamma &\geq 0
- \gamma\geq 0
where :
- :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix
@@ -167,13 +168,14 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000,
The function solves the following optimization problem:
.. math::
- W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ W = \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma)
+
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- s.t. \ \gamma 1 = a
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma^T 1= b
+ \gamma &\geq 0
- \gamma\geq 0
where :
- :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix
@@ -323,13 +325,13 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma)
- s.t. \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix
@@ -489,13 +491,13 @@ def sinkhorn_log(a, b, M, reg, numItermax=1000,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma)
- s.t. \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix
@@ -550,8 +552,7 @@ def sinkhorn_log(a, b, M, reg, numItermax=1000,
References
----------
- .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal
- Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013
+ .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013
.. [34] Feydy, J., Séjourné, T., Vialard, F. X., Amari, S. I., Trouvé, A., & Peyré, G. (2019, April). Interpolating between optimal transport and MMD using Sinkhorn divergences. In The 22nd International Conference on Artificial Intelligence and Statistics (pp. 2681-2690). PMLR.
@@ -675,13 +676,13 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix
@@ -820,13 +821,13 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix
@@ -965,7 +966,7 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9,
# remove numerical problems and store them in K
if nx.max(nx.abs(u)) > tau or nx.max(nx.abs(v)) > tau:
if n_hists:
- alpha, beta = alpha + reg * nx.max(nx.log(u), 1), beta + reg * nx.max(np.log(v))
+ alpha, beta = alpha + reg * nx.max(nx.log(u), 1), beta + reg * nx.max(nx.log(v))
else:
alpha, beta = alpha + reg * nx.log(u), beta + reg * nx.log(v)
if n_hists:
@@ -1055,13 +1056,13 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg}\cdot\Omega(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`dim_a`, `dim_b`) metric cost matrix
@@ -1245,12 +1246,12 @@ def projC(gamma, q):
def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000,
stopThr=1e-4, verbose=False, log=False, **kwargs):
- r"""Compute the entropic regularized wasserstein barycenter of distributions A
+ r"""Compute the entropic regularized wasserstein barycenter of distributions :math:`\mathbf{A}`
The function solves the following optimization problem:
.. math::
- \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
+ \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
where :
@@ -1263,7 +1264,7 @@ def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000,
Parameters
----------
A : array-like, shape (dim, n_hists)
- `n_hists` training distributions :math:`a_i` of size `dim`
+ `n_hists` training distributions :math:`\mathbf{a}_i` of size `dim`
M : array-like, shape (dim, dim)
loss matrix for OT
reg : float
@@ -1271,7 +1272,7 @@ def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000,
method : str (optional)
method used for the solver either 'sinkhorn' or 'sinkhorn_stabilized'
weights : array-like, shape (n_hists,)
- Weights of each histogram :math:`a_i` on the simplex (barycentric coodinates)
+ Weights of each histogram :math:`\mathbf{a}_i` on the simplex (barycentric coordinates)
numItermax : int, optional
Max number of iterations
stopThr : float, optional
@@ -1314,12 +1315,12 @@ def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000,
def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000,
stopThr=1e-4, verbose=False, log=False):
- r"""Compute the entropic regularized wasserstein barycenter of distributions A
+ r"""Compute the entropic regularized wasserstein barycenter of distributions :math:`\mathbf{A}`
The function solves the following optimization problem:
.. math::
- \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
+ \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
where :
@@ -1332,13 +1333,13 @@ def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000,
Parameters
----------
A : array-like, shape (dim, n_hists)
- `n_hists` training distributions :math:`a_i` of size `dim`
+ `n_hists` training distributions :math:`\mathbf{a}_i` of size `dim`
M : array-like, shape (dim, dim)
loss matrix for OT
reg : float
Regularization term > 0
weights : array-like, shape (n_hists,)
- Weights of each histogram :math:`a_i` on the simplex (barycentric coodinates)
+ Weights of each histogram :math:`\mathbf{a}_i` on the simplex (barycentric coordinates)
numItermax : int, optional
Max number of iterations
stopThr : float, optional
@@ -1414,12 +1415,12 @@ def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000,
def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000,
stopThr=1e-4, verbose=False, log=False):
- r"""Compute the entropic regularized wasserstein barycenter of distributions A with stabilization.
+ r"""Compute the entropic regularized wasserstein barycenter of distributions :math:`\mathbf{A}` with stabilization.
The function solves the following optimization problem:
.. math::
- \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
+ \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
where :
@@ -1432,7 +1433,7 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000,
Parameters
----------
A : array-like, shape (dim, n_hists)
- `n_hists` training distributions :math:`a_i` of size `dim`
+ `n_hists` training distributions :math:`\mathbf{a}_i` of size `dim`
M : array-like, shape (dim, dim)
loss matrix for OT
reg : float
@@ -1440,7 +1441,7 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000,
tau : float
threshold for max value in :math:`\mathbf{u}` or :math:`\mathbf{v}` for log scaling
weights : array-like, shape (n_hists,)
- Weights of each histogram :math:`a_i` on the simplex (barycentric coodinates)
+ Weights of each histogram :math:`\mathbf{a}_i` on the simplex (barycentric coordinates)
numItermax : int, optional
Max number of iterations
stopThr : float, optional
@@ -1533,8 +1534,8 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000,
"Or a larger absorption threshold `tau`.")
if log:
log['niter'] = cpt
- log['logu'] = np.log(u + 1e-16)
- log['logv'] = np.log(v + 1e-16)
+ log['logu'] = nx.log(u + 1e-16)
+ log['logv'] = nx.log(v + 1e-16)
return q, log
else:
return q
@@ -1543,13 +1544,13 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000,
def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000,
stopThr=1e-9, stabThr=1e-30, verbose=False,
log=False):
- r"""Compute the entropic regularized wasserstein barycenter of distributions A
- where A is a collection of 2D images.
+ r"""Compute the entropic regularized wasserstein barycenter of distributions :math:`\mathbf{A}`
+ where :math:`\mathbf{A}` is a collection of 2D images.
The function solves the following optimization problem:
.. math::
- \mathbf{a} = arg\min_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
+ \mathbf{a} = \mathop{\arg \min}_\mathbf{a} \sum_i W_{reg}(\mathbf{a},\mathbf{a}_i)
where :
@@ -1673,12 +1674,12 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000,
.. math::
- \mathbf{h} = arg\min_\mathbf{h} (1- \alpha) W_{M,reg}(\mathbf{a},\mathbf{Dh})+\alpha W_{M_0,reg_0}(\mathbf{h}_0,\mathbf{h})
+ \mathbf{h} = \mathop{\arg \min}_\mathbf{h} (1- \alpha) W_{\mathbf{M}, \mathrm{reg}}(\mathbf{a},\mathbf{Dh})+\alpha W_{\mathbf{M_0},\mathrm{reg}_0}(\mathbf{h}_0,\mathbf{h})
where :
- - :math:`W_{M,reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance with M loss matrix (see :py:func:`ot.bregman.sinkhorn`)
+ - :math:`W_{M,reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance with :math:`\mathbf{M}` loss matrix (see :py:func:`ot.bregman.sinkhorn`)
- :math:`\mathbf{D}` is a dictionary of `n_atoms` atoms of dimension `dim_a`, its expected shape is `(dim_a, n_atoms)`
- :math:`\mathbf{h}` is the estimated unmixing of dimension `n_atoms`
- :math:`\mathbf{a}` is an observed distribution of dimension `dim_a`
@@ -1790,7 +1791,7 @@ def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100,
.. math::
- \mathbf{h} = arg\min_{\mathbf{h}}\quad \sum_{k=1}^{K} \lambda_k
+ \mathbf{h} = \mathop{\arg \min}_{\mathbf{h}} \sum_{k=1}^{K} \lambda_k
W_{reg}((\mathbf{D}_2^{(k)} \mathbf{h})^T, \mathbf{a})
s.t. \ \forall k, \mathbf{D}_1^{(k)} \gamma_k \mathbf{1}_n= \mathbf{h}
@@ -1898,7 +1899,7 @@ def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100,
K.append(Ktmp)
# uniform target distribution
- a = nx.from_numpy(unif(np.shape(Xt)[0]))
+ a = nx.from_numpy(unif(Xt.shape[0]), type_as=Xs[0])
cpt = 0 # iterations count
err = 1
@@ -1956,13 +1957,13 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean',
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= a
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= b
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`n_samples_a`, `n_samples_b`) metric cost matrix
@@ -2010,8 +2011,8 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean',
>>> n_samples_a = 2
>>> n_samples_b = 2
>>> reg = 0.1
- >>> X_s = np.reshape(np.arange(n_samples_a), (n_samples_a, 1))
- >>> X_t = np.reshape(np.arange(0, n_samples_b), (n_samples_b, 1))
+ >>> X_s = np.reshape(np.arange(n_samples_a, dtype=np.float64), (n_samples_a, 1))
+ >>> X_t = np.reshape(np.arange(0, n_samples_b, dtype=np.float64), (n_samples_b, 1))
>>> empirical_sinkhorn(X_s, X_t, reg=reg, verbose=False) # doctest: +NORMALIZE_WHITESPACE
array([[4.99977301e-01, 2.26989344e-05],
[2.26989344e-05, 4.99977301e-01]])
@@ -2033,9 +2034,9 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean',
ns, nt = X_s.shape[0], X_t.shape[0]
if a is None:
- a = nx.from_numpy(unif(ns))
+ a = nx.from_numpy(unif(ns), type_as=X_s)
if b is None:
- b = nx.from_numpy(unif(nt))
+ b = nx.from_numpy(unif(nt), type_as=X_s)
if isLazy:
if log:
@@ -2127,13 +2128,13 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num
The function solves the following optimization problem:
.. math::
- W = \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ W = \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= a
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= b
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`n_samples_a`, `n_samples_b`) metric cost matrix
@@ -2181,8 +2182,8 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num
>>> n_samples_a = 2
>>> n_samples_b = 2
>>> reg = 0.1
- >>> X_s = np.reshape(np.arange(n_samples_a), (n_samples_a, 1))
- >>> X_t = np.reshape(np.arange(0, n_samples_b), (n_samples_b, 1))
+ >>> X_s = np.reshape(np.arange(n_samples_a, dtype=np.float64), (n_samples_a, 1))
+ >>> X_t = np.reshape(np.arange(0, n_samples_b, dtype=np.float64), (n_samples_b, 1))
>>> b = np.full((n_samples_b, 3), 1/n_samples_b)
>>> empirical_sinkhorn2(X_s, X_t, b=b, reg=reg, verbose=False)
array([4.53978687e-05, 4.53978687e-05, 4.53978687e-05])
@@ -2204,9 +2205,9 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num
ns, nt = X_s.shape[0], X_t.shape[0]
if a is None:
- a = nx.from_numpy(unif(ns))
+ a = nx.from_numpy(unif(ns), type_as=X_s)
if b is None:
- b = nx.from_numpy(unif(nt))
+ b = nx.from_numpy(unif(nt), type_as=X_s)
if isLazy:
if log:
@@ -2259,32 +2260,32 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli
.. math::
- W &= \min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+ W &= \min_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot\Omega(\gamma)
- W_a &= \min_{\gamma_a} <\gamma_a,M_a>_F + reg\cdot\Omega(\gamma_a)
+ W_a &= \min_{\gamma_a} <\gamma_a, \mathbf{M_a}>_F + \mathrm{reg} \cdot\Omega(\gamma_a)
- W_b &= \min_{\gamma_b} <\gamma_b,M_b>_F + reg\cdot\Omega(\gamma_b)
+ W_b &= \min_{\gamma_b} <\gamma_b, \mathbf{M_b}>_F + \mathrm{reg} \cdot\Omega(\gamma_b)
S &= W - \frac{W_a + W_b}{2}
.. math::
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= a
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= b
- \gamma\geq 0
+ \gamma &\geq 0
- \gamma_a 1 = a
+ \gamma_a \mathbf{1} &= \mathbf{a}
- \gamma_a^T 1= a
+ \gamma_a^T \mathbf{1} &= \mathbf{a}
- \gamma_a\geq 0
+ \gamma_a &\geq 0
- \gamma_b 1 = b
+ \gamma_b \mathbf{1} &= \mathbf{b}
- \gamma_b^T 1= b
+ \gamma_b^T \mathbf{1} &= \mathbf{b}
- \gamma_b\geq 0
+ \gamma_b &\geq 0
where :
- :math:`\mathbf{M}` (resp. :math:`\mathbf{M_a}`, :math:`\mathbf{M_b}`) is the (`n_samples_a`, `n_samples_b`) metric cost matrix (resp (`n_samples_a, n_samples_a`) and (`n_samples_b`, `n_samples_b`))
@@ -2325,8 +2326,8 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli
>>> n_samples_a = 2
>>> n_samples_b = 4
>>> reg = 0.1
- >>> X_s = np.reshape(np.arange(n_samples_a), (n_samples_a, 1))
- >>> X_t = np.reshape(np.arange(0, n_samples_b), (n_samples_b, 1))
+ >>> X_s = np.reshape(np.arange(n_samples_a, dtype=np.float64), (n_samples_a, 1))
+ >>> X_t = np.reshape(np.arange(0, n_samples_b, dtype=np.float64), (n_samples_b, 1))
>>> empirical_sinkhorn_divergence(X_s, X_t, reg) # doctest: +ELLIPSIS
1.499887176049052
@@ -2380,19 +2381,19 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res
.. math::
- (u, v) = arg\min_{u, v} 1_{ns}^T B(u,v) 1_{nt} - <\kappa u, a> - <v/\kappa, b>
+ (\mathbf{u}, \mathbf{v}) = \mathop{\arg \min}_{\mathbf{u}, \mathbf{v}} \ \mathbf{1}_{ns}^T \mathbf{B}(\mathbf{u}, \mathbf{v}) \mathbf{1}_{nt} - <\kappa \mathbf{u}, \mathbf{a}> - <\mathbf{v} / \kappa, \mathbf{b}>
where:
.. math::
- B(u,v) = \mathrm{diag}(e^u) K \mathrm{diag}(e^v) \text{, with } K = e^{-M/reg} \text{ and}
+ \mathbf{B}(\mathbf{u}, \mathbf{v}) = \mathrm{diag}(e^\mathbf{u}) \mathbf{K} \mathrm{diag}(e^\mathbf{v}) \text{, with } \mathbf{K} = e^{-\mathbf{M} / \mathrm{reg}} \text{ and}
.. math::
- s.t. \ e^{u_i} \geq \epsilon / \kappa, \forall i \in \{1, \ldots, ns\}
+ s.t. \ e^{u_i} &\geq \epsilon / \kappa, \forall i \in \{1, \ldots, ns\}
- e^{v_j} \geq \epsilon \kappa, \forall j \in \{1, \ldots, nt\}
+ e^{v_j} &\geq \epsilon \kappa, \forall j \in \{1, \ldots, nt\}
The parameters `kappa` and `epsilon` are determined w.r.t the couple number budget of points (`ns_budget`, `nt_budget`), see Equation (5) in :ref:`[26] <references-screenkhorn>`
@@ -2531,7 +2532,8 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res
epsilon_u_square = a[0] / aK_sort[ns_budget - 1]
else:
aK_sort = nx.from_numpy(
- bottleneck.partition(nx.to_numpy(K_sum_cols), ns_budget - 1)[ns_budget - 1]
+ bottleneck.partition(nx.to_numpy(K_sum_cols), ns_budget - 1)[ns_budget - 1],
+ type_as=M
)
epsilon_u_square = a[0] / aK_sort
@@ -2540,7 +2542,8 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res
epsilon_v_square = b[0] / bK_sort[nt_budget - 1]
else:
bK_sort = nx.from_numpy(
- bottleneck.partition(nx.to_numpy(K_sum_rows), nt_budget - 1)[nt_budget - 1]
+ bottleneck.partition(nx.to_numpy(K_sum_rows), nt_budget - 1)[nt_budget - 1],
+ type_as=M
)
epsilon_v_square = b[0] / bK_sort
else:
@@ -2589,10 +2592,9 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res
K_IcJ = K[np.ix_(Ic, Jsel)]
K_IJc = K[np.ix_(Isel, Jc)]
- #K_min = K_IJ.min()
K_min = nx.min(K_IJ)
if K_min == 0:
- K_min = np.finfo(float).tiny
+ K_min = float(np.finfo(float).tiny)
# a_I, b_J, a_Ic, b_Jc
a_I = a[Isel]
@@ -2713,7 +2715,7 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res
maxfun=maxfun,
pgtol=pgtol,
maxiter=maxiter)
- theta = nx.from_numpy(theta)
+ theta = nx.from_numpy(theta, type_as=M)
usc = theta[:ns_budget]
vsc = theta[ns_budget:]
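
The recurring change in ot/bregman.py above is threading `type_as` through nx.from_numpy so that arrays created inside the solvers inherit the dtype (and, on torch, the device) of the caller's inputs. A hedged sketch of that pattern, assuming an input array from any supported backend (the helper name is made up for illustration):

    # Sketch of the type_as pattern, not part of the patch
    import ot
    from ot.backend import get_backend

    def uniform_weights_like(X_s):
        nx = get_backend(X_s)
        ns = X_s.shape[0]
        # Without type_as, from_numpy would hand back a default float64 array;
        # type_as copies the dtype/device of X_s instead.
        return nx.from_numpy(ot.unif(ns), type_as=X_s)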
diff --git a/ot/gromov.py b/ot/gromov.py
index 33b4453..a0fbf48 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -14,67 +14,85 @@ import numpy as np
from .bregman import sinkhorn
-from .utils import dist, UndefinedParameter
+from .utils import dist, UndefinedParameter, list_to_array
from .optim import cg
from .lp import emd_1d, emd
from .utils import check_random_state
-
-from scipy.sparse import issparse
+from .backend import get_backend
def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
r"""Return loss matrices and tensors for Gromov-Wasserstein fast computation
- Returns the value of \mathcal{L}(C1,C2) \otimes T with the selected loss
- function as the loss function of Gromow-Wasserstein discrepancy.
+ Returns the value of :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` with the
+ selected loss function as the loss function of Gromov-Wasserstein discrepancy.
- The matrices are computed as described in Proposition 1 in [12]
+ The matrices are computed as described in Proposition 1 in :ref:`[12] <references-init-matrix>`
Where :
- * C1 : Metric cost matrix in the source space
- * C2 : Metric cost matrix in the target space
- * T : A coupling between those two spaces
-
- The square-loss function L(a,b)=|a-b|^2 is read as :
- L(a,b) = f1(a)+f2(b)-h1(a)*h2(b) with :
- * f1(a)=(a^2)
- * f2(b)=(b^2)
- * h1(a)=a
- * h2(b)=2*b
-
- The kl-loss function L(a,b)=a*log(a/b)-a+b is read as :
- L(a,b) = f1(a)+f2(b)-h1(a)*h2(b) with :
- * f1(a)=a*log(a)-a
- * f2(b)=b
- * h1(a)=a
- * h2(b)=log(b)
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{T}`: A coupling between those two spaces
+
+ The square-loss function :math:`L(a, b) = |a - b|^2` is read as :
+
+ .. math::
+
+ L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)
+
+ \mathrm{with} \ f_1(a) &= a^2
+
+ f_2(b) &= b^2
+
+ h_1(a) &= a
+
+ h_2(b) &= 2b
+
+ The kl-loss function :math:`L(a, b) = a \log\left(\frac{a}{b}\right) - a + b` is read as :
+
+ .. math::
+
+ L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)
+
+ \mathrm{with} \ f_1(a) &= a \log(a) - a
+
+ f_2(b) &= b
+
+ h_1(a) &= a
+
+ h_2(b) &= \log(b)
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- T : ndarray, shape (ns, nt)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ T : array-like, shape (ns, nt)
Coupling between source and target spaces
- p : ndarray, shape (ns,)
+ p : array-like, shape (ns,)
Returns
-------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
+
+ .. _references-init-matrix:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
if loss_fun == 'square_loss':
def f1(a):
@@ -90,7 +108,7 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
return 2 * b
elif loss_fun == 'kl_loss':
def f1(a):
- return a * np.log(a + 1e-15) - a
+ return a * nx.log(a + 1e-15) - a
def f2(b):
return b
@@ -99,12 +117,16 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
return a
def h2(b):
- return np.log(b + 1e-15)
-
- constC1 = np.dot(np.dot(f1(C1), p.reshape(-1, 1)),
- np.ones(len(q)).reshape(1, -1))
- constC2 = np.dot(np.ones(len(p)).reshape(-1, 1),
- np.dot(q.reshape(1, -1), f2(C2).T))
+ return nx.log(b + 1e-15)
+
+ constC1 = nx.dot(
+ nx.dot(f1(C1), nx.reshape(p, (-1, 1))),
+ nx.ones((1, len(q)), type_as=q)
+ )
+ constC2 = nx.dot(
+ nx.ones((len(p), 1), type_as=p),
+ nx.dot(nx.reshape(q, (1, -1)), f2(C2).T)
+ )
constC = constC1 + constC2
hC1 = h1(C1)
hC2 = h2(C2)
@@ -115,30 +137,37 @@ def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
def tensor_product(constC, hC1, hC2, T):
r"""Return the tensor for Gromov-Wasserstein fast computation
- The tensor is computed as described in Proposition 1 Eq. (6) in [12].
+ The tensor is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-tensor-product>`
Parameters
----------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
Returns
-------
- tens : ndarray, shape (ns, nt)
- \mathcal{L}(C1,C2) \otimes T tensor-matrix multiplication result
+ tens : array-like, shape (`ns`, `nt`)
+ :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` tensor-matrix multiplication result
+
+ .. _references-tensor-product:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
- A = -np.dot(hC1, T).dot(hC2.T)
+ constC, hC1, hC2, T = list_to_array(constC, hC1, hC2, T)
+ nx = get_backend(constC, hC1, hC2, T)
+
+ A = - nx.dot(
+ nx.dot(hC1, T), hC2.T
+ )
tens = constC + A
# tens -= tens.min()
return tens
@@ -147,27 +176,29 @@ def tensor_product(constC, hC1, hC2, T):
def gwloss(constC, hC1, hC2, T):
"""Return the Loss for Gromov-Wasserstein
- The loss is computed as described in Proposition 1 Eq. (6) in [12].
+ The loss is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-gwloss>`
Parameters
----------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
- T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
+ T : array-like, shape (ns, nt)
+ Current value of transport matrix :math:`\mathbf{T}`
Returns
-------
loss : float
Gromov Wasserstein loss
+
+ .. _references-gwloss:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -175,33 +206,38 @@ def gwloss(constC, hC1, hC2, T):
tens = tensor_product(constC, hC1, hC2, T)
- return np.sum(tens * T)
+ tens, T = list_to_array(tens, T)
+ nx = get_backend(tens, T)
+
+ return nx.sum(tens * T)
def gwggrad(constC, hC1, hC2, T):
"""Return the gradient for Gromov-Wasserstein
- The gradient is computed as described in Proposition 2 in [12].
+ The gradient is computed as described in Proposition 2 in :ref:`[12] <references-gwggrad>`
Parameters
----------
- constC : ndarray, shape (ns, nt)
- Constant C matrix in Eq. (6)
- hC1 : ndarray, shape (ns, ns)
- h1(C1) matrix in Eq. (6)
- hC2 : ndarray, shape (nt, nt)
- h2(C) matrix in Eq. (6)
- T : ndarray, shape (ns, nt)
- Current value of transport matrix T
+ constC : array-like, shape (ns, nt)
+ Constant :math:`\mathbf{C}` matrix in Eq. (6)
+ hC1 : array-like, shape (ns, ns)
+ :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
+ hC2 : array-like, shape (nt, nt)
+ :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
+ T : array-like, shape (ns, nt)
+ Current value of transport matrix :math:`\mathbf{T}`
Returns
-------
- grad : ndarray, shape (ns, nt)
+ grad : array-like, shape (`ns`, `nt`)
Gromov Wasserstein gradient
+
+ .. _references-gwggrad:
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
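
To make the Proposition 1 decomposition documented above concrete, here is a short sketch (not part of the patch) that evaluates the Gromov-Wasserstein objective for a fixed coupling with the helpers from this file, using plain NumPy inputs that the new backend dispatch accepts:

    # Illustrative: GW cost of the independent coupling, via the helpers above
    import numpy as np
    from ot.gromov import init_matrix, gwloss

    rng = np.random.RandomState(0)
    n, m = 5, 4
    C1 = rng.rand(n, n); C1 = (C1 + C1.T) / 2   # symmetric cost, source space
    C2 = rng.rand(m, m); C2 = (C2 + C2.T) / 2   # symmetric cost, target space
    p = np.full(n, 1.0 / n)
    q = np.full(m, 1.0 / m)
    T = np.outer(p, q)                          # independent coupling p q^T

    constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun='square_loss')
    print(gwloss(constC, hC1, hC2, T))          # sum_ijkl L(C1_ik, C2_jl) T_ij T_kl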
@@ -212,88 +248,107 @@ def gwggrad(constC, hC1, hC2, T):
def update_square_loss(p, lambdas, T, Cs):
"""
- Updates C according to the L2 Loss kernel with the S Ts couplings
- calculated at each iteration
+ Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s`
+ couplings calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
Masses in the targeted barycenter.
lambdas : list of float
- List of the S spaces' weights.
- T : list of S np.ndarray of shape (ns,N)
- The S Ts couplings calculated at each iteration.
- Cs : list of S ndarray, shape(ns,ns)
+ List of the `S` spaces' weights.
+ T : list of S array-like of shape (ns,N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
+ Cs : list of S array-like, shape(ns,ns)
Metric cost matrices.
Returns
----------
- C : ndarray, shape (nt, nt)
- Updated C matrix.
+ C : array-like, shape (`nt`, `nt`)
+ Updated :math:`\mathbf{C}` matrix.
"""
- tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s])
- for s in range(len(T))])
- ppt = np.outer(p, p)
+ T = list_to_array(*T)
+ Cs = list_to_array(*Cs)
+ p = list_to_array(p)
+ nx = get_backend(p, *T, *Cs)
+
+ tmpsum = sum([
+ lambdas[s] * nx.dot(
+ nx.dot(T[s].T, Cs[s]),
+ T[s]
+ ) for s in range(len(T))
+ ])
+ ppt = nx.outer(p, p)
- return np.divide(tmpsum, ppt)
+ return tmpsum / ppt
def update_kl_loss(p, lambdas, T, Cs):
"""
- Updates C according to the KL Loss kernel with the S Ts couplings calculated at each iteration
+ Updates :math:`\mathbf{C}` according to the KL Loss kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
Weights in the targeted barycenter.
- lambdas : list of the S spaces' weights
- T : list of S np.ndarray of shape (ns,N)
- The S Ts couplings calculated at each iteration.
- Cs : list of S ndarray, shape(ns,ns)
+ lambdas : list of float
+ List of the `S` spaces' weights
+ T : list of S array-like of shape (ns,N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
+ Cs : list of S array-like, shape(ns,ns)
Metric cost matrices.
Returns
----------
- C : ndarray, shape (ns,ns)
- updated C matrix
+ C : array-like, shape (`ns`, `ns`)
+ updated :math:`\mathbf{C}` matrix
"""
- tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s])
- for s in range(len(T))])
- ppt = np.outer(p, p)
+ Cs = list_to_array(*Cs)
+ T = list_to_array(*T)
+ p = list_to_array(p)
+ nx = get_backend(p, *T, *Cs)
- return np.exp(np.divide(tmpsum, ppt))
+ tmpsum = sum([
+ lambdas[s] * nx.dot(
+ nx.dot(T[s].T, Cs[s]),
+ T[s]
+ ) for s in range(len(T))
+ ])
+ ppt = nx.outer(p, p)
+
+ return nx.exp(tmpsum / ppt)
def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
r"""
- Returns the gromov-wasserstein transport between (C1,p) and (C2,q)
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
-
max_iter : int, optional
Max number of iterations
tol : float, optional
@@ -303,22 +358,23 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
log : bool, optional
record log if True
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
+ If True the step of the line-search is found via an armijo search. Else closed form is used.
+ If there are convergence issues use False.
**kwargs : dict
parameters can be directly passed to the ot.optim.cg solver
Returns
-------
- T : ndarray, shape (ns, nt)
- Doupling between the two spaces that minimizes:
- \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ T : array-like, shape (`ns`, `nt`)
+ Coupling between the two spaces that minimizes:
+
+ :math:`\sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}`
log : dict
Convergence information and loss.
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -327,6 +383,7 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
mathematics 11.4 (2011): 417-487.
"""
+ p, q = list_to_array(p, q)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -348,29 +405,30 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs
def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
r"""
- Returns the gromov-wasserstein discrepancy between (C1,p) and (C2,q)
+ Returns the gromov-wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ GW = \min_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
+ C2 : array-like, shape (nt, nt)
Metric cost matrix in the target space
- p : ndarray, shape (ns,)
+ p : array-like, shape (ns,)
Distribution in the source space.
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space.
loss_fun : str
loss function used for the solver either 'square_loss' or 'kl_loss'
@@ -383,8 +441,8 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
log : bool, optional
record log if True
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
+ If True the step of the line-search is found via an armijo search. Else closed form is used.
+ If there are convergence issues use False.
Returns
-------
@@ -395,7 +453,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
@@ -404,6 +462,7 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
mathematics 11.4 (2011): 417-487.
"""
+ p, q = list_to_array(p, q)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -425,42 +484,45 @@ def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwarg
def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
r"""
- Computes the FGW transport between two graphs see [24]
+ Computes the FGW transport between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein>`)
.. math::
- \gamma = arg\min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
- L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \gamma = \mathop{\arg \min}_\gamma (1 - \alpha) <\gamma, \mathbf{M}>_F + \alpha \sum_{i,j,k,l}
+ L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+
+ s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p}
- s.t. \gamma 1 = p
- \gamma^T 1= q
- \gamma\geq 0
+ \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q}
+
+ \mathbf{\gamma} &\geq 0
where :
- - M is the (ns,nt) metric cost matrix
- - p and q are source and target weights (sum to 1)
- - L is a loss function to account for the misfit between the similarity matrices
- The algorithm used for solving the problem is conditional gradient as discussed in [24]_
+ - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+ - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
+ - `L` is a loss function to account for the misfit between the similarity matrices
+
+ The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein>`
Parameters
----------
- M : ndarray, shape (ns, nt)
+ M : array-like, shape (ns, nt)
Metric cost matrix between features across domains
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix representative of the structure in the source space
- C2 : ndarray, shape (nt, nt)
+ C2 : array-like, shape (nt, nt)
Metric cost matrix representative of the structure in the target space
- p : ndarray, shape (ns,)
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : str, optional
Loss function used for the solver
alpha : float, optional
Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
+ If True the step of the line-search is found via an armijo search. Else closed form is used.
+ If there are convergence issues use False.
log : bool, optional
record log if True
**kwargs : dict
@@ -468,18 +530,21 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
Returns
-------
- gamma : ndarray, shape (ns, nt)
+ gamma : array-like, shape (`ns`, `nt`)
Optimal transportation matrix for the given parameters.
log : dict
Log dictionary return only if log==True in parameters.
+
+ .. _references-fused-gromov-wasserstein:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas "Optimal Transport for structured data with
application on graphs", International Conference on Machine Learning
(ICML). 2019.
"""
+ p, q = list_to_array(p, q)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -501,61 +566,67 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5,
def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
r"""
- Computes the FGW distance between two graphs see [24]
+ Computes the FGW distance between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein2>`)
.. math::
- \min_\gamma (1-\\alpha)*<\gamma,M>_F + \\alpha* \sum_{i,j,k,l}
- L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
+ \min_\gamma (1 - \alpha) <\gamma, \mathbf{M}>_F + \alpha \sum_{i,j,k,l}
+ L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+ s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p}
- s.t. \gamma 1 = p
- \gamma^T 1= q
- \gamma\geq 0
+ \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q}
+
+ \mathbf{\gamma} &\geq 0
where :
- - M is the (ns,nt) metric cost matrix
- - p and q are source and target weights (sum to 1)
- - L is a loss function to account for the misfit between the similarity matrices
- The algorithm used for solving the problem is conditional gradient as discussed in [1]_
+
+ - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
+ - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
+ - `L` is a loss function to account for the misfit between the similarity matrices
+
+ The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein2>`
Parameters
----------
- M : ndarray, shape (ns, nt)
+ M : array-like, shape (ns, nt)
Metric cost matrix between features across domains
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix representative of the structure in the source space.
- C2 : ndarray, shape (nt, nt)
+ C2 : array-like, shape (nt, nt)
Metric cost matrix representative of the structure in the target space.
- p : ndarray, shape (ns,)
+ p : array-like, shape (ns,)
Distribution in the source space.
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space.
loss_fun : str, optional
Loss function used for the solver.
alpha : float, optional
Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
- If True the steps of the line-search is found via an armijo research.
- Else closed form is used. If there is convergence issues use False.
+ If True the step of the line-search is found via an armijo search.
+ Else closed form is used. If there are convergence issues use False.
log : bool, optional
Record log if True.
**kwargs : dict
- Parameters can be directly pased to the ot.optim.cg solver.
+ Parameters can be directly passed to the ot.optim.cg solver.
Returns
-------
- gamma : ndarray, shape (ns, nt)
+ gamma : array-like, shape (ns, nt)
Optimal transportation matrix for the given parameters.
log : dict
Log dictionary return only if log==True in parameters.
+
+ .. _references-fused-gromov-wasserstein2:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
+ p, q = list_to_array(p, q)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -579,60 +650,64 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5
def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
nb_samples_p=None, nb_samples_q=None, std=True, random_state=None):
r"""
- Returns an approximation of the gromov-wasserstein cost between (C1,p) and (C2,q)
- with a fixed transport plan T.
-
- The function gives an unbiased approximation of the following equation:
-
- .. math::
- GW = \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
-
- Where :
-
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - L : Loss function to account for the misfit between the similarity matrices
- - T : Matrix with marginal p and q
-
- Parameters
- ----------
- C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
- Distribution in the source space
- q : ndarray, shape (nt,)
- Distribution in the target space
- loss_fun : function: \mathcal{R} \times \mathcal{R} \shortarrow \mathcal{R}
- Loss function used for the distance, the transport plan does not depend on the loss function
- T : csr or ndarray, shape (ns, nt)
- Transport plan matrix, either a sparse csr matrix or
- nb_samples_p : int, optional
- nb_samples_p is the number of samples (without replacement) along the first dimension of T.
- nb_samples_q : int, optional
- nb_samples_q is the number of samples along the second dimension of T, for each sample along the first.
- std : bool, optional
- Standard deviation associated with the prediction of the gromov-wasserstein cost.
- random_state : int or RandomState instance, optional
- Fix the seed for to allow reproducibility
-
- Returns
- -------
- : float
- Gromov-wasserstein cost
-
- References
- ----------
- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
- "Sampled Gromov Wasserstein."
- Machine Learning Journal (MLJ). 2021.
-
- """
+ Returns an approximation of the gromov-wasserstein cost between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+ with a fixed transport plan :math:`\mathbf{T}`.
+
+ The function gives an unbiased approximation of the following equation:
+
+ .. math::
+
+ GW = \sum_{i,j,k,l} L(\mathbf{C_{1}}_{i,k}, \mathbf{C_{2}}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+
+ Where :
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - `L` : Loss function to account for the misfit between the similarity matrices
+ - :math:`\mathbf{T}`: Matrix with marginal :math:`\mathbf{p}` and :math:`\mathbf{q}`
+
+ Parameters
+ ----------
+ C1 : array-like, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
+ Distribution in the source space
+ q : array-like, shape (nt,)
+ Distribution in the target space
+ loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
+ Loss function used for the distance, the transport plan does not depend on the loss function
+ T : csr or array-like, shape (ns, nt)
+ Transport plan matrix, either a sparse csr or a dense matrix
+ nb_samples_p : int, optional
+ `nb_samples_p` is the number of samples (without replacement) along the first dimension of :math:`\mathbf{T}`
+ nb_samples_q : int, optional
+ `nb_samples_q` is the number of samples along the second dimension of :math:`\mathbf{T}`, for each sample along the first
+ std : bool, optional
+ Standard deviation associated with the prediction of the gromov-wasserstein cost
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
+
+ Returns
+ -------
+ : float
+ Gromov-wasserstein cost
+
+ References
+ ----------
+ .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
+ "Sampled Gromov Wasserstein."
+ Machine Learning Journal (MLJ). 2021.
+
+ """
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
+
generator = check_random_state(random_state)
- len_p = len(p)
- len_q = len(q)
+ len_p = p.shape[0]
+ len_q = q.shape[0]
# It is always better to sample from the biggest distribution first.
if len_p < len_q:
@@ -642,7 +717,7 @@ def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
T = T.T
if nb_samples_p is None:
- if issparse(T):
+ if nx.issparse(T):
# If T is sparse, it probably means that PoGroW was used, thus the number of samples is reduced
nb_samples_p = min(int(5 * (len_p * np.log(len_p)) ** 0.5), len_p)
else:
@@ -657,100 +732,112 @@ def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
index_k = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
index_l = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
- list_value_sample = np.zeros((nb_samples_p, nb_samples_p, nb_samples_q))
index_i = generator.choice(len_p, size=nb_samples_p, p=p, replace=False)
index_j = generator.choice(len_p, size=nb_samples_p, p=p, replace=False)
for i in range(nb_samples_p):
- if issparse(T):
- T_indexi = T[index_i[i], :].toarray()[0]
- T_indexj = T[index_j[i], :].toarray()[0]
+ if nx.issparse(T):
+ T_indexi = nx.reshape(nx.todense(T[index_i[i], :]), (-1,))
+ T_indexj = nx.reshape(nx.todense(T[index_j[i], :]), (-1,))
else:
T_indexi = T[index_i[i], :]
T_indexj = T[index_j[i], :]
         # For each of the rows sampled, the columns are sampled.
- index_k[i] = generator.choice(len_q, size=nb_samples_q, p=T_indexi / T_indexi.sum(), replace=True)
- index_l[i] = generator.choice(len_q, size=nb_samples_q, p=T_indexj / T_indexj.sum(), replace=True)
-
- for n in range(nb_samples_q):
- list_value_sample[:, :, n] = loss_fun(C1[np.ix_(index_i, index_j)], C2[np.ix_(index_k[:, n], index_l[:, n])])
+ index_k[i] = generator.choice(
+ len_q,
+ size=nb_samples_q,
+ p=T_indexi / nx.sum(T_indexi),
+ replace=True
+ )
+ index_l[i] = generator.choice(
+ len_q,
+ size=nb_samples_q,
+ p=T_indexj / nx.sum(T_indexj),
+ replace=True
+ )
+
+ list_value_sample = nx.stack([
+ loss_fun(
+ C1[np.ix_(index_i, index_j)],
+ C2[np.ix_(index_k[:, n], index_l[:, n])]
+ ) for n in range(nb_samples_q)
+ ], axis=2)
if std:
- std_value = np.sum(np.std(list_value_sample, axis=2) ** 2) ** 0.5
- return np.mean(list_value_sample), std_value / (nb_samples_p * nb_samples_p)
+ std_value = nx.sum(nx.std(list_value_sample, axis=2) ** 2) ** 0.5
+ return nx.mean(list_value_sample), std_value / (nb_samples_p * nb_samples_p)
else:
- return np.mean(list_value_sample)
+ return nx.mean(list_value_sample)
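A minimal usage sketch of the estimator above (editor's illustration, not part of the patch); it assumes `GW_distance_estimation` is importable from `ot.gromov` and uses the independent coupling `np.outer(p, q)` as the fixed plan purely for illustration::

    import numpy as np
    import ot
    from ot.gromov import GW_distance_estimation

    rng = np.random.RandomState(0)
    xs, xt = rng.randn(20, 2), rng.randn(30, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1, C2 = C1 / C1.max(), C2 / C2.max()
    p, q = ot.unif(20), ot.unif(30)
    T = np.outer(p, q)  # any fixed plan with marginals p and q

    def loss(x, y):
        return np.abs(x - y)

    # std=True returns both the estimated cost and its standard deviation
    gw_est, gw_std = GW_distance_estimation(C1, C2, p, q, loss, T, std=True, random_state=0)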
def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun,
alpha=1, max_iter=100, threshold_plan=0, log=False, verbose=False, random_state=None):
r"""
- Returns the gromov-wasserstein transport between (C1,p) and (C2,q) using a stochastic Frank-Wolfe.
- This method as a O(max_iter \times PN^2) time complexity with P the number of Sinkhorn iterations.
-
- The function solves the following optimization problem:
-
- .. math::
- GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
-
- s.t. T 1 = p
-
- T^T 1= q
-
- T\geq 0
-
- Where :
-
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
-
- Parameters
- ----------
- C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
- Distribution in the source space
- q : ndarray, shape (nt,)
- Distribution in the target space
- loss_fun : function: \mathcal{R} \times \mathcal{R} \shortarrow \mathcal{R}
- Loss function used for the distance, the transport plan does not depend on the loss function
- alpha : float
- Step of the Frank-Wolfe algorithm, should be between 0 and 1
- max_iter : int, optional
- Max number of iterations
- threshold_plan : float, optional
- Deleting very small value in the transport plan. If above zero, it violate the marginal constraints.
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- Gives the distance estimated and the standard deviation
- random_state : int or RandomState instance, optional
- Fix the seed for to allow reproducibility
-
- Returns
- -------
- T : ndarray, shape (ns, nt)
- Optimal coupling between the two spaces
-
- References
- ----------
- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
- "Sampled Gromov Wasserstein."
- Machine Learning Journal (MLJ). 2021.
-
- """
- C1 = np.asarray(C1, dtype=np.float64)
- C2 = np.asarray(C2, dtype=np.float64)
- p = np.asarray(p, dtype=np.float64)
- q = np.asarray(q, dtype=np.float64)
- len_p = len(p)
- len_q = len(q)
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a stochastic Frank-Wolfe.
+ This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times PN^2)` time complexity with `P` the number of Sinkhorn iterations.
+
+ The function solves the following optimization problem:
+
+ .. math::
+ \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+
+ s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
+
+ \mathbf{T}^T \mathbf{1} &= \mathbf{q}
+
+ \mathbf{T} &\geq 0
+
+ Where :
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
+
+ Parameters
+ ----------
+ C1 : array-like, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
+ Distribution in the source space
+ q : array-like, shape (nt,)
+ Distribution in the target space
+ loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
+        Loss function used for the distance; the transport plan does not depend on the loss function
+ alpha : float
+ Step of the Frank-Wolfe algorithm, should be between 0 and 1
+ max_iter : int, optional
+ Max number of iterations
+ threshold_plan : float, optional
+        Threshold for deleting very small values in the transport plan. If above zero, it violates the marginal constraints.
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+        If True, also returns a log dictionary with the estimated distance and its standard deviation
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
+
+ Returns
+ -------
+ T : array-like, shape (`ns`, `nt`)
+ Optimal coupling between the two spaces
+
+ References
+ ----------
+ .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
+ "Sampled Gromov Wasserstein."
+ Machine Learning Journal (MLJ). 2021.
+
+ """
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
+
+ len_p = p.shape[0]
+ len_q = q.shape[0]
generator = check_random_state(random_state)
@@ -759,30 +846,35 @@ def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun,
# Initialize with default marginal
index[0] = generator.choice(len_p, size=1, p=p)
index[1] = generator.choice(len_q, size=1, p=q)
- T = emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False).tocsr()
+ T = nx.tocsr(emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False))
best_gw_dist_estimated = np.inf
for cpt in range(max_iter):
index[0] = generator.choice(len_p, size=1, p=p)
- T_index0 = T[index[0], :].toarray()[0]
+ T_index0 = nx.reshape(nx.todense(T[index[0], :]), (-1,))
index[1] = generator.choice(len_q, size=1, p=T_index0 / T_index0.sum())
if alpha == 1:
- T = emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False).tocsr()
+ T = nx.tocsr(
+ emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False)
+ )
else:
- new_T = emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False).tocsr()
+ new_T = nx.tocsr(
+ emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False)
+ )
T = (1 - alpha) * T + alpha * new_T
- # To limit the number of non 0, the values bellow the threshold are set to 0.
- T.data[T.data < threshold_plan] = 0
- T.eliminate_zeros()
+            # To limit the number of nonzero entries, the values below the threshold are set to 0.
+ T = nx.eliminate_zeros(T, threshold=threshold_plan)
if cpt % 10 == 0 or cpt == (max_iter - 1):
- gw_dist_estimated = GW_distance_estimation(C1=C1, C2=C2, loss_fun=loss_fun,
- p=p, q=q, T=T, std=False, random_state=generator)
+ gw_dist_estimated = GW_distance_estimation(
+ C1=C1, C2=C2, loss_fun=loss_fun,
+ p=p, q=q, T=T, std=False, random_state=generator
+ )
if gw_dist_estimated < best_gw_dist_estimated:
best_gw_dist_estimated = gw_dist_estimated
- best_T = T.copy()
+ best_T = nx.copy(T)
if verbose:
if cpt % 200 == 0:
@@ -791,9 +883,10 @@ def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun,
if log:
log = {}
- log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(C1=C1, C2=C2, loss_fun=loss_fun,
- p=p, q=q, T=best_T,
- random_state=generator)
+ log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(
+ C1=C1, C2=C2, loss_fun=loss_fun,
+ p=p, q=q, T=best_T, random_state=generator
+ )
return best_T, log
return best_T
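For context, a hedged sketch of how the solver above is called (mirroring the updated `test_pointwise_gromov` further down in this patch); the returned plan is stored as a sparse csr tensor, so it is densified through the backend before inspection::

    import numpy as np
    import ot
    from ot.backend import NumpyBackend

    rng = np.random.RandomState(42)
    xs, xt = rng.randn(30, 2), rng.randn(30, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1, C2 = C1 / C1.max(), C2 / C2.max()
    p, q = ot.unif(30), ot.unif(30)

    def loss(x, y):
        return np.abs(x - y)

    T, log = ot.gromov.pointwise_gromov_wasserstein(
        C1, C2, p, q, loss, max_iter=100, log=True, random_state=42)
    T = NumpyBackend().todense(T)  # the plan is built as a sparse csr matrix
    print(log['gw_dist_estimated'], log['gw_dist_std'])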
@@ -802,71 +895,70 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
nb_samples_grad=100, epsilon=1, max_iter=500, log=False, verbose=False,
random_state=None):
r"""
- Returns the gromov-wasserstein transport between (C1,p) and (C2,q) using a 1-stochastic Frank-Wolfe.
- This method as a O(max_iter \times Nlog(N)) time complexity by relying on the 1D Optimal Transport solver.
-
- The function solves the following optimization problem:
-
- .. math::
- GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
-
- s.t. T 1 = p
-
- T^T 1= q
-
- T\geq 0
-
- Where :
-
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
-
- Parameters
- ----------
- C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
- Distribution in the source space
- q : ndarray, shape (nt,)
- Distribution in the target space
- loss_fun : function: \mathcal{R} \times \mathcal{R} \shortarrow \mathcal{R}
- Loss function used for the distance, the transport plan does not depend on the loss function
- nb_samples_grad : int
- Number of samples to approximate the gradient
- epsilon : float
- Weight of the Kullback-Leiber regularization
- max_iter : int, optional
- Max number of iterations
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- Gives the distance estimated and the standard deviation
- random_state : int or RandomState instance, optional
- Fix the seed for to allow reproducibility
-
- Returns
- -------
- T : ndarray, shape (ns, nt)
- Optimal coupling between the two spaces
-
- References
- ----------
- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
- "Sampled Gromov Wasserstein."
- Machine Learning Journal (MLJ). 2021.
-
- """
- C1 = np.asarray(C1, dtype=np.float64)
- C2 = np.asarray(C2, dtype=np.float64)
- p = np.asarray(p, dtype=np.float64)
- q = np.asarray(q, dtype=np.float64)
- len_p = len(p)
- len_q = len(q)
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a 1-stochastic Frank-Wolfe.
+ This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times N \log(N))` time complexity by relying on the 1D Optimal Transport solver.
+
+ The function solves the following optimization problem:
+
+ .. math::
+ \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+
+ s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
+
+ \mathbf{T}^T \mathbf{1} &= \mathbf{q}
+
+ \mathbf{T} &\geq 0
+
+ Where :
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
+
+ Parameters
+ ----------
+ C1 : array-like, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
+ Distribution in the source space
+ q : array-like, shape (nt,)
+ Distribution in the target space
+ loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
+        Loss function used for the distance; the transport plan does not depend on the loss function
+ nb_samples_grad : int
+ Number of samples to approximate the gradient
+ epsilon : float
+ Weight of the Kullback-Leibler regularization
+ max_iter : int, optional
+ Max number of iterations
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+        If True, also returns a log dictionary with the estimated distance and its standard deviation
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
+
+ Returns
+ -------
+ T : array-like, shape (`ns`, `nt`)
+ Optimal coupling between the two spaces
+
+ References
+ ----------
+ .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
+ "Sampled Gromov Wasserstein."
+ Machine Learning Journal (MLJ). 2021.
+
+ """
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
+
+ len_p = p.shape[0]
+ len_q = q.shape[0]
generator = check_random_state(random_state)
@@ -880,12 +972,12 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad, 1
else:
nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad
- T = np.outer(p, q)
+ T = nx.outer(p, q)
     # continue_loop allows stopping the loop if there are several successive small modifications of T.
continue_loop = 0
# The gradient of GW is more complex if the two matrices are not symmetric.
- C_are_symmetric = np.allclose(C1, C1.T, rtol=1e-10, atol=1e-10) and np.allclose(C2, C2.T, rtol=1e-10, atol=1e-10)
+ C_are_symmetric = nx.allclose(C1, C1.T, rtol=1e-10, atol=1e-10) and nx.allclose(C2, C2.T, rtol=1e-10, atol=1e-10)
for cpt in range(max_iter):
index0 = generator.choice(len_p, size=nb_samples_grad_p, p=p, replace=False)
@@ -893,28 +985,30 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
for i, index0_i in enumerate(index0):
index1 = generator.choice(len_q,
size=nb_samples_grad_q,
- p=T[index0_i, :] / T[index0_i, :].sum(),
+ p=T[index0_i, :] / nx.sum(T[index0_i, :]),
replace=False)
             # If the matrices C are not symmetric, the gradient has 2 terms, thus one term is chosen randomly.
if (not C_are_symmetric) and generator.rand(1) > 0.5:
- Lik += np.mean(loss_fun(np.expand_dims(C1[:, np.repeat(index0[i], nb_samples_grad_q)], 1),
- np.expand_dims(C2[:, index1], 0)),
- axis=2)
+ Lik += nx.mean(loss_fun(
+ C1[:, [index0[i]] * nb_samples_grad_q][:, None, :],
+ C2[:, index1][None, :, :]
+ ), axis=2)
else:
- Lik += np.mean(loss_fun(np.expand_dims(C1[np.repeat(index0[i], nb_samples_grad_q), :], 2),
- np.expand_dims(C2[index1, :], 1)),
- axis=0)
+ Lik += nx.mean(loss_fun(
+ C1[[index0[i]] * nb_samples_grad_q, :][:, :, None],
+ C2[index1, :][:, None, :]
+ ), axis=0)
- max_Lik = np.max(Lik)
+ max_Lik = nx.max(Lik)
if max_Lik == 0:
continue
# This division by the max is here to facilitate the choice of epsilon.
Lik /= max_Lik
if epsilon > 0:
- # Set to infinity all the numbers bellow exp(-200) to avoid log of 0.
- log_T = np.log(np.clip(T, np.exp(-200), 1))
- log_T[log_T == -200] = -np.inf
+            # Set the log to -inf for all the numbers below exp(-200) to avoid log of 0.
+ log_T = nx.log(nx.clip(T, np.exp(-200), 1))
+ log_T = nx.where(log_T == -200, -np.inf, log_T)
Lik = Lik - epsilon * log_T
try:
@@ -925,11 +1019,11 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
else:
new_T = emd(a=p, b=q, M=Lik)
- change_T = ((T - new_T) ** 2).mean()
+ change_T = nx.mean((T - new_T) ** 2)
if change_T <= 10e-20:
continue_loop += 1
             if continue_loop > 100: # Max number of small modifications of T
- T = new_T.copy()
+ T = nx.copy(new_T)
break
else:
continue_loop = 0
@@ -938,12 +1032,14 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
if cpt % 200 == 0:
print('{:5s}|{:12s}'.format('It.', '||T_n - T_{n+1}||') + '\n' + '-' * 19)
print('{:5d}|{:8e}|'.format(cpt, change_T))
- T = new_T.copy()
+ T = nx.copy(new_T)
if log:
log = {}
- log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(C1=C1, C2=C2, loss_fun=loss_fun,
- p=p, q=q, T=T, random_state=generator)
+ log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(
+ C1=C1, C2=C2, loss_fun=loss_fun,
+ p=p, q=q, T=T, random_state=generator
+ )
return T, log
return T
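Likewise, a small sketch of the 1-stochastic Frank-Wolfe solver above, following the call pattern of the updated `test_sampled_gromov`; `epsilon` weights the Kullback-Leibler regularization described in the docstring::

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    xs, xt = rng.randn(30, 2), rng.randn(30, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1, C2 = C1 / C1.max(), C2 / C2.max()
    p, q = ot.unif(30), ot.unif(30)

    def loss(x, y):
        return np.abs(x - y)

    T, log = ot.gromov.sampled_gromov_wasserstein(
        C1, C2, p, q, loss, max_iter=100, epsilon=1, log=True, random_state=42)
    print(log['gw_dist_estimated'], log['gw_dist_std'])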
@@ -951,38 +1047,37 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
max_iter=1000, tol=1e-9, verbose=False, log=False):
r"""
- Returns the gromov-wasserstein transport between (C1,p) and (C2,q)
-
- (C1,p) and (C2,q)
+ Returns the gromov-wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
- GW = arg\min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T}))
- s.t. T 1 = p
+ s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
- T^T 1= q
+ \mathbf{T}^T \mathbf{1} &= \mathbf{q}
- T\geq 0
+ \mathbf{T} &\geq 0
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
- - H : entropy
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
+ - `H`: entropy
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : string
Loss function used for the solver either 'square_loss' or 'kl_loss'
@@ -999,21 +1094,20 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
Returns
-------
- T : ndarray, shape (ns, nt)
+ T : array-like, shape (`ns`, `nt`)
Optimal coupling between the two spaces
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
+ C1, C2, p, q = list_to_array(C1, C2, p, q)
+ nx = get_backend(C1, C2, p, q)
- C1 = np.asarray(C1, dtype=np.float64)
- C2 = np.asarray(C2, dtype=np.float64)
-
- T = np.outer(p, q) # Initialization
+ T = nx.outer(p, q)
constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
@@ -1035,7 +1129,7 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
if cpt % 10 == 0:
             # we can speed up the process by checking for the error only every
             # 10th iteration
- err = np.linalg.norm(T - Tprev)
+ err = nx.norm(T - Tprev)
if log:
log['err'].append(err)
@@ -1058,32 +1152,31 @@ def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
max_iter=1000, tol=1e-9, verbose=False, log=False):
r"""
- Returns the entropic gromov-wasserstein discrepancy between the two measured similarity matrices
-
- (C1,p) and (C2,q)
+ Returns the entropic gromov-wasserstein discrepancy between the two measured similarity matrices :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
The function solves the following optimization problem:
.. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}-\epsilon(H(T))
+ GW = \min_\mathbf{T} \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T}))
Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
- - H : entropy
+
+ - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+ - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+ - :math:`\mathbf{p}`: distribution in the source space
+ - :math:`\mathbf{q}`: distribution in the target space
+ - `L`: loss function to account for the misfit between the similarity matrices
+ - `H`: entropy
Parameters
----------
- C1 : ndarray, shape (ns, ns)
+ C1 : array-like, shape (ns, ns)
Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric costfr matrix in the target space
- p : ndarray, shape (ns,)
+ C2 : array-like, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : array-like, shape (ns,)
Distribution in the source space
- q : ndarray, shape (nt,)
+ q : array-like, shape (nt,)
Distribution in the target space
loss_fun : str
Loss function used for the solver either 'square_loss' or 'kl_loss'
@@ -1105,7 +1198,7 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
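A brief usage sketch of the two entropic solvers documented above (editor's illustration based on the updated `test_entropic_gromov`); any backend array accepted by `get_backend` can be passed in place of the numpy inputs::

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    xs, xt = rng.randn(30, 2), rng.randn(30, 2)
    C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
    C1, C2 = C1 / C1.max(), C2 / C2.max()
    p, q = ot.unif(30), ot.unif(30)

    # Regularized coupling with the square loss
    G = ot.gromov.entropic_gromov_wasserstein(
        C1, C2, p, q, 'square_loss', epsilon=5e-4, verbose=True)
    # Regularized discrepancy with the kl loss; the coupling is returned in the log
    gw, log = ot.gromov.entropic_gromov_wasserstein2(
        C1, C2, p, q, 'kl_loss', epsilon=1e-2, log=True)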
@@ -1122,40 +1215,39 @@ def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
- max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None):
+ max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None):
r"""
- Returns the gromov-wasserstein barycenters of S measured similarity matrices
-
- (Cs)_{s=1}^{s=S}
+ Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
The function solves the following optimization problem:
.. math::
- C = argmin_{C\in R^{NxN}} \sum_s \lambda_s GW(C,C_s,p,p_s)
+ \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
Where :
- - :math:`C_s` : metric cost matrix
- - :math:`p_s` : distribution
+ - :math:`\mathbf{C}_s`: metric cost matrix
+ - :math:`\mathbf{p}_s`: distribution
Parameters
----------
N : int
Size of the targeted barycenter
- Cs : list of S np.ndarray of shape (ns,ns)
+ Cs : list of S array-like of shape (ns,ns)
Metric cost matrices
- ps : list of S np.ndarray of shape (ns,)
- Sample weights in the S spaces
- p : ndarray, shape(N,)
+ ps : list of S array-like of shape (ns,)
+ Sample weights in the `S` spaces
+ p : array-like, shape(N,)
Weights in the targeted barycenter
lambdas : list of float
- List of the S spaces' weights.
+ List of the `S` spaces' weights.
loss_fun : callable
Tensor-matrix multiplication function based on specific loss function.
update : callable
- function(p,lambdas,T,Cs) that updates C according to a specific Kernel
- with the S Ts couplings calculated at each iteration
+ function(:math:`\mathbf{p}`, lambdas, :math:`\mathbf{T}`, :math:`\mathbf{Cs}`) that updates
+ :math:`\mathbf{C}` according to a specific Kernel with the `S` :math:`\mathbf{T}_s` couplings
+ calculated at each iteration
epsilon : float
Regularization term >0
max_iter : int, optional
@@ -1166,32 +1258,36 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
Print information along iterations.
log : bool, optional
Record log if True.
- init_C : bool | ndarray, shape (N, N)
- Random initial value for the C matrix provided by user.
+ init_C : bool | array-like, shape (N, N)
+ Random initial value for the :math:`\mathbf{C}` matrix provided by user.
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
Returns
-------
- C : ndarray, shape (N, N)
+ C : array-like, shape (`N`, `N`)
        Similarity matrix in the barycenter space (permuted arbitrarily)
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
+ Cs = list_to_array(*Cs)
+ ps = list_to_array(*ps)
+ p = list_to_array(p)
+ nx = get_backend(*Cs, *ps, p)
S = len(Cs)
- Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
- lambdas = np.asarray(lambdas, dtype=np.float64)
-
# Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
- # XXX use random state
- xalea = np.random.randn(N, 2)
+ generator = check_random_state(random_state)
+ xalea = generator.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
+ C = nx.from_numpy(C, type_as=p)
else:
C = init_C
@@ -1214,7 +1310,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
if cpt % 10 == 0:
             # we can speed up the process by checking for the error only every
             # 10th iteration
- err = np.linalg.norm(C - Cprev)
+ err = nx.norm(C - Cprev)
error.append(err)
if log:
@@ -1232,38 +1328,39 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
- max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None):
+ max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None):
r"""
- Returns the gromov-wasserstein barycenters of S measured similarity matrices
-
- (Cs)_{s=1}^{s=S}
+ Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
- The function solves the following optimization problem with block
- coordinate descent:
+ The function solves the following optimization problem with block coordinate descent:
.. math::
- C = argmin_C\in R^NxN \sum_s \lambda_s GW(C,Cs,p,ps)
+
+ \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
Where :
- - Cs : metric cost matrix
- - ps : distribution
+ - :math:`\mathbf{C}_s`: metric cost matrix
+ - :math:`\mathbf{p}_s`: distribution
Parameters
----------
N : int
Size of the targeted barycenter
- Cs : list of S np.ndarray of shape (ns, ns)
+ Cs : list of S array-like of shape (ns, ns)
Metric cost matrices
- ps : list of S np.ndarray of shape (ns,)
- Sample weights in the S spaces
- p : ndarray, shape (N,)
+ ps : list of S array-like of shape (ns,)
+ Sample weights in the `S` spaces
+ p : array-like, shape (N,)
Weights in the targeted barycenter
lambdas : list of float
- List of the S spaces' weights
- loss_fun : tensor-matrix multiplication function based on specific loss function
- update : function(p,lambdas,T,Cs) that updates C according to a specific Kernel
- with the S Ts couplings calculated at each iteration
+ List of the `S` spaces' weights
+ loss_fun : callable
+ tensor-matrix multiplication function based on specific loss function
+ update : callable
+ function(:math:`\mathbf{p}`, lambdas, :math:`\mathbf{T}`, :math:`\mathbf{Cs}`) that updates
+ :math:`\mathbf{C}` according to a specific Kernel with the `S` :math:`\mathbf{T}_s` couplings
+ calculated at each iteration
max_iter : int, optional
Max number of iterations
tol : float, optional
@@ -1272,32 +1369,37 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
Print information along iterations.
log : bool, optional
Record log if True.
- init_C : bool | ndarray, shape(N,N)
- Random initial value for the C matrix provided by user.
+ init_C : bool | array-like, shape(N,N)
+ Random initial value for the :math:`\mathbf{C}` matrix provided by user.
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
Returns
-------
- C : ndarray, shape (N, N)
+ C : array-like, shape (`N`, `N`)
        Similarity matrix in the barycenter space (permuted arbitrarily)
References
----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
"Gromov-Wasserstein averaging of kernel and distance matrices."
International Conference on Machine Learning (ICML). 2016.
"""
- S = len(Cs)
+ Cs = list_to_array(*Cs)
+ ps = list_to_array(*ps)
+ p = list_to_array(p)
+ nx = get_backend(*Cs, *ps, p)
- Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
- lambdas = np.asarray(lambdas, dtype=np.float64)
+ S = len(Cs)
# Initialization of C : random SPD matrix (if not provided by user)
if init_C is None:
- # XXX : should use a random state and not use the global seed
- xalea = np.random.randn(N, 2)
+ generator = check_random_state(random_state)
+ xalea = generator.randn(N, 2)
C = dist(xalea, xalea)
C /= C.max()
+ C = nx.from_numpy(C, type_as=p)
else:
C = init_C
@@ -1320,7 +1422,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
if cpt % 10 == 0:
             # we can speed up the process by checking for the error only every
             # 10th iteration
- err = np.linalg.norm(C - Cprev)
+ err = nx.norm(C - Cprev)
error.append(err)
if log:
@@ -1339,21 +1441,21 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False,
p=None, loss_fun='square_loss', max_iter=100, tol=1e-9,
- verbose=False, log=False, init_C=None, init_X=None):
- """Compute the fgw barycenter as presented eq (5) in [24].
+ verbose=False, log=False, init_C=None, init_X=None, random_state=None):
+ """Compute the fgw barycenter as presented eq (5) in :ref:`[24] <references-fgw-barycenters>`
Parameters
----------
- N : integer
+ N : int
Desired number of samples of the target barycenter
- Ys: list of ndarray, each element has shape (ns,d)
+    Ys : list of array-like, each element has shape (ns,d)
Features of all samples
- Cs : list of ndarray, each element has shape (ns,ns)
+ Cs : list of array-like, each element has shape (ns,ns)
Structure matrices of all samples
- ps : list of ndarray, each element has shape (ns,)
+ ps : list of array-like, each element has shape (ns,)
Masses of all samples.
lambdas : list of float
- List of the S spaces' weights
+ List of the `S` spaces' weights
alpha : float
Alpha parameter for the fgw distance
fixed_structure : bool
@@ -1370,41 +1472,46 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
Print information along iterations.
log : bool, optional
Record log if True.
- init_C : ndarray, shape (N,N), optional
+ init_C : array-like, shape (N,N), optional
Initialization for the barycenters' structure matrix. If not set
a random init is used.
- init_X : ndarray, shape (N,d), optional
+ init_X : array-like, shape (N,d), optional
Initialization for the barycenters' features. If not set a
random init is used.
+ random_state : int or RandomState instance, optional
+ Fix the seed for reproducibility
Returns
-------
- X : ndarray, shape (N, d)
+ X : array-like, shape (`N`, `d`)
Barycenters' features
- C : ndarray, shape (N, N)
+ C : array-like, shape (`N`, `N`)
Barycenters' structure matrix
- log_: dict
+ log : dict
Only returned when log=True. It contains the keys:
- T : list of (N,ns) transport matrices
- Ms : all distance matrices between the feature of the barycenter and the
- other features dist(X,Ys) shape (N,ns)
+ - :math:`\mathbf{T}`: list of (`N`, `ns`) transport matrices
+        - :math:`(\mathbf{M}_s)_s`: all distance matrices between the features of the barycenter and the other features :math:`(dist(\mathbf{X}, \mathbf{Y}_s))_s`, shape (`N`, `ns`)
+
+
+ .. _references-fgw-barycenters:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
+ Cs = list_to_array(*Cs)
+ ps = list_to_array(*ps)
+ Ys = list_to_array(*Ys)
+ p = list_to_array(p)
+ nx = get_backend(*Cs, *Ys, *ps)
+
S = len(Cs)
     d = Ys[0].shape[1]  # dimension of the node features
if p is None:
- p = np.ones(N) / N
-
- Cs = [np.asarray(Cs[s], dtype=np.float64) for s in range(S)]
- Ys = [np.asarray(Ys[s], dtype=np.float64) for s in range(S)]
-
- lambdas = np.asarray(lambdas, dtype=np.float64)
+ p = nx.ones(N, type_as=Cs[0]) / N
if fixed_structure:
if init_C is None:
@@ -1413,8 +1520,10 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
C = init_C
else:
if init_C is None:
- xalea = np.random.randn(N, 2)
+ generator = check_random_state(random_state)
+ xalea = generator.randn(N, 2)
C = dist(xalea, xalea)
+ C = nx.from_numpy(C, type_as=ps[0])
else:
C = init_C
@@ -1425,13 +1534,13 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
X = init_X
else:
if init_X is None:
- X = np.zeros((N, d))
+ X = nx.zeros((N, d), type_as=ps[0])
else:
X = init_X
- T = [np.outer(p, q) for q in ps]
+ T = [nx.outer(p, q) for q in ps]
- Ms = [np.asarray(dist(X, Ys[s]), dtype=np.float64) for s in range(len(Ys))] # Ms is N,ns
+ Ms = [dist(X, Ys[s]) for s in range(len(Ys))]
cpt = 0
err_feature = 1
@@ -1451,20 +1560,19 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
Ys_temp = [y.T for y in Ys]
X = update_feature_matrix(lambdas, Ys_temp, T, p).T
- Ms = [np.asarray(dist(X, Ys[s]), dtype=np.float64) for s in range(len(Ys))]
+ Ms = [dist(X, Ys[s]) for s in range(len(Ys))]
if not fixed_structure:
if loss_fun == 'square_loss':
T_temp = [t.T for t in T]
- C = update_sructure_matrix(p, lambdas, T_temp, Cs)
+ C = update_structure_matrix(p, lambdas, T_temp, Cs)
T = [fused_gromov_wasserstein(Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
# T is N,ns
- err_feature = np.linalg.norm(X - Xprev.reshape(N, d))
- err_structure = np.linalg.norm(C - Cprev)
-
+ err_feature = nx.norm(X - nx.reshape(Xprev, (N, d)))
+ err_structure = nx.norm(C - Cprev)
if log:
log_['err_feature'].append(err_feature)
log_['err_structure'].append(err_structure)
@@ -1490,64 +1598,80 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_
return X, C
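To make the barycenter signature above concrete, a hedged sketch following the updated `test_fgw_barycenter` (the feature list `Ys`, the structure list `Cs`, the mass list `ps`, the weights `lambdas` and `alpha` are positional)::

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    ns, nt, n_samples = 20, 30, 3
    ys, yt = rng.randn(ns, 2), rng.randn(nt, 2)                     # node features
    C1, C2 = ot.dist(rng.randn(ns, 2)), ot.dist(rng.randn(nt, 2))   # structure matrices
    p1, p2, p = ot.unif(ns), ot.unif(nt), ot.unif(n_samples)

    X, C = ot.gromov.fgw_barycenters(
        n_samples, [ys, yt], [C1, C2], [p1, p2], [.5, .5], 0.5,
        fixed_structure=False, fixed_features=False, p=p,
        loss_fun='square_loss', max_iter=100, tol=1e-3, random_state=12345)
    # X: (n_samples, d) barycenter features, C: (n_samples, n_samples) structure matrix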
-def update_sructure_matrix(p, lambdas, T, Cs):
- """Updates C according to the L2 Loss kernel with the S Ts couplings.
+def update_structure_matrix(p, lambdas, T, Cs):
+ """Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s` couplings.
It is calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
Masses in the targeted barycenter.
lambdas : list of float
- List of the S spaces' weights.
- T : list of S ndarray of shape (ns, N)
- The S Ts couplings calculated at each iteration.
- Cs : list of S ndarray, shape (ns, ns)
- Metric cost matrices.
+ List of the `S` spaces' weights.
+ T : list of S array-like of shape (ns, N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
+ Cs : list of S array-like, shape (ns, ns)
+ Metric cost matrices.
Returns
-------
- C : ndarray, shape (nt, nt)
- Updated C matrix.
+ C : array-like, shape (`nt`, `nt`)
+ Updated :math:`\mathbf{C}` matrix.
"""
- tmpsum = sum([lambdas[s] * np.dot(T[s].T, Cs[s]).dot(T[s]) for s in range(len(T))])
- ppt = np.outer(p, p)
+ p = list_to_array(p)
+ T = list_to_array(*T)
+ Cs = list_to_array(*Cs)
+ nx = get_backend(*Cs, *T, p)
- return np.divide(tmpsum, ppt)
+ tmpsum = sum([
+ lambdas[s] * nx.dot(
+ nx.dot(T[s].T, Cs[s]),
+ T[s]
+ ) for s in range(len(T))
+ ])
+ ppt = nx.outer(p, p)
+ return tmpsum / ppt
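Spelled out, the L2 update implemented just above is (a restatement of the code for readability, not an addition to the patch):

.. math::
    \mathbf{C} = \frac{\sum_s \lambda_s \mathbf{T}_s^\top \mathbf{C}_s \mathbf{T}_s}{\mathbf{p} \mathbf{p}^\top}

where the division is taken entrywise.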
def update_feature_matrix(lambdas, Ys, Ts, p):
- """Updates the feature with respect to the S Ts couplings.
+ """Updates the feature with respect to the `S` :math:`\mathbf{T}_s` couplings.
See "Solving the barycenter problem with Block Coordinate Descent (BCD)"
- in [24] calculated at each iteration
+ in :ref:`[24] <references-update-feature-matrix>` calculated at each iteration
Parameters
----------
- p : ndarray, shape (N,)
+ p : array-like, shape (N,)
masses in the targeted barycenter
lambdas : list of float
- List of the S spaces' weights
- Ts : list of S np.ndarray(ns,N)
- the S Ts couplings calculated at each iteration
- Ys : list of S ndarray, shape(d,ns)
+ List of the `S` spaces' weights
+ Ts : list of S array-like, shape (ns,N)
+ The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration
+ Ys : list of S array-like, shape (d,ns)
The features.
Returns
-------
- X : ndarray, shape (d, N)
+ X : array-like, shape (`d`, `N`)
+
+ .. _references-update-feature-matrix:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain
- and Courty Nicolas
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
- p = np.array(1. / p).reshape(-1,)
-
- tmpsum = sum([lambdas[s] * np.dot(Ys[s], Ts[s].T) * p[None, :] for s in range(len(Ts))])
-
+ p = list_to_array(p)
+ Ts = list_to_array(*Ts)
+ Ys = list_to_array(*Ys)
+ nx = get_backend(*Ys, *Ts, p)
+
+ p = 1. / p
+ tmpsum = sum([
+ lambdas[s] * nx.dot(Ys[s], Ts[s].T) * p[None, :]
+ for s in range(len(Ts))
+ ])
return tmpsum
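Similarly, the feature update computed above amounts to (editor's restatement of the code):

.. math::
    \mathbf{X} = \left(\sum_s \lambda_s \mathbf{Y}_s \mathbf{T}_s^\top\right) \operatorname{diag}\left(\frac{1}{\mathbf{p}}\right)

i.e. each column of the weighted sum is divided by the corresponding barycenter mass.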
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index c6757d1..4e95ccf 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -691,10 +691,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the
transportation matrix)
"""
- a = np.asarray(a, dtype=np.float64)
- b = np.asarray(b, dtype=np.float64)
- x_a = np.asarray(x_a, dtype=np.float64)
- x_b = np.asarray(x_b, dtype=np.float64)
+ a, b, x_a, x_b = list_to_array(a, b, x_a, x_b)
+ nx = get_backend(x_a, x_b)
assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \
"emd_1d should only be used with monodimensional data"
@@ -702,27 +700,43 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
"emd_1d should only be used with monodimensional data"
# if empty array given then use uniform distributions
- if a.ndim == 0 or len(a) == 0:
- a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0]
- if b.ndim == 0 or len(b) == 0:
- b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]
+ if a is None or a.ndim == 0 or len(a) == 0:
+ a = nx.ones((x_a.shape[0],), type_as=x_a) / x_a.shape[0]
+ if b is None or b.ndim == 0 or len(b) == 0:
+ b = nx.ones((x_b.shape[0],), type_as=x_b) / x_b.shape[0]
# ensure that same mass
- np.testing.assert_almost_equal(a.sum(0),b.sum(0),err_msg='a and b vector must have the same sum')
- b=b*a.sum()/b.sum()
-
- x_a_1d = x_a.reshape((-1,))
- x_b_1d = x_b.reshape((-1,))
- perm_a = np.argsort(x_a_1d)
- perm_b = np.argsort(x_b_1d)
-
- G_sorted, indices, cost = emd_1d_sorted(a[perm_a], b[perm_b],
- x_a_1d[perm_a], x_b_1d[perm_b],
- metric=metric, p=p)
- G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
- shape=(a.shape[0], b.shape[0]))
+ np.testing.assert_almost_equal(
+ nx.sum(a, axis=0),
+ nx.sum(b, axis=0),
+ err_msg='a and b vector must have the same sum'
+ )
+ b = b * nx.sum(a) / nx.sum(b)
+
+ x_a_1d = nx.reshape(x_a, (-1,))
+ x_b_1d = nx.reshape(x_b, (-1,))
+ perm_a = nx.argsort(x_a_1d)
+ perm_b = nx.argsort(x_b_1d)
+
+ G_sorted, indices, cost = emd_1d_sorted(
+ nx.to_numpy(a[perm_a]),
+ nx.to_numpy(b[perm_b]),
+ nx.to_numpy(x_a_1d[perm_a]),
+ nx.to_numpy(x_b_1d[perm_b]),
+ metric=metric, p=p
+ )
+
+ G = nx.coo_matrix(
+ G_sorted,
+ perm_a[indices[:, 0]],
+ perm_b[indices[:, 1]],
+ shape=(a.shape[0], b.shape[0]),
+ type_as=x_a
+ )
if dense:
- G = G.toarray()
+ G = nx.todense(G)
+ elif str(nx) == "jax":
+ warnings.warn("JAX does not support sparse matrices, converting to dense")
if log:
log = {'cost': cost}
return G, log
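The new sparse entry points used above (`nx.coo_matrix`, `nx.todense`, `nx.issparse`) follow the same calling convention across backends; a hedged sketch with the numpy backend, mirroring the checks added to `test_backend.py` below (it assumes `get_backend` accepts a single array)::

    import numpy as np
    from ot.backend import get_backend

    data = np.array([4., 5., 7., 9.])
    rows = np.array([0, 3, 1, 0])
    cols = np.array([0, 3, 1, 2])

    nx = get_backend(data)  # NumpyBackend here; torch tensors work the same way
    S = nx.coo_matrix(data, rows, cols, shape=(4, 4), type_as=data)
    D = nx.todense(S)       # dense (4, 4) array
    assert nx.issparse(S) or nx.__name__ == "jax"  # jax has no sparse support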
diff --git a/ot/optim.py b/ot/optim.py
index 34cbb17..6456c03 100644
--- a/ot/optim.py
+++ b/ot/optim.py
@@ -23,7 +23,7 @@ def line_search_armijo(f, xk, pk, gfk, old_fval,
r"""
Armijo linesearch function that works with matrices
- Find an approximate minimum of :math:`f(x_k + \\alpha \cdot p_k)` that satisfies the
+ Find an approximate minimum of :math:`f(x_k + \alpha \cdot p_k)` that satisfies the
armijo conditions.
Parameters
@@ -129,7 +129,7 @@ def solve_linesearch(cost, G, deltaG, Mi, f_val,
.. _references-solve-linesearch:
References
----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary R{\'e}mi, Tavenard Romain and Courty Nicolas
+ .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas
"Optimal Transport for structured data with application on graphs"
International Conference on Machine Learning (ICML). 2019.
"""
@@ -162,13 +162,13 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, numItermaxEmd=100000,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + \mathrm{reg} \cdot f(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg} \cdot f(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
@@ -309,13 +309,13 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10,
The function solves the following optimization problem:
.. math::
- \gamma = arg\min_\gamma <\gamma,M>_F + \mathrm{reg_1}\cdot\Omega(\gamma) + \mathrm{reg_2}\cdot f(\gamma)
+ \gamma = \mathop{\arg \min}_\gamma <\gamma, \mathbf{M}>_F + \mathrm{reg_1}\cdot\Omega(\gamma) + \mathrm{reg_2}\cdot f(\gamma)
- s.t. \ \gamma 1 = a
+ s.t. \ \gamma \mathbf{1} &= \mathbf{a}
- \gamma^T 1= b
+ \gamma^T \mathbf{1} &= \mathbf{b}
- \gamma\geq 0
+ \gamma &\geq 0
where :
- :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
@@ -452,7 +452,7 @@ def solve_1d_linesearch_quad(a, b, c):
.. math::
- arg\min_{0 \leq x \leq 1} f(x) = ax^{2} + bx + c
+ \mathop{\arg \min}_{0 \leq x \leq 1} f(x) = ax^{2} + bx + c
Parameters
----------
diff --git a/test/test_backend.py b/test/test_backend.py
index 5853282..0f11ace 100644
--- a/test/test_backend.py
+++ b/test/test_backend.py
@@ -207,6 +207,22 @@ def test_empty_backend():
nx.stack([M, M])
with pytest.raises(NotImplementedError):
nx.reshape(M, (5, 3, 2))
+ with pytest.raises(NotImplementedError):
+ nx.coo_matrix(M, M, M)
+ with pytest.raises(NotImplementedError):
+ nx.issparse(M)
+ with pytest.raises(NotImplementedError):
+ nx.tocsr(M)
+ with pytest.raises(NotImplementedError):
+ nx.eliminate_zeros(M)
+ with pytest.raises(NotImplementedError):
+ nx.todense(M)
+ with pytest.raises(NotImplementedError):
+ nx.where(M, M, M)
+ with pytest.raises(NotImplementedError):
+ nx.copy(M)
+ with pytest.raises(NotImplementedError):
+ nx.allclose(M, M)
def test_func_backends(nx):
@@ -216,6 +232,11 @@ def test_func_backends(nx):
v = rnd.randn(3)
val = np.array([1.0])
+ # Sparse tensors test
+ sp_row = np.array([0, 3, 1, 0, 3])
+ sp_col = np.array([0, 3, 1, 2, 2])
+ sp_data = np.array([4, 5, 7, 9, 0])
+
lst_tot = []
for nx in [ot.backend.NumpyBackend(), nx]:
@@ -229,6 +250,10 @@ def test_func_backends(nx):
vb = nx.from_numpy(v)
val = nx.from_numpy(val)
+ sp_rowb = nx.from_numpy(sp_row)
+ sp_colb = nx.from_numpy(sp_col)
+ sp_datab = nx.from_numpy(sp_data)
+
A = nx.set_gradients(val, v, v)
lst_b.append(nx.to_numpy(A))
lst_name.append('set_gradients')
@@ -438,6 +463,37 @@ def test_func_backends(nx):
lst_b.append(nx.to_numpy(A))
lst_name.append('reshape')
+ sp_Mb = nx.coo_matrix(sp_datab, sp_rowb, sp_colb, shape=(4, 4))
+ nx.todense(Mb)
+ lst_b.append(nx.to_numpy(nx.todense(sp_Mb)))
+ lst_name.append('coo_matrix')
+
+ assert not nx.issparse(Mb), 'Assert fail on: issparse (expected False)'
+ assert nx.issparse(sp_Mb) or nx.__name__ == "jax", 'Assert fail on: issparse (expected True)'
+
+ A = nx.tocsr(sp_Mb)
+ lst_b.append(nx.to_numpy(nx.todense(A)))
+ lst_name.append('tocsr')
+
+ A = nx.eliminate_zeros(nx.copy(sp_datab), threshold=5.)
+ lst_b.append(nx.to_numpy(A))
+ lst_name.append('eliminate_zeros (dense)')
+
+ A = nx.eliminate_zeros(sp_Mb)
+ lst_b.append(nx.to_numpy(nx.todense(A)))
+ lst_name.append('eliminate_zeros (sparse)')
+
+ A = nx.where(Mb >= nx.stack([nx.linspace(0, 1, 10)] * 3, axis=1), Mb, 0.0)
+ lst_b.append(nx.to_numpy(A))
+ lst_name.append('where')
+
+ A = nx.copy(Mb)
+ lst_b.append(nx.to_numpy(A))
+ lst_name.append('copy')
+
+ assert nx.allclose(Mb, Mb), 'Assert fail on: allclose (expected True)'
+ assert not nx.allclose(2 * Mb, Mb), 'Assert fail on: allclose (expected False)'
+
lst_tot.append(lst_b)
lst_np = lst_tot[0]
diff --git a/test/test_bregman.py b/test/test_bregman.py
index c1120ba..6923d31 100644
--- a/test/test_bregman.py
+++ b/test/test_bregman.py
@@ -477,8 +477,8 @@ def test_lazy_empirical_sinkhorn(nx):
b = ot.unif(n)
numIterMax = 1000
- X_s = np.reshape(np.arange(n), (n, 1))
- X_t = np.reshape(np.arange(0, n), (n, 1))
+ X_s = np.reshape(np.arange(n, dtype=np.float64), (n, 1))
+ X_t = np.reshape(np.arange(0, n, dtype=np.float64), (n, 1))
M = ot.dist(X_s, X_t)
M_m = ot.dist(X_s, X_t, metric='euclidean')
diff --git a/test/test_gromov.py b/test/test_gromov.py
index 0242d72..509c54d 100644
--- a/test/test_gromov.py
+++ b/test/test_gromov.py
@@ -8,11 +8,12 @@
import numpy as np
import ot
+from ot.backend import NumpyBackend
import pytest
-def test_gromov():
+def test_gromov(nx):
n_samples = 50 # nb samples
mu_s = np.array([0, 0])
@@ -31,37 +32,50 @@ def test_gromov():
C1 /= C1.max()
C2 /= C2.max()
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ pb = nx.from_numpy(p)
+ qb = nx.from_numpy(q)
+
G = ot.gromov.gromov_wasserstein(C1, C2, p, q, 'square_loss', verbose=True)
+ Gb = nx.to_numpy(ot.gromov.gromov_wasserstein(C1b, C2b, pb, qb, 'square_loss', verbose=True))
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p, G.sum(1), atol=1e-04) # cf convergence gromov
+ p, Gb.sum(1), atol=1e-04) # cf convergence gromov
np.testing.assert_allclose(
- q, G.sum(0), atol=1e-04) # cf convergence gromov
+ q, Gb.sum(0), atol=1e-04) # cf convergence gromov
Id = (1 / (1.0 * n_samples)) * np.eye(n_samples, n_samples)
- np.testing.assert_allclose(
- G, np.flipud(Id), atol=1e-04)
+ np.testing.assert_allclose(Gb, np.flipud(Id), atol=1e-04)
gw, log = ot.gromov.gromov_wasserstein2(C1, C2, p, q, 'kl_loss', log=True)
+ gwb, logb = ot.gromov.gromov_wasserstein2(C1b, C2b, pb, qb, 'kl_loss', log=True)
gw_val = ot.gromov.gromov_wasserstein2(C1, C2, p, q, 'kl_loss', log=False)
+ gw_valb = ot.gromov.gromov_wasserstein2(C1b, C2b, pb, qb, 'kl_loss', log=False)
G = log['T']
+ Gb = nx.to_numpy(logb['T'])
- np.testing.assert_allclose(gw, 0, atol=1e-1, rtol=1e-1)
+ np.testing.assert_allclose(gw, gwb, atol=1e-06)
+ np.testing.assert_allclose(gwb, 0, atol=1e-1, rtol=1e-1)
- np.testing.assert_allclose(gw, gw_val, atol=1e-1, rtol=1e-1) # cf log=False
+ np.testing.assert_allclose(gw_val, gw_valb, atol=1e-06)
+ np.testing.assert_allclose(gwb, gw_valb, atol=1e-1, rtol=1e-1) # cf log=False
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p, G.sum(1), atol=1e-04) # cf convergence gromov
+ p, Gb.sum(1), atol=1e-04) # cf convergence gromov
np.testing.assert_allclose(
- q, G.sum(0), atol=1e-04) # cf convergence gromov
+ q, Gb.sum(0), atol=1e-04) # cf convergence gromov
-def test_entropic_gromov():
+@pytest.skip_backend("jax", reason="test very slow with jax backend")
+def test_entropic_gromov(nx):
n_samples = 50 # nb samples
mu_s = np.array([0, 0])
@@ -80,30 +94,44 @@ def test_entropic_gromov():
C1 /= C1.max()
C2 /= C2.max()
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ pb = nx.from_numpy(p)
+ qb = nx.from_numpy(q)
+
G = ot.gromov.entropic_gromov_wasserstein(
C1, C2, p, q, 'square_loss', epsilon=5e-4, verbose=True)
+ Gb = nx.to_numpy(ot.gromov.entropic_gromov_wasserstein(
+ C1b, C2b, pb, qb, 'square_loss', epsilon=5e-4, verbose=True
+ ))
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p, G.sum(1), atol=1e-04) # cf convergence gromov
+ p, Gb.sum(1), atol=1e-04) # cf convergence gromov
np.testing.assert_allclose(
- q, G.sum(0), atol=1e-04) # cf convergence gromov
+ q, Gb.sum(0), atol=1e-04) # cf convergence gromov
gw, log = ot.gromov.entropic_gromov_wasserstein2(
C1, C2, p, q, 'kl_loss', epsilon=1e-2, log=True)
+ gwb, logb = ot.gromov.entropic_gromov_wasserstein2(
+ C1b, C2b, pb, qb, 'kl_loss', epsilon=1e-2, log=True)
G = log['T']
+ Gb = nx.to_numpy(logb['T'])
+ np.testing.assert_allclose(gw, gwb, atol=1e-06)
np.testing.assert_allclose(gw, 0, atol=1e-1, rtol=1e-1)
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p, G.sum(1), atol=1e-04) # cf convergence gromov
+ p, Gb.sum(1), atol=1e-04) # cf convergence gromov
np.testing.assert_allclose(
- q, G.sum(0), atol=1e-04) # cf convergence gromov
+ q, Gb.sum(0), atol=1e-04) # cf convergence gromov
-def test_pointwise_gromov():
+def test_pointwise_gromov(nx):
n_samples = 50 # nb samples
mu_s = np.array([0, 0])
@@ -122,33 +150,52 @@ def test_pointwise_gromov():
C1 /= C1.max()
C2 /= C2.max()
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ pb = nx.from_numpy(p)
+ qb = nx.from_numpy(q)
+
def loss(x, y):
return np.abs(x - y)
+ def lossb(x, y):
+ return nx.abs(x - y)
+
G, log = ot.gromov.pointwise_gromov_wasserstein(
C1, C2, p, q, loss, max_iter=100, log=True, verbose=True, random_state=42)
+ G = NumpyBackend().todense(G)
+ Gb, logb = ot.gromov.pointwise_gromov_wasserstein(
+ C1b, C2b, pb, qb, lossb, max_iter=100, log=True, verbose=True, random_state=42)
+ Gb = nx.to_numpy(nx.todense(Gb))
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p[:, np.newaxis], G.sum(1), atol=1e-04) # cf convergence gromov
+ p, Gb.sum(1), atol=1e-04) # cf convergence gromov
np.testing.assert_allclose(
- q[np.newaxis, :], G.sum(0), atol=1e-04) # cf convergence gromov
+ q, Gb.sum(0), atol=1e-04) # cf convergence gromov
- assert log['gw_dist_estimated'] == 0.0
- assert log['gw_dist_std'] == 0.0
+ np.testing.assert_allclose(logb['gw_dist_estimated'], 0.0, atol=1e-08)
+ np.testing.assert_allclose(logb['gw_dist_std'], 0.0, atol=1e-08)
G, log = ot.gromov.pointwise_gromov_wasserstein(
C1, C2, p, q, loss, max_iter=100, alpha=0.1, log=True, verbose=True, random_state=42)
+ G = NumpyBackend().todense(G)
+ Gb, logb = ot.gromov.pointwise_gromov_wasserstein(
+ C1b, C2b, pb, qb, lossb, max_iter=100, alpha=0.1, log=True, verbose=True, random_state=42)
+ Gb = nx.to_numpy(nx.todense(Gb))
- assert log['gw_dist_estimated'] == 0.10342276348494964
- assert log['gw_dist_std'] == 0.0015952535464736394
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
+ np.testing.assert_allclose(logb['gw_dist_estimated'], 0.10342276348494964, atol=1e-8)
+ np.testing.assert_allclose(logb['gw_dist_std'], 0.0015952535464736394, atol=1e-8)
-def test_sampled_gromov():
+@pytest.skip_backend("jax", reason="test very slow with jax backend")
+def test_sampled_gromov(nx):
n_samples = 50 # nb samples
- mu_s = np.array([0, 0])
- cov_s = np.array([[1, 0], [0, 1]])
+ mu_s = np.array([0, 0], dtype=np.float64)
+ cov_s = np.array([[1, 0], [0, 1]], dtype=np.float64)
xs = ot.datasets.make_2D_samples_gauss(n_samples, mu_s, cov_s, random_state=42)
@@ -163,23 +210,35 @@ def test_sampled_gromov():
C1 /= C1.max()
C2 /= C2.max()
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ pb = nx.from_numpy(p)
+ qb = nx.from_numpy(q)
+
def loss(x, y):
return np.abs(x - y)
+ def lossb(x, y):
+ return nx.abs(x - y)
+
G, log = ot.gromov.sampled_gromov_wasserstein(
C1, C2, p, q, loss, max_iter=100, epsilon=1, log=True, verbose=True, random_state=42)
+ Gb, logb = ot.gromov.sampled_gromov_wasserstein(
+ C1b, C2b, pb, qb, lossb, max_iter=100, epsilon=1, log=True, verbose=True, random_state=42)
+ Gb = nx.to_numpy(Gb)
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p, G.sum(1), atol=1e-04) # cf convergence gromov
+ p, Gb.sum(1), atol=1e-04) # cf convergence gromov
np.testing.assert_allclose(
- q, G.sum(0), atol=1e-04) # cf convergence gromov
+ q, Gb.sum(0), atol=1e-04) # cf convergence gromov
- assert log['gw_dist_estimated'] == 0.05679474884977278
- assert log['gw_dist_std'] == 0.0005986592106971995
+ np.testing.assert_allclose(logb['gw_dist_estimated'], 0.05679474884977278, atol=1e-08)
+ np.testing.assert_allclose(logb['gw_dist_std'], 0.0005986592106971995, atol=1e-08)
-def test_gromov_barycenter():
+def test_gromov_barycenter(nx):
ns = 10
nt = 20
@@ -188,26 +247,42 @@ def test_gromov_barycenter():
C1 = ot.dist(Xs)
C2 = ot.dist(Xt)
-
+ p1 = ot.unif(ns)
+ p2 = ot.unif(nt)
n_samples = 3
- Cb = ot.gromov.gromov_barycenters(n_samples, [C1, C2],
- [ot.unif(ns), ot.unif(nt)
- ], ot.unif(n_samples), [.5, .5],
- 'square_loss', # 5e-4,
- max_iter=100, tol=1e-3,
- verbose=True)
- np.testing.assert_allclose(Cb.shape, (n_samples, n_samples))
+ p = ot.unif(n_samples)
- Cb2 = ot.gromov.gromov_barycenters(n_samples, [C1, C2],
- [ot.unif(ns), ot.unif(nt)
- ], ot.unif(n_samples), [.5, .5],
- 'kl_loss', # 5e-4,
- max_iter=100, tol=1e-3)
- np.testing.assert_allclose(Cb2.shape, (n_samples, n_samples))
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ p1b = nx.from_numpy(p1)
+ p2b = nx.from_numpy(p2)
+ pb = nx.from_numpy(p)
+
+ Cb = ot.gromov.gromov_barycenters(
+ n_samples, [C1, C2], [p1, p2], p, [.5, .5],
+ 'square_loss', max_iter=100, tol=1e-3, verbose=True, random_state=42
+ )
+ Cbb = nx.to_numpy(ot.gromov.gromov_barycenters(
+ n_samples, [C1b, C2b], [p1b, p2b], pb, [.5, .5],
+ 'square_loss', max_iter=100, tol=1e-3, verbose=True, random_state=42
+ ))
+ np.testing.assert_allclose(Cb, Cbb, atol=1e-06)
+ np.testing.assert_allclose(Cbb.shape, (n_samples, n_samples))
+
+ Cb2 = ot.gromov.gromov_barycenters(
+ n_samples, [C1, C2], [p1, p2], p, [.5, .5],
+ 'kl_loss', max_iter=100, tol=1e-3, random_state=42
+ )
+ Cb2b = nx.to_numpy(ot.gromov.gromov_barycenters(
+ n_samples, [C1b, C2b], [p1b, p2b], pb, [.5, .5],
+ 'kl_loss', max_iter=100, tol=1e-3, random_state=42
+ ))
+ np.testing.assert_allclose(Cb2, Cb2b, atol=1e-06)
+ np.testing.assert_allclose(Cb2b.shape, (n_samples, n_samples))
@pytest.mark.filterwarnings("ignore:divide")
-def test_gromov_entropic_barycenter():
+def test_gromov_entropic_barycenter(nx):
ns = 10
nt = 20
@@ -216,26 +291,41 @@ def test_gromov_entropic_barycenter():
C1 = ot.dist(Xs)
C2 = ot.dist(Xt)
-
+ p1 = ot.unif(ns)
+ p2 = ot.unif(nt)
n_samples = 2
- Cb = ot.gromov.entropic_gromov_barycenters(n_samples, [C1, C2],
- [ot.unif(ns), ot.unif(nt)
- ], ot.unif(n_samples), [.5, .5],
- 'square_loss', 1e-3,
- max_iter=50, tol=1e-3,
- verbose=True)
- np.testing.assert_allclose(Cb.shape, (n_samples, n_samples))
-
- Cb2 = ot.gromov.entropic_gromov_barycenters(n_samples, [C1, C2],
- [ot.unif(ns), ot.unif(nt)
- ], ot.unif(n_samples), [.5, .5],
- 'kl_loss', 1e-3,
- max_iter=100, tol=1e-3)
- np.testing.assert_allclose(Cb2.shape, (n_samples, n_samples))
-
-
-def test_fgw():
+ p = ot.unif(n_samples)
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ p1b = nx.from_numpy(p1)
+ p2b = nx.from_numpy(p2)
+ pb = nx.from_numpy(p)
+
+ Cb = ot.gromov.entropic_gromov_barycenters(
+ n_samples, [C1, C2], [p1, p2], p, [.5, .5],
+ 'square_loss', 1e-3, max_iter=50, tol=1e-3, verbose=True, random_state=42
+ )
+ Cbb = nx.to_numpy(ot.gromov.entropic_gromov_barycenters(
+ n_samples, [C1b, C2b], [p1b, p2b], pb, [.5, .5],
+ 'square_loss', 1e-3, max_iter=50, tol=1e-3, verbose=True, random_state=42
+ ))
+ np.testing.assert_allclose(Cb, Cbb, atol=1e-06)
+ np.testing.assert_allclose(Cbb.shape, (n_samples, n_samples))
+
+ Cb2 = ot.gromov.entropic_gromov_barycenters(
+ n_samples, [C1, C2], [p1, p2], p, [.5, .5],
+ 'kl_loss', 1e-3, max_iter=100, tol=1e-3, random_state=42
+ )
+ Cb2b = nx.to_numpy(ot.gromov.entropic_gromov_barycenters(
+ n_samples, [C1b, C2b], [p1b, p2b], pb, [.5, .5],
+ 'kl_loss', 1e-3, max_iter=100, tol=1e-3, random_state=42
+ ))
+ np.testing.assert_allclose(Cb2, Cb2b, atol=1e-06)
+ np.testing.assert_allclose(Cb2b.shape, (n_samples, n_samples))
+
+
+def test_fgw(nx):
n_samples = 50 # nb samples
mu_s = np.array([0, 0])
@@ -260,33 +350,46 @@ def test_fgw():
M = ot.dist(ys, yt)
M /= M.max()
+ Mb = nx.from_numpy(M)
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ pb = nx.from_numpy(p)
+ qb = nx.from_numpy(q)
+
G, log = ot.gromov.fused_gromov_wasserstein(M, C1, C2, p, q, 'square_loss', alpha=0.5, log=True)
+ Gb, logb = ot.gromov.fused_gromov_wasserstein(Mb, C1b, C2b, pb, qb, 'square_loss', alpha=0.5, log=True)
+ Gb = nx.to_numpy(Gb)
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p, G.sum(1), atol=1e-04) # cf convergence fgw
+ p, Gb.sum(1), atol=1e-04) # cf convergence fgw
np.testing.assert_allclose(
- q, G.sum(0), atol=1e-04) # cf convergence fgw
+ q, Gb.sum(0), atol=1e-04) # cf convergence fgw
Id = (1 / (1.0 * n_samples)) * np.eye(n_samples, n_samples)
np.testing.assert_allclose(
- G, np.flipud(Id), atol=1e-04) # cf convergence gromov
+ Gb, np.flipud(Id), atol=1e-04) # cf convergence gromov
fgw, log = ot.gromov.fused_gromov_wasserstein2(M, C1, C2, p, q, 'square_loss', alpha=0.5, log=True)
+ fgwb, logb = ot.gromov.fused_gromov_wasserstein2(Mb, C1b, C2b, pb, qb, 'square_loss', alpha=0.5, log=True)
G = log['T']
+ Gb = nx.to_numpy(logb['T'])
- np.testing.assert_allclose(fgw, 0, atol=1e-1, rtol=1e-1)
+ np.testing.assert_allclose(fgw, fgwb, atol=1e-08)
+ np.testing.assert_allclose(fgwb, 0, atol=1e-1, rtol=1e-1)
# check constraints
+ np.testing.assert_allclose(G, Gb, atol=1e-06)
np.testing.assert_allclose(
- p, G.sum(1), atol=1e-04) # cf convergence gromov
+ p, Gb.sum(1), atol=1e-04) # cf convergence gromov
np.testing.assert_allclose(
- q, G.sum(0), atol=1e-04) # cf convergence gromov
+ q, Gb.sum(0), atol=1e-04) # cf convergence gromov
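The FGW assertions above also check the new backend path: backend tensors go in, and the transport plan (or the FGW value for fused_gromov_wasserstein2) comes back in the same type. Below is a minimal sketch using the torch backend, assuming torch is installed (plain NumPy arrays work with identical calls); alpha sets the trade-off between the feature cost M and the structure costs C1, C2. The data here is illustrative only.

    import numpy as np
    import torch
    import ot

    n = 20
    rng = np.random.RandomState(42)
    xs = rng.randn(n, 2)
    xt = xs[::-1].copy()                       # same cloud in reversed order
    ys, yt = rng.randn(n, 1), rng.randn(n, 1)  # node features

    p = q = ot.unif(n)
    C1, C2 = ot.dist(xs), ot.dist(xt)
    C1, C2 = C1 / C1.max(), C2 / C2.max()
    M = ot.dist(ys, yt)
    M = M / M.max()

    # torch tensors in, torch tensors out
    Mb, C1b, C2b, pb, qb = (torch.from_numpy(a) for a in (M, C1, C2, p, q))
    Gb = ot.gromov.fused_gromov_wasserstein(Mb, C1b, C2b, pb, qb, 'square_loss', alpha=0.5)
    fgwb = ot.gromov.fused_gromov_wasserstein2(Mb, C1b, C2b, pb, qb, 'square_loss', alpha=0.5)
    print(type(Gb), float(fgwb))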


-def test_fgw_barycenter():
+def test_fgw_barycenter(nx):
np.random.seed(42)
ns = 50
@@ -300,30 +403,44 @@ def test_fgw_barycenter():
C1 = ot.dist(Xs)
C2 = ot.dist(Xt)
-
+ p1, p2 = ot.unif(ns), ot.unif(nt)
n_samples = 3
- X, C = ot.gromov.fgw_barycenters(n_samples, [ys, yt], [C1, C2], [ot.unif(ns), ot.unif(nt)], [.5, .5], 0.5,
- fixed_structure=False, fixed_features=False,
- p=ot.unif(n_samples), loss_fun='square_loss',
- max_iter=100, tol=1e-3)
- np.testing.assert_allclose(C.shape, (n_samples, n_samples))
- np.testing.assert_allclose(X.shape, (n_samples, ys.shape[1]))
+ p = ot.unif(n_samples)
+
+ ysb = nx.from_numpy(ys)
+ ytb = nx.from_numpy(yt)
+ C1b = nx.from_numpy(C1)
+ C2b = nx.from_numpy(C2)
+ p1b = nx.from_numpy(p1)
+ p2b = nx.from_numpy(p2)
+ pb = nx.from_numpy(p)
+
+ Xb, Cb = ot.gromov.fgw_barycenters(
+ n_samples, [ysb, ytb], [C1b, C2b], [p1b, p2b], [.5, .5], 0.5, fixed_structure=False,
+ fixed_features=False, p=pb, loss_fun='square_loss', max_iter=100, tol=1e-3, random_state=12345
+ )
xalea = np.random.randn(n_samples, 2)
init_C = ot.dist(xalea, xalea)
-
- X, C = ot.gromov.fgw_barycenters(n_samples, [ys, yt], [C1, C2], ps=[ot.unif(ns), ot.unif(nt)], lambdas=[.5, .5], alpha=0.5,
- fixed_structure=True, init_C=init_C, fixed_features=False,
- p=ot.unif(n_samples), loss_fun='square_loss',
- max_iter=100, tol=1e-3)
- np.testing.assert_allclose(C.shape, (n_samples, n_samples))
- np.testing.assert_allclose(X.shape, (n_samples, ys.shape[1]))
+ init_Cb = nx.from_numpy(init_C)
+
+ Xb, Cb = ot.gromov.fgw_barycenters(
+ n_samples, [ysb, ytb], [C1b, C2b], ps=[p1b, p2b], lambdas=[.5, .5],
+ alpha=0.5, fixed_structure=True, init_C=init_Cb, fixed_features=False,
+ p=pb, loss_fun='square_loss', max_iter=100, tol=1e-3
+ )
+ Xb, Cb = nx.to_numpy(Xb), nx.to_numpy(Cb)
+ np.testing.assert_allclose(Cb.shape, (n_samples, n_samples))
+ np.testing.assert_allclose(Xb.shape, (n_samples, ys.shape[1]))
init_X = np.random.randn(n_samples, ys.shape[1])
-
- X, C, log = ot.gromov.fgw_barycenters(n_samples, [ys, yt], [C1, C2], [ot.unif(ns), ot.unif(nt)], [.5, .5], 0.5,
- fixed_structure=False, fixed_features=True, init_X=init_X,
- p=ot.unif(n_samples), loss_fun='square_loss',
- max_iter=100, tol=1e-3, log=True)
- np.testing.assert_allclose(C.shape, (n_samples, n_samples))
- np.testing.assert_allclose(X.shape, (n_samples, ys.shape[1]))
+ init_Xb = nx.from_numpy(init_X)
+
+ Xb, Cb, logb = ot.gromov.fgw_barycenters(
+ n_samples, [ysb, ytb], [C1b, C2b], [p1b, p2b], [.5, .5], 0.5,
+ fixed_structure=False, fixed_features=True, init_X=init_Xb,
+ p=pb, loss_fun='square_loss', max_iter=100, tol=1e-3, log=True, random_state=98765
+ )
+ Xb, Cb = nx.to_numpy(Xb), nx.to_numpy(Cb)
+ np.testing.assert_allclose(Cb.shape, (n_samples, n_samples))
+ np.testing.assert_allclose(Xb.shape, (n_samples, ys.shape[1]))
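As the calls above show, fgw_barycenters returns both a feature barycenter X and a structure barycenter C; fixed_structure / fixed_features freeze one of the two (init_C or init_X must then be provided), and the random_state argument introduced by this patch makes the remaining random initialization reproducible. A NumPy-only sketch mirroring the first call of the test, with illustrative data:

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    ys, yt = rng.randn(10, 2), rng.randn(15, 2)                      # input features
    C1, C2 = ot.dist(rng.randn(10, 2)), ot.dist(rng.randn(15, 2))    # input structures

    # both the feature barycenter X (3 x 2) and the structure barycenter C (3 x 3) are estimated
    X, C = ot.gromov.fgw_barycenters(
        3, [ys, yt], [C1, C2], [ot.unif(10), ot.unif(15)], [.5, .5], alpha=0.5,
        fixed_structure=False, fixed_features=False, p=ot.unif(3),
        loss_fun='square_loss', max_iter=100, tol=1e-3, random_state=12345)
    assert X.shape == (3, 2) and C.shape == (3, 3)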