11 files changed, 1020 insertions, 102 deletions
diff --git a/ot/__init__.py b/ot/__init__.py
index b6dc2b4..e436571 100644
--- a/ot/__init__.py
+++ b/ot/__init__.py
@@ -50,7 +50,7 @@ from .gromov import (gromov_wasserstein, gromov_wasserstein2,
 # utils functions
 from .utils import dist, unif, tic, toc, toq
 
-__version__ = "0.8.0"
+__version__ = "0.8.1"
 
 __all__ = ['emd', 'emd2', 'emd_1d', 'sinkhorn', 'sinkhorn2', 'utils',
            'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov',
diff --git a/ot/backend.py b/ot/backend.py
index a044f84..58b652b 100644
--- a/ot/backend.py
+++ b/ot/backend.py
@@ -3,7 +3,7 @@
 Multi-lib backend for POT
 
 The goal is to write backend-agnostic code. Whether you're using Numpy, PyTorch,
-or Jax, POT code should work nonetheless.
+Jax, Cupy, or Tensorflow, POT code should work nonetheless.
 To achieve that, POT provides backend classes which implements functions in their respective backend
 imitating Numpy API. As a convention, we use nx instead of np to refer to the backend.
 
@@ -17,6 +17,68 @@ Examples
 ...     nx = get_backend(a, b)  # infer the backend from the arguments
 ...     c = nx.dot(a, b)  # now use the backend to do any calculation
 ...     return c
+
+.. warning::
+    Tensorflow only works with the Numpy API. To activate it, please run the following:
+
+    .. code-block::
+
+        from tensorflow.python.ops.numpy_ops import np_config
+        np_config.enable_numpy_behavior()
+
+Performance
+-----------
+
+- CPU: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
+- GPU: Tesla V100-SXM2-32GB
+- Date of the benchmark: December 8th, 2021
+- Commit of benchmark: PR #316, https://github.com/PythonOT/POT/pull/316
+
+.. raw:: html
+
+    <style>
+    #perftable {
+        width: 100%;
+        margin-bottom: 1em;
+    }
+
+    #perftable table{
+        border-collapse: collapse;
+        table-layout: fixed;
+        width: 100%;
+    }
+
+    #perftable th, #perftable td {
+        border: 1px solid #ddd;
+        padding: 8px;
+        font-size: smaller;
+    }
+    </style>
+
+    <div id="perftable">
+    <table>
+    <tr><th align="center" colspan="8">Sinkhorn Knopp - Averaged on 100 runs</th></tr>
+    <tr><th align="center">Bitsize</th><th align="center" colspan="7">32 bits</th></tr>
+    <tr><th align="center">Device</th><th align="center" colspan="3">CPU</th><th align="center" colspan="4">GPU</th></tr>
+    <tr><th align="center">Sample size</th><th align="center">Numpy</th><th align="center">Pytorch</th><th align="center">Tensorflow</th><th align="center">Cupy</th><th align="center">Jax</th><th align="center">Pytorch</th><th align="center">Tensorflow</th></tr>
+    <tr><td align="center">50</td><td align="center">0.0008</td><td align="center">0.0022</td><td align="center">0.0151</td><td align="center">0.0095</td><td align="center">0.0193</td><td align="center">0.0051</td><td align="center">0.0293</td></tr>
+    <tr><td align="center">100</td><td align="center">0.0005</td><td align="center">0.0013</td><td align="center">0.0097</td><td align="center">0.0057</td><td align="center">0.0115</td><td align="center">0.0029</td><td align="center">0.0173</td></tr>
+    <tr><td align="center">500</td><td align="center">0.0009</td><td align="center">0.0016</td><td align="center">0.0110</td><td align="center">0.0058</td><td align="center">0.0115</td><td align="center">0.0029</td><td align="center">0.0166</td></tr>
+    <tr><td align="center">1000</td><td align="center">0.0021</td><td align="center">0.0021</td><td align="center">0.0145</td><td align="center">0.0056</td><td align="center">0.0118</td><td align="center">0.0029</td><td align="center">0.0168</td></tr>
+    <tr><td align="center">2000</td><td align="center">0.0069</td><td align="center">0.0043</td><td align="center">0.0278</td><td align="center">0.0059</td><td align="center">0.0118</td><td align="center">0.0030</td><td align="center">0.0165</td></tr>
+    <tr><td align="center">5000</td><td align="center">0.0707</td><td align="center">0.0314</td><td align="center">0.1395</td><td align="center">0.0074</td><td align="center">0.0125</td><td align="center">0.0035</td><td align="center">0.0198</td></tr>
+    <tr><td colspan="8">&nbsp;</td></tr>
+    <tr><th align="center">Bitsize</th><th align="center" colspan="7">64 bits</th></tr>
+    <tr><th align="center">Device</th><th align="center" colspan="3">CPU</th><th align="center" colspan="4">GPU</th></tr>
+    <tr><th align="center">Sample size</th><th align="center">Numpy</th><th align="center">Pytorch</th><th align="center">Tensorflow</th><th align="center">Cupy</th><th align="center">Jax</th><th align="center">Pytorch</th><th align="center">Tensorflow</th></tr>
+    <tr><td align="center">50</td><td align="center">0.0008</td><td align="center">0.0020</td><td align="center">0.0154</td><td align="center">0.0093</td><td align="center">0.0191</td><td align="center">0.0051</td><td align="center">0.0328</td></tr>
+    <tr><td align="center">100</td><td align="center">0.0005</td><td align="center">0.0013</td><td align="center">0.0094</td><td align="center">0.0056</td><td align="center">0.0114</td><td align="center">0.0029</td><td align="center">0.0169</td></tr>
+    <tr><td align="center">500</td><td align="center">0.0013</td><td align="center">0.0017</td><td align="center">0.0120</td><td align="center">0.0059</td><td align="center">0.0116</td><td align="center">0.0029</td><td align="center">0.0168</td></tr>
+    <tr><td align="center">1000</td><td align="center">0.0034</td><td align="center">0.0027</td><td align="center">0.0177</td><td align="center">0.0058</td><td align="center">0.0118</td><td align="center">0.0029</td><td align="center">0.0167</td></tr>
+    <tr><td align="center">2000</td><td align="center">0.0146</td><td align="center">0.0075</td><td align="center">0.0436</td><td align="center">0.0059</td><td align="center">0.0120</td><td align="center">0.0029</td><td align="center">0.0165</td></tr>
+    <tr><td align="center">5000</td><td align="center">0.1467</td><td align="center">0.0568</td><td align="center">0.2468</td><td align="center">0.0077</td><td align="center">0.0146</td><td align="center">0.0045</td><td align="center">0.0204</td></tr>
+    </table>
+    </div>
 """
 
 # Author: Remi Flamary <remi.flamary@polytechnique.edu>
@@ -27,6 +89,8 @@ Examples
 import numpy as np
 import scipy.special as scipy
 from scipy.sparse import issparse, coo_matrix, csr_matrix
+import warnings
+import time
 
 try:
     import torch
@@ -39,11 +103,29 @@ try:
     import jax
     import jax.numpy as jnp
     import jax.scipy.special as jscipy
+    from jax.lib import xla_bridge
     jax_type = jax.numpy.ndarray
 except ImportError:
     jax = False
     jax_type = float
 
+try:
+    import cupy as cp
+    import cupyx
+    cp_type = cp.ndarray
+except ImportError:
+    cp = False
+    cp_type = float
+
+try:
+    import tensorflow as tf
+    import tensorflow.experimental.numpy as tnp
+    tf_type = tf.Tensor
+except ImportError:
+    tf = False
+    tf_type = float
+
+
 str_type_error = "All array should be from the same type/backend. Current types are : {}"
 
 
@@ -57,6 +139,12 @@ def get_backend_list():
     if jax:
         lst.append(JaxBackend())
 
+    if cp:  # pragma: no cover
+        lst.append(CupyBackend())
+
+    if tf:
+        lst.append(TensorflowBackend())
+
     return lst
 
 
@@ -78,6 +166,10 @@ def get_backend(*args):
         return TorchBackend()
     elif isinstance(args[0], jax_type):
         return JaxBackend()
+    elif isinstance(args[0], cp_type):  # pragma: no cover
+        return CupyBackend()
+    elif isinstance(args[0], tf_type):
+        return TensorflowBackend()
     else:
         raise ValueError("Unknown type of non implemented backend.")
 
@@ -94,7 +186,8 @@ def to_numpy(*args):
 class Backend():
     """
     Backend abstract class.
-    Implementations: :py:class:`JaxBackend`, :py:class:`NumpyBackend`, :py:class:`TorchBackend`
+    Implementations: :py:class:`JaxBackend`, :py:class:`NumpyBackend`, :py:class:`TorchBackend`,
+    :py:class:`CupyBackend`, :py:class:`TensorflowBackend`
 
     - The `__name__` class attribute refers to the name of the backend.
     - The `__type__` class attribute refers to the data structure used by the backend.
@@ -665,6 +758,34 @@ class Backend():
         """
         raise NotImplementedError()
 
+    def squeeze(self, a, axis=None):
+        r"""
+        Remove axes of length one from a.
+
+        This function follows the api from :any:`numpy.squeeze`.
+
+        See: https://numpy.org/doc/stable/reference/generated/numpy.squeeze.html
+        """
+        raise NotImplementedError()
+
+    def bitsize(self, type_as):
+        r"""
+        Gives the number of bits used by the data type of the given tensor.
+        """
+        raise NotImplementedError()
+
+    def device_type(self, type_as):
+        r"""
+        Returns CPU or GPU depending on the device where the given tensor is located.
+        """
+        raise NotImplementedError()
+
+    def _bench(self, callable, *args, n_runs=1, warmup_runs=1):
+        r"""
+        Executes a benchmark of the given callable with the given arguments.
+        """
+        raise NotImplementedError()
+
 
 class NumpyBackend(Backend):
     """
@@ -902,6 +1023,29 @@ class NumpyBackend(Backend):
         # numpy has implicit type conversion so we automatically validate the test
         pass
 
+    def squeeze(self, a, axis=None):
+        return np.squeeze(a, axis=axis)
+
+    def bitsize(self, type_as):
+        return type_as.itemsize * 8
+
+    def device_type(self, type_as):
+        return "CPU"
+
+    def _bench(self, callable, *args, n_runs=1, warmup_runs=1):
+        results = dict()
+        for type_as in self.__type_list__:
+            inputs = [self.from_numpy(arg, type_as=type_as) for arg in args]
+            for _ in range(warmup_runs):
+                callable(*inputs)
+            t0 = time.perf_counter()
+            for _ in range(n_runs):
+                callable(*inputs)
+            t1 = time.perf_counter()
+            key = ("Numpy", self.device_type(type_as), self.bitsize(type_as))
+            results[key] = (t1 - t0) / n_runs
+        return results
+
 
 class JaxBackend(Backend):
     """
@@ -920,9 +1064,16 @@ class JaxBackend(Backend):
     def __init__(self):
         self.rng_ = jax.random.PRNGKey(42)
 
-        for d in jax.devices():
-            self.__type_list__ = [jax.device_put(jnp.array(1, dtype=jnp.float32), d),
-                                  jax.device_put(jnp.array(1, dtype=jnp.float64), d)]
+        self.__type_list__ = []
+        # available_devices = jax.devices("cpu")
+        available_devices = []
+        if xla_bridge.get_backend().platform == "gpu":
+            available_devices += jax.devices("gpu")
+        for d in available_devices:
+            self.__type_list__ += [
+                jax.device_put(jnp.array(1, dtype=jnp.float32), d),
+                jax.device_put(jnp.array(1, dtype=jnp.float64), d)
+            ]
 
     def to_numpy(self, a):
         return np.array(a)
@@ -1162,6 +1313,32 @@ class JaxBackend(Backend):
         assert a_dtype == b_dtype, "Dtype discrepancy"
         assert a_device == b_device, f"Device discrepancy. First input is on {str(a_device)}, whereas second input is on {str(b_device)}"
 
+    def squeeze(self, a, axis=None):
+        return jnp.squeeze(a, axis=axis)
+
+    def bitsize(self, type_as):
+        return type_as.dtype.itemsize * 8
+
+    def device_type(self, type_as):
+        return self.dtype_device(type_as)[1].platform.upper()
+
+    def _bench(self, callable, *args, n_runs=1, warmup_runs=1):
+        results = dict()
+
+        for type_as in self.__type_list__:
+            inputs = [self.from_numpy(arg, type_as=type_as) for arg in args]
+            for _ in range(warmup_runs):
+                a = callable(*inputs)
+            a.block_until_ready()
+            t0 = time.perf_counter()
+            for _ in range(n_runs):
+                a = callable(*inputs)
+            a.block_until_ready()
+            t1 = time.perf_counter()
+            key = ("Jax", self.device_type(type_as), self.bitsize(type_as))
+            results[key] = (t1 - t0) / n_runs
+        return results
+
 
 class TorchBackend(Backend):
     """
@@ -1203,7 +1380,7 @@ class TorchBackend(Backend):
             @staticmethod
             def backward(ctx, grad_output):
                 # the gradients are grad
-                return (None, None) + ctx.grads
+                return (None, None) + tuple(g * grad_output for g in ctx.grads)
 
         self.ValFunction = ValFunction
 
@@ -1500,3 +1677,690 @@ class TorchBackend(Backend):
 
         assert a_dtype == b_dtype, "Dtype discrepancy"
         assert a_device == b_device, f"Device discrepancy. First input is on {str(a_device)}, whereas second input is on {str(b_device)}"
+
+    def squeeze(self, a, axis=None):
+        if axis is None:
+            return torch.squeeze(a)
+        else:
+            return torch.squeeze(a, dim=axis)
+
+    def bitsize(self, type_as):
+        return torch.finfo(type_as.dtype).bits
+
+    def device_type(self, type_as):
+        return type_as.device.type.replace("cuda", "gpu").upper()
+
+    def _bench(self, callable, *args, n_runs=1, warmup_runs=1):
+        results = dict()
+        for type_as in self.__type_list__:
+            inputs = [self.from_numpy(arg, type_as=type_as) for arg in args]
+            for _ in range(warmup_runs):
+                callable(*inputs)
+            if self.device_type(type_as) == "GPU":  # pragma: no cover
+                torch.cuda.synchronize()
+                start = torch.cuda.Event(enable_timing=True)
+                end = torch.cuda.Event(enable_timing=True)
+                start.record()
+            else:
+                start = time.perf_counter()
+            for _ in range(n_runs):
+                callable(*inputs)
+            if self.device_type(type_as) == "GPU":  # pragma: no cover
+                end.record()
+                torch.cuda.synchronize()
+                duration = start.elapsed_time(end) / 1000.
+            else:
+                end = time.perf_counter()
+                duration = end - start
+            key = ("Pytorch", self.device_type(type_as), self.bitsize(type_as))
+            results[key] = duration / n_runs
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return results
+
+
+class CupyBackend(Backend):  # pragma: no cover
+    """
+    CuPy implementation of the backend
+
+    - `__name__` is "cupy"
+    - `__type__` is cp.ndarray
+    """
+
+    __name__ = 'cupy'
+    __type__ = cp_type
+    __type_list__ = None
+
+    rng_ = None
+
+    def __init__(self):
+        self.rng_ = cp.random.RandomState()
+
+        self.__type_list__ = [
+            cp.array(1, dtype=cp.float32),
+            cp.array(1, dtype=cp.float64)
+        ]
+
+    def to_numpy(self, a):
+        return cp.asnumpy(a)
+
+    def from_numpy(self, a, type_as=None):
+        if type_as is None:
+            return cp.asarray(a)
+        else:
+            with cp.cuda.Device(type_as.device):
+                return cp.asarray(a, dtype=type_as.dtype)
+
+    def set_gradients(self, val, inputs, grads):
+        # No gradients for cupy
+        return val
+
+    def zeros(self, shape, type_as=None):
+        if isinstance(shape, (list, tuple)):
+            shape = tuple(int(i) for i in shape)
+        if type_as is None:
+            return cp.zeros(shape)
+        else:
+            with cp.cuda.Device(type_as.device):
+                return cp.zeros(shape, dtype=type_as.dtype)
+
+    def ones(self, shape, type_as=None):
+        if isinstance(shape, (list, tuple)):
+            shape = tuple(int(i) for i in shape)
+        if type_as is None:
+            return cp.ones(shape)
+        else:
+            with cp.cuda.Device(type_as.device):
+                return cp.ones(shape, dtype=type_as.dtype)
+
+    def arange(self, stop, start=0, step=1, type_as=None):
+        return cp.arange(start, stop, step)
+
+    def full(self, shape, fill_value, type_as=None):
+        if isinstance(shape, (list, tuple)):
+            shape = tuple(int(i) for i in shape)
+        if type_as is None:
+            return cp.full(shape, fill_value)
+        else:
+            with cp.cuda.Device(type_as.device):
+                return cp.full(shape, fill_value, dtype=type_as.dtype)
+
+    def eye(self, N, M=None, type_as=None):
+        if type_as is None:
+            return cp.eye(N, M)
+        else:
+            with cp.cuda.Device(type_as.device):
+                return cp.eye(N, M, dtype=type_as.dtype)
+
+    def sum(self, a, axis=None, keepdims=False):
+        return cp.sum(a, axis, keepdims=keepdims)
+
+    def cumsum(self, a, axis=None):
+        return cp.cumsum(a, axis)
+
+    def max(self, a, axis=None, keepdims=False):
+        return cp.max(a, axis, keepdims=keepdims)
+
+    def min(self, a, axis=None, keepdims=False):
+        return cp.min(a, axis, keepdims=keepdims)
+
+    def maximum(self, a, b):
+        return cp.maximum(a, b)
+
+    def minimum(self, a, b):
+        return cp.minimum(a, b)
+
+    def abs(self, a):
+        return cp.abs(a)
+
+    def exp(self, a):
+        return cp.exp(a)
+
+    def log(self, a):
+        return cp.log(a)
+
+    def sqrt(self, a):
+        return cp.sqrt(a)
+
+    def power(self, a, exponents):
+        return cp.power(a, exponents)
+
+    def dot(self, a, b):
+        return cp.dot(a, b)
+
+    def norm(self, a):
+        return cp.sqrt(cp.sum(cp.square(a)))
+
+    def any(self, a):
+        return cp.any(a)
+
+    def isnan(self, a):
+        return cp.isnan(a)
+
+    def isinf(self, a):
+        return cp.isinf(a)
+
+    def einsum(self, subscripts, *operands):
+        return cp.einsum(subscripts, *operands)
+
+    def sort(self, a, axis=-1):
+        return cp.sort(a, axis)
+
+    def argsort(self, a, axis=-1):
+        return cp.argsort(a, axis)
+
+    def searchsorted(self, a, v, side='left'):
+        if a.ndim == 1:
+            return cp.searchsorted(a, v, side)
+        else:
+            # this is not a very efficient way to make cupy
+            # searchsorted work on 2d arrays
+            ret = cp.empty(v.shape, dtype=int)
+            for i in range(a.shape[0]):
+                ret[i, :] = cp.searchsorted(a[i, :], v[i, :], side)
+            return ret
+
+    def flip(self, a, axis=None):
+        return cp.flip(a, axis)
+
+    def outer(self, a, b):
+        return cp.outer(a, b)
+
+    def clip(self, a, a_min, a_max):
+        return cp.clip(a, a_min, a_max)
+
+    def repeat(self, a, repeats, axis=None):
+        return cp.repeat(a, repeats, axis)
+
+    def take_along_axis(self, arr, indices, axis):
+        return cp.take_along_axis(arr, indices, axis)
+
+    def concatenate(self, arrays, axis=0):
+        return cp.concatenate(arrays, axis)
+
+    def zero_pad(self, a, pad_width):
+        return cp.pad(a, pad_width)
+
+    def argmax(self, a, axis=None):
+        return cp.argmax(a, axis=axis)
+
+    def mean(self, a, axis=None):
+        return cp.mean(a, axis=axis)
+
+    def std(self, a, axis=None):
+        return cp.std(a, axis=axis)
+
+    def linspace(self, start, stop, num):
+        return cp.linspace(start, stop, num)
+
+    def meshgrid(self, a, b):
+        return cp.meshgrid(a, b)
+
+    def diag(self, a, k=0):
+        return cp.diag(a, k)
+
+    def unique(self, a):
+        return cp.unique(a)
+
+    def logsumexp(self, a, axis=None):
+        # Taken from
+        # https://github.com/scipy/scipy/blob/v1.7.1/scipy/special/_logsumexp.py#L7-L127
+        a_max = cp.amax(a, axis=axis, keepdims=True)
+
+        if a_max.ndim > 0:
+            a_max[~cp.isfinite(a_max)] = 0
+        elif not cp.isfinite(a_max):
+            a_max = 0
+
+        tmp = cp.exp(a - a_max)
+        s = cp.sum(tmp, axis=axis)
+        out = cp.log(s)
+        a_max = cp.squeeze(a_max, axis=axis)
+        out += a_max
+        return out
+
+    def stack(self, arrays, axis=0):
+        return cp.stack(arrays, axis)
+
+    def reshape(self, a, shape):
+        return cp.reshape(a, shape)
+
+    def seed(self, seed=None):
+        if seed is not None:
+            self.rng_.seed(seed)
+
+    def rand(self, *size, type_as=None):
+        if type_as is None:
+            return self.rng_.rand(*size)
+        else:
+            with cp.cuda.Device(type_as.device):
+                return self.rng_.rand(*size, dtype=type_as.dtype)
+
+    def randn(self, *size, type_as=None):
+        if type_as is None:
+            return self.rng_.randn(*size)
+        else:
+            with cp.cuda.Device(type_as.device):
+                return self.rng_.randn(*size, dtype=type_as.dtype)
+
+    def coo_matrix(self, data, rows, cols, shape=None, type_as=None):
+        data = self.from_numpy(data)
+        rows = self.from_numpy(rows)
+        cols = self.from_numpy(cols)
+        if type_as is None:
+            return cupyx.scipy.sparse.coo_matrix(
+                (data, (rows, cols)), shape=shape
+            )
+        else:
+            with cp.cuda.Device(type_as.device):
+                return cupyx.scipy.sparse.coo_matrix(
+                    (data, (rows, cols)), shape=shape, dtype=type_as.dtype
+                )
+
+    def issparse(self, a):
+        return cupyx.scipy.sparse.issparse(a)
+
+    def tocsr(self, a):
+        if self.issparse(a):
+            return a.tocsr()
+        else:
+            return cupyx.scipy.sparse.csr_matrix(a)
+
+    def eliminate_zeros(self, a, threshold=0.):
+        if threshold > 0:
+            if self.issparse(a):
+                a.data[self.abs(a.data) <= threshold] = 0
+            else:
+                a[self.abs(a) <= threshold] = 0
+        if self.issparse(a):
+            a.eliminate_zeros()
+        return a
+
+    def todense(self, a):
+        if self.issparse(a):
+            return a.toarray()
+        else:
+            return a
+
+    def where(self, condition, x, y):
+        return cp.where(condition, x, y)
+
+    def copy(self, a):
+        return a.copy()
+
+    def allclose(self, a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
+        return cp.allclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)
+
+    def dtype_device(self, a):
+        return a.dtype, a.device
+
+    def assert_same_dtype_device(self, a, b):
+        a_dtype, a_device = self.dtype_device(a)
+        b_dtype, b_device = self.dtype_device(b)
+
+        # cupy has implicit type conversion so
+        # we automatically validate the test for type
+        assert a_device == b_device, f"Device discrepancy. First input is on {str(a_device)}, whereas second input is on {str(b_device)}"
+
+    def squeeze(self, a, axis=None):
+        return cp.squeeze(a, axis=axis)
+
+    def bitsize(self, type_as):
+        return type_as.itemsize * 8
+
+    def device_type(self, type_as):
+        return "GPU"
+
+    def _bench(self, callable, *args, n_runs=1, warmup_runs=1):
+        mempool = cp.get_default_memory_pool()
+        pinned_mempool = cp.get_default_pinned_memory_pool()
+
+        results = dict()
+        for type_as in self.__type_list__:
+            inputs = [self.from_numpy(arg, type_as=type_as) for arg in args]
+            start_gpu = cp.cuda.Event()
+            end_gpu = cp.cuda.Event()
+            for _ in range(warmup_runs):
+                callable(*inputs)
+            start_gpu.synchronize()
+            start_gpu.record()
+            for _ in range(n_runs):
+                callable(*inputs)
+            end_gpu.record()
+            end_gpu.synchronize()
+            key = ("Cupy", self.device_type(type_as), self.bitsize(type_as))
+            t_gpu = cp.cuda.get_elapsed_time(start_gpu, end_gpu) / 1000.
+            results[key] = t_gpu / n_runs
+        mempool.free_all_blocks()
+        pinned_mempool.free_all_blocks()
+        return results
+
+
+class TensorflowBackend(Backend):
+
+    __name__ = "tf"
+    __type__ = tf_type
+    __type_list__ = None
+
+    rng_ = None
+
+    def __init__(self):
+        self.seed(None)
+
+        self.__type_list__ = [
+            tf.convert_to_tensor([1], dtype=tf.float32),
+            tf.convert_to_tensor([1], dtype=tf.float64)
+        ]
+
+        tmp = self.randn(15, 10)
+        try:
+            tmp.reshape((150, 1))
+        except AttributeError:
+            warnings.warn(
+                "To use TensorflowBackend, you need to activate the tensorflow "
+                "numpy API. You can activate it by running: \n"
+                "from tensorflow.python.ops.numpy_ops import np_config\n"
+                "np_config.enable_numpy_behavior()"
+            )
+
+    def to_numpy(self, a):
+        return a.numpy()
+
+    def from_numpy(self, a, type_as=None):
+        if not isinstance(a, self.__type__):
+            if type_as is None:
+                return tf.convert_to_tensor(a)
+            else:
+                return tf.convert_to_tensor(a, dtype=type_as.dtype)
+        else:
+            if type_as is None:
+                return a
+            else:
+                return tf.cast(a, dtype=type_as.dtype)
+
+    def set_gradients(self, val, inputs, grads):
+        @tf.custom_gradient
+        def tmp(input):
+            def grad(upstream):
+                return grads
+            return val, grad
+        return tmp(inputs)
+
+    def zeros(self, shape, type_as=None):
+        if type_as is None:
+            return tnp.zeros(shape)
+        else:
+            return tnp.zeros(shape, dtype=type_as.dtype)
+
+    def ones(self, shape, type_as=None):
+        if type_as is None:
+            return tnp.ones(shape)
+        else:
+            return tnp.ones(shape, dtype=type_as.dtype)
+
+    def arange(self, stop, start=0, step=1, type_as=None):
+        return tnp.arange(start, stop, step)
+
+    def full(self, shape, fill_value, type_as=None):
+        if type_as is None:
+            return tnp.full(shape, fill_value)
+        else:
+            return tnp.full(shape, fill_value, dtype=type_as.dtype)
+
+    def eye(self, N, M=None, type_as=None):
+        if type_as is None:
+            return tnp.eye(N, M)
+        else:
+            return tnp.eye(N, M, dtype=type_as.dtype)
+
+    def sum(self, a, axis=None, keepdims=False):
+        return tnp.sum(a, axis, keepdims=keepdims)
+
+    def cumsum(self, a, axis=None):
+        return tnp.cumsum(a, axis)
+
+    def max(self, a, axis=None, keepdims=False):
+        return tnp.max(a, axis, keepdims=keepdims)
+
+    def min(self, a, axis=None, keepdims=False):
+        return tnp.min(a, axis, keepdims=keepdims)
+
+    def maximum(self, a, b):
+        return tnp.maximum(a, b)
+
+    def minimum(self, a, b):
+        return tnp.minimum(a, b)
+
+    def dot(self, a, b):
+        if len(b.shape) == 1:
+            if len(a.shape) == 1:
+                # inner product
+                return tf.reduce_sum(tf.multiply(a, b))
+            else:
+                # matrix vector
+                return tf.linalg.matvec(a, b)
+        else:
+            if len(a.shape) == 1:
+                return tf.linalg.matvec(b.T, a.T).T
+            else:
+                return tf.matmul(a, b)
+
+    def abs(self, a):
+        return tnp.abs(a)
+
+    def exp(self, a):
+        return tnp.exp(a)
+
+    def log(self, a):
+        return tnp.log(a)
+
+    def sqrt(self, a):
+        return tnp.sqrt(a)
+
+    def power(self, a, exponents):
+        return tnp.power(a, exponents)
+
+    def norm(self, a):
+        return tf.math.reduce_euclidean_norm(a)
+
+    def any(self, a):
+        return tnp.any(a)
+
+    def isnan(self, a):
+        return tnp.isnan(a)
+
+    def isinf(self, a):
+        return tnp.isinf(a)
+
+    def einsum(self, subscripts, *operands):
+        return tnp.einsum(subscripts, *operands)
+
+    def sort(self, a, axis=-1):
+        return tnp.sort(a, axis)
+
+    def argsort(self, a, axis=-1):
+        return tnp.argsort(a, axis)
+
+    def searchsorted(self, a, v, side='left'):
+        return tf.searchsorted(a, v, side=side)
+
+    def flip(self, a, axis=None):
+        return tnp.flip(a, axis)
+
+    def outer(self, a, b):
+        return tnp.outer(a, b)
+
+    def clip(self, a, a_min, a_max):
+        return tnp.clip(a, a_min, a_max)
+
+    def repeat(self, a, repeats, axis=None):
+        return tnp.repeat(a, repeats, axis)
+
+    def take_along_axis(self, arr, indices, axis):
+        return tnp.take_along_axis(arr, indices, axis)
+
+    def concatenate(self, arrays, axis=0):
+        return tnp.concatenate(arrays, axis)
+
+    def zero_pad(self, a, pad_width):
+        return tnp.pad(a, pad_width, mode="constant")
+
+    def argmax(self, a, axis=None):
+        return tnp.argmax(a, axis=axis)
+
+    def mean(self, a, axis=None):
+        return tnp.mean(a, axis=axis)
+
+    def std(self, a, axis=None):
+        return tnp.std(a, axis=axis)
+
+    def linspace(self, start, stop, num):
+        return tnp.linspace(start, stop, num)
+
+    def meshgrid(self, a, b):
+        return tnp.meshgrid(a, b)
+
+    def diag(self, a, k=0):
+        return tnp.diag(a, k)
+
+    def unique(self, a):
+        return tf.sort(tf.unique(tf.reshape(a, [-1]))[0])
+
+    def logsumexp(self, a, axis=None):
+        return tf.math.reduce_logsumexp(a, axis=axis)
+
+    def stack(self, arrays, axis=0):
+        return tnp.stack(arrays, axis)
+
+    def reshape(self, a, shape):
+        return tnp.reshape(a, shape)
+
+    def seed(self, seed=None):
+        if isinstance(seed, int):
+            self.rng_ = tf.random.Generator.from_seed(seed)
+        elif isinstance(seed, tf.random.Generator):
+            self.rng_ = seed
+        elif seed is None:
+            self.rng_ = tf.random.Generator.from_non_deterministic_state()
+        else:
+            raise ValueError("Non compatible seed : {}".format(seed))
+
+    def rand(self, *size, type_as=None):
+        if type_as is None:
+            return self.rng_.uniform(size, minval=0., maxval=1.)
+        else:
+            return self.rng_.uniform(
+                size, minval=0., maxval=1., dtype=type_as.dtype
+            )
+
+    def randn(self, *size, type_as=None):
+        if type_as is None:
+            return self.rng_.normal(size)
+        else:
+            return self.rng_.normal(size, dtype=type_as.dtype)
+
+    def _convert_to_index_for_coo(self, tensor):
+        if isinstance(tensor, self.__type__):
+            return int(self.max(tensor)) + 1
+        else:
+            return int(np.max(tensor)) + 1
+
+    def coo_matrix(self, data, rows, cols, shape=None, type_as=None):
+        if shape is None:
+            shape = (
+                self._convert_to_index_for_coo(rows),
+                self._convert_to_index_for_coo(cols)
+            )
+        if type_as is not None:
+            data = self.from_numpy(data, type_as=type_as)
+
+        sparse_tensor = tf.sparse.SparseTensor(
+            indices=tnp.stack([rows, cols]).T,
+            values=data,
+            dense_shape=shape
+        )
+        # if type_as is not None:
+        #     sparse_tensor = self.from_numpy(sparse_tensor, type_as=type_as)
+        # SparseTensor are not subscriptable so we use dense tensors
+        return self.todense(sparse_tensor)
+
+    def issparse(self, a):
+        return isinstance(a, tf.sparse.SparseTensor)
+
+    def tocsr(self, a):
+        return a
+
+    def eliminate_zeros(self, a, threshold=0.):
+        if self.issparse(a):
+            values = a.values
+            if threshold > 0:
+                mask = self.abs(values) <= threshold
+            else:
+                mask = values == 0
+            return tf.sparse.retain(a, ~mask)
+        else:
+            if threshold > 0:
+                a = tnp.where(self.abs(a) > threshold, a, 0.)
+            return a
+
+    def todense(self, a):
+        if self.issparse(a):
+            return tf.sparse.to_dense(tf.sparse.reorder(a))
+        else:
+            return a
+
+    def where(self, condition, x, y):
+        return tnp.where(condition, x, y)
+
+    def copy(self, a):
+        return tf.identity(a)
+
+    def allclose(self, a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
+        return tnp.allclose(
+            a, b, rtol=rtol, atol=atol, equal_nan=equal_nan
+        )
+
+    def dtype_device(self, a):
+        return a.dtype, a.device.split("device:")[1]
+
+    def assert_same_dtype_device(self, a, b):
+        a_dtype, a_device = self.dtype_device(a)
+        b_dtype, b_device = self.dtype_device(b)
+
+        assert a_dtype == b_dtype, "Dtype discrepancy"
+        assert a_device == b_device, f"Device discrepancy. First input is on {str(a_device)}, whereas second input is on {str(b_device)}"
+
+    def squeeze(self, a, axis=None):
+        return tnp.squeeze(a, axis=axis)
+
+    def bitsize(self, type_as):
+        return type_as.dtype.size * 8
+
+    def device_type(self, type_as):
+        return self.dtype_device(type_as)[1].split(":")[0]
+
+    def _bench(self, callable, *args, n_runs=1, warmup_runs=1):
+        results = dict()
+        device_contexts = [tf.device("/CPU:0")]
+        if len(tf.config.list_physical_devices('GPU')) > 0:  # pragma: no cover
+            device_contexts.append(tf.device("/GPU:0"))
+
+        for device_context in device_contexts:
+            with device_context:
+                for type_as in self.__type_list__:
+                    inputs = [self.from_numpy(arg, type_as=type_as) for arg in args]
+                    for _ in range(warmup_runs):
+                        callable(*inputs)
+                    t0 = time.perf_counter()
+                    for _ in range(n_runs):
+                        res = callable(*inputs)
+                    _ = res.numpy()
+                    t1 = time.perf_counter()
+                    key = (
+                        "Tensorflow",
+                        self.device_type(inputs[0]),
+                        self.bitsize(type_as)
+                    )
+                    results[key] = (t1 - t0) / n_runs
+
+        return results
diff --git a/ot/bregman.py b/ot/bregman.py
index cce52e2..fc20175 100644
--- a/ot/bregman.py
+++ b/ot/bregman.py
@@ -830,9 +830,9 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False,
     a, b, M = list_to_array(a, b, M)
 
     nx = get_backend(M, a, b)
-    if nx.__name__ == "jax":
-        raise TypeError("JAX arrays have been received. Greenkhorn is not "
-                        "compatible with JAX")
+    if nx.__name__ in ("jax", "tf"):
+        raise TypeError("JAX or TF arrays have been received. Greenkhorn is not "
+                        "compatible with either JAX or TF")
 
     if len(a) == 0:
         a = nx.ones((M.shape[0],), type_as=M) / M.shape[0]
@@ -865,20 +865,20 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False,
 
         if m_viol_1 > m_viol_2:
             old_u = u[i_1]
-            new_u = a[i_1] / (K[i_1, :].dot(v))
+            new_u = a[i_1] / nx.dot(K[i_1, :], v)
             G[i_1, :] = new_u * K[i_1, :] * v
 
-            viol[i_1] = new_u * K[i_1, :].dot(v) - a[i_1]
+            viol[i_1] = nx.dot(new_u * K[i_1, :], v) - a[i_1]
             viol_2 += (K[i_1, :].T * (new_u - old_u) * v)
             u[i_1] = new_u
         else:
             old_v = v[i_2]
-            new_v = b[i_2] / (K[:, i_2].T.dot(u))
+            new_v = b[i_2] / nx.dot(K[:, i_2].T, u)
             G[:, i_2] = u * K[:, i_2] * new_v
             # aviol = (G@one_m - a)
             # aviol_2 = (G.T@one_n - b)
             viol += (-old_v + new_v) * K[:, i_2] * u
-            viol_2[i_2] = new_v * K[:, i_2].dot(u) - b[i_2]
+            viol_2[i_2] = new_v * nx.dot(K[:, i_2], u) - b[i_2]
             v[i_2] = new_v
 
         if stopThr_val <= stopThr:
@@ -1550,9 +1550,11 @@ def _barycenter_sinkhorn_log(A, M, reg, weights=None, numItermax=1000,
 
     nx = get_backend(A, M)
 
-    if nx.__name__ == "jax":
-        raise NotImplementedError("Log-domain functions are not yet implemented"
-                                  " for Jax. Use numpy or torch arrays instead.")
+    if nx.__name__ in ("jax", "tf"):
+        raise NotImplementedError(
+            "Log-domain functions are not yet implemented"
+            " for Jax and tf. Use numpy or torch arrays instead."
+        )
 
     if weights is None:
         weights = nx.ones(n_hists, type_as=A) / n_hists
@@ -1886,9 +1888,11 @@ def _barycenter_debiased_log(A, M, reg, weights=None, numItermax=1000,
     dim, n_hists = A.shape
 
     nx = get_backend(A, M)
-    if nx.__name__ == "jax":
-        raise NotImplementedError("Log-domain functions are not yet implemented"
-                                  " for Jax. Use numpy or torch arrays instead.")
+    if nx.__name__ in ("jax", "tf"):
+        raise NotImplementedError(
+            "Log-domain functions are not yet implemented"
+            " for Jax and TF. Use numpy or torch arrays instead."
+        )
 
     if weights is None:
         weights = nx.ones(n_hists, type_as=A) / n_hists
@@ -2043,7 +2047,7 @@ def _convolutional_barycenter2d(A, reg, weights=None, numItermax=10000,
         log = {'err': []}
 
     bar = nx.ones(A.shape[1:], type_as=A)
-    bar /= bar.sum()
+    bar /= nx.sum(bar)
     U = nx.ones(A.shape, type_as=A)
     V = nx.ones(A.shape, type_as=A)
     err = 1
@@ -2069,9 +2073,11 @@ def _convolutional_barycenter2d(A, reg, weights=None, numItermax=10000,
         KV = convol_imgs(V)
         U = A / KV
         KU = convol_imgs(U)
-        bar = nx.exp((weights[:, None, None] * nx.log(KU + stabThr)).sum(axis=0))
+        bar = nx.exp(
+            nx.sum(weights[:, None, None] * nx.log(KU + stabThr), axis=0)
+        )
         if ii % 10 == 9:
-            err = (V * KU).std(axis=0).sum()
+            err = nx.sum(nx.std(V * KU, axis=0))
             # log and verbose print
             if log:
                 log['err'].append(err)
@@ -2106,9 +2112,11 @@ def _convolutional_barycenter2d_log(A, reg, weights=None, numItermax=10000,
     A = list_to_array(A)
 
     nx = get_backend(A)
-    if nx.__name__ == "jax":
-        raise NotImplementedError("Log-domain functions are not yet implemented"
-                                  " for Jax. Use numpy or torch arrays instead.")
+    if nx.__name__ in ("jax", "tf"):
+        raise NotImplementedError(
+            "Log-domain functions are not yet implemented"
+            " for Jax and TF. Use numpy or torch arrays instead."
+        )
 
     n_hists, width, height = A.shape
 
@@ -2298,13 +2306,15 @@ def _convolutional_barycenter2d_debiased(A, reg, weights=None, numItermax=10000,
         KV = convol_imgs(V)
         U = A / KV
         KU = convol_imgs(U)
-        bar = c * nx.exp((weights[:, None, None] * nx.log(KU + stabThr)).sum(axis=0))
+        bar = c * nx.exp(
+            nx.sum(weights[:, None, None] * nx.log(KU + stabThr), axis=0)
+        )
 
         for _ in range(10):
-            c = (c * bar / convol_imgs(c[None]).squeeze()) ** 0.5
+            c = (c * bar / nx.squeeze(convol_imgs(c[None]))) ** 0.5
 
         if ii % 10 == 9:
-            err = (V * KU).std(axis=0).sum()
+            err = nx.sum(nx.std(V * KU, axis=0))
             # log and verbose print
             if log:
                 log['err'].append(err)
@@ -2340,9 +2350,11 @@ def _convolutional_barycenter2d_debiased_log(A, reg, weights=None, numItermax=10
     A = list_to_array(A)
     n_hists, width, height = A.shape
     nx = get_backend(A)
-    if nx.__name__ == "jax":
-        raise NotImplementedError("Log-domain functions are not yet implemented"
-                                  " for Jax. Use numpy or torch arrays instead.")
+    if nx.__name__ in ("jax", "tf"):
+        raise NotImplementedError(
+            "Log-domain functions are not yet implemented"
+            " for Jax and TF. Use numpy or torch arrays instead."
+        )
     if weights is None:
         weights = nx.ones((n_hists,), type_as=A) / n_hists
     else:
@@ -2382,7 +2394,7 @@ def _convolutional_barycenter2d_debiased_log(A, reg, weights=None, numItermax=10
             c = 0.5 * (c + log_bar - convol_img(c))
 
         if ii % 10 == 9:
-            err = nx.exp(G + log_KU).std(axis=0).sum()
+            err = nx.sum(nx.std(nx.exp(G + log_KU), axis=0))
             # log and verbose print
             if log:
                 log['err'].append(err)
@@ -3312,9 +3324,9 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False,
     a, b, M = list_to_array(a, b, M)
 
     nx = get_backend(M, a, b)
-    if nx.__name__ == "jax":
-        raise TypeError("JAX arrays have been received but screenkhorn is not "
-                        "compatible with JAX.")
+    if nx.__name__ in ("jax", "tf"):
+        raise TypeError("JAX or TF arrays have been received but screenkhorn is not "
+                        "compatible with either JAX or TF.")
 
     ns, nt = M.shape
 
@@ -3328,7 +3340,7 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False,
     K = nx.exp(-M / reg)
 
     def projection(u, epsilon):
-        u[u <= epsilon] = epsilon
+        u = nx.maximum(u, epsilon)
         return u
 
     # ----------------------------------------------------------------------------------------------------------------#
diff --git a/ot/da.py b/ot/da.py
index 4fd97df..841f31a 100644
--- a/ot/da.py
+++ b/ot/da.py
@@ -906,7 +906,7 @@ def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, al
 
 
 def distribution_estimation_uniform(X):
-    """estimates a uniform distribution from an array of samples :math:`\mathbf{X}`
+    r"""estimates a uniform distribution from an array of samples :math:`\mathbf{X}`
 
     Parameters
     ----------
@@ -950,7 +950,7 @@ class BaseTransport(BaseEstimator):
     """
 
     def fit(self, Xs=None, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -1010,7 +1010,7 @@ class BaseTransport(BaseEstimator):
         return self
 
     def fit_transform(self, Xs=None, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
         and transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
 
@@ -1038,7 +1038,7 @@ class BaseTransport(BaseEstimator):
         return self.fit(Xs, ys, Xt, yt).transform(Xs, ys, Xt, yt)
 
     def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128):
-        """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
+        r"""Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
 
         Parameters
         ----------
@@ -1105,7 +1105,7 @@ class BaseTransport(BaseEstimator):
             return transp_Xs
 
     def transform_labels(self, ys=None):
-        """Propagate source labels :math:`\mathbf{y_s}` to obtain estimated target labels as in
+        r"""Propagate source labels :math:`\mathbf{y_s}` to obtain estimated target labels as in
         :ref:`[27] <references-basetransport-transform-labels>`.
 
         Parameters
@@ -1152,7 +1152,7 @@ class BaseTransport(BaseEstimator):
 
     def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None,
                           batch_size=128):
-        """Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}`
+        r"""Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}`
 
         Parameters
         ----------
@@ -1218,7 +1218,7 @@ class BaseTransport(BaseEstimator):
             return transp_Xt
 
     def inverse_transform_labels(self, yt=None):
-        """Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels
+        r"""Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels
         :math:`\mathbf{y_s}`
 
         Parameters
@@ -1307,7 +1307,7 @@ class LinearTransport(BaseTransport):
         self.distribution_estimation = distribution_estimation
 
     def fit(self, Xs=None, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -1354,7 +1354,7 @@ class LinearTransport(BaseTransport):
         return self
 
     def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128):
-        """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
+        r"""Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
 
         Parameters
         ----------
@@ -1387,7 +1387,7 @@ class LinearTransport(BaseTransport):
 
     def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None,
                           batch_size=128):
-        """Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}`
+        r"""Transports target samples :math:`\mathbf{X_t}` onto source samples :math:`\mathbf{X_s}`
 
         Parameters
         ----------
@@ -1493,7 +1493,7 @@ class SinkhornTransport(BaseTransport):
         self.out_of_sample_map = out_of_sample_map
 
     def fit(self, Xs=None, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -1592,7 +1592,7 @@ class EMDTransport(BaseTransport):
         self.max_iter = max_iter
 
     def fit(self, Xs, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -1711,7 +1711,7 @@ class SinkhornLpl1Transport(BaseTransport):
         self.limit_max = limit_max
 
     def fit(self, Xs, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -1839,7 +1839,7 @@ class EMDLaplaceTransport(BaseTransport):
         self.out_of_sample_map = out_of_sample_map
 
     def fit(self, Xs, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -1962,7 +1962,7 @@ class SinkhornL1l2Transport(BaseTransport):
         self.limit_max = limit_max
 
     def fit(self, Xs, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -2088,7 +2088,7 @@ class MappingTransport(BaseEstimator):
         self.verbose2 = verbose2
 
     def fit(self, Xs=None, ys=None, Xt=None, yt=None):
-        """Builds an optimal coupling and estimates the associated mapping
+        r"""Builds an optimal coupling and estimates the associated mapping
         from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
@@ -2146,7 +2146,7 @@ class MappingTransport(BaseEstimator):
         return self
 
     def transform(self, Xs):
-        """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
+        r"""Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
 
         Parameters
         ----------
@@ -2261,7 +2261,7 @@ class UnbalancedSinkhornTransport(BaseTransport):
         self.limit_max = limit_max
 
     def fit(self, Xs, ys=None, Xt=None, yt=None):
-        """Build a coupling matrix from source and target sets of samples
+        r"""Build a coupling matrix from source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -2373,7 +2373,7 @@ class JCPOTTransport(BaseTransport):
         self.out_of_sample_map = out_of_sample_map
 
     def fit(self, Xs, ys=None, Xt=None, yt=None):
-        """Building coupling matrices from a list of source and target sets of samples
+        r"""Building coupling matrices from a list of source and target sets of samples
         :math:`(\mathbf{X_s}, \mathbf{y_s})` and :math:`(\mathbf{X_t}, \mathbf{y_t})`
 
         Parameters
@@ -2419,7 +2419,7 @@ class JCPOTTransport(BaseTransport):
         return self
 
     def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128):
-        """Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
+        r"""Transports source samples :math:`\mathbf{X_s}` onto target ones :math:`\mathbf{X_t}`
 
         Parameters
         ----------
@@ -2491,7 +2491,7 @@ class JCPOTTransport(BaseTransport):
             return transp_Xs
 
     def transform_labels(self, ys=None):
-        """Propagate source labels :math:`\mathbf{y_s}` to obtain target labels as in
+        r"""Propagate source labels :math:`\mathbf{y_s}` to obtain target labels as in
         :ref:`[27] <references-jcpottransport-transform-labels>`
 
         Parameters
@@ -2542,7 +2542,7 @@ class JCPOTTransport(BaseTransport):
             return yt.T
 
     def inverse_transform_labels(self, yt=None):
-        """Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels
+        r"""Propagate target labels :math:`\mathbf{y_t}` to obtain estimated source labels
         :math:`\mathbf{y_s}`
 
         Parameters
diff --git a/ot/datasets.py b/ot/datasets.py
index ad6390c..a839074 100644
--- a/ot/datasets.py
+++ b/ot/datasets.py
@@ -41,7 +41,7 @@ def get_1D_gauss(n, m, sigma):
 
 
 def make_2D_samples_gauss(n, m, sigma, random_state=None):
-    """Return `n` samples drawn from 2D gaussian :math:`\mathcal{N}(m, \sigma)`
+    r"""Return `n` samples drawn from 2D gaussian :math:`\mathcal{N}(m, \sigma)`
 
     Parameters
     ----------
diff --git a/ot/dr.py b/ot/dr.py
index c2f51f8..1671ca0 100644
--- a/ot/dr.py
+++ b/ot/dr.py
@@ -16,6 +16,7 @@ Dimension reduction with OT
 
 from scipy import linalg
 import autograd.numpy as np
+from pymanopt.function import Autograd
 from pymanopt.manifolds import Stiefel
 from pymanopt import Problem
 from pymanopt.solvers import SteepestDescent, TrustRegions
@@ -181,6 +182,7 @@ def wda(X, y, p=2, reg=1, k=10, solver=None, maxiter=100, verbose=0, P0=None, no
     else:
         regmean = np.ones((len(xc), len(xc)))
 
+    @Autograd
     def cost(P):
         # wda loss
         loss_b = 0
diff --git a/ot/gromov.py b/ot/gromov.py
index ea667e4..6544260 100644
--- a/ot/gromov.py
+++ b/ot/gromov.py
@@ -822,8 +822,12 @@ def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
     index_k = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
     index_l = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
 
-    index_i = generator.choice(len_p, size=nb_samples_p, p=p, replace=False)
-    index_j = generator.choice(len_p, size=nb_samples_p, p=p, replace=False)
+    index_i = generator.choice(
+        len_p, size=nb_samples_p, p=nx.to_numpy(p), replace=False
+    )
+    index_j = generator.choice(
+        len_p, size=nb_samples_p, p=nx.to_numpy(p), replace=False
+    )
 
     for i in range(nb_samples_p):
         if nx.issparse(T):
@@ -836,13 +840,13 @@ def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
         index_k[i] = generator.choice(
             len_q,
             size=nb_samples_q,
-            p=T_indexi / nx.sum(T_indexi),
+            p=nx.to_numpy(T_indexi / nx.sum(T_indexi)),
             replace=True
         )
         index_l[i] = generator.choice(
             len_q,
             size=nb_samples_q,
-            p=T_indexj / nx.sum(T_indexj),
+            p=nx.to_numpy(T_indexj / nx.sum(T_indexj)),
             replace=True
         )
 
@@ -934,15 +938,17 @@ def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun,
     index = np.zeros(2, dtype=int)
 
     # Initialize with default marginal
-    index[0] = generator.choice(len_p, size=1, p=p)
-    index[1] = generator.choice(len_q, size=1, p=q)
+    index[0] = generator.choice(len_p, size=1, p=nx.to_numpy(p))
+    index[1] = generator.choice(len_q, size=1, p=nx.to_numpy(q))
     T = nx.tocsr(emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False))
 
     best_gw_dist_estimated = np.inf
     for cpt in range(max_iter):
-        index[0] = generator.choice(len_p, size=1, p=p)
+        index[0] = generator.choice(len_p, size=1, p=nx.to_numpy(p))
         T_index0 = nx.reshape(nx.todense(T[index[0], :]), (-1,))
-        index[1] = generator.choice(len_q, size=1, p=T_index0 / T_index0.sum())
+        index[1] = generator.choice(
+            len_q, size=1, p=nx.to_numpy(T_index0 / nx.sum(T_index0))
+        )
 
         if alpha == 1:
             T = nx.tocsr(
@@ -1071,13 +1077,16 @@ def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
     C_are_symmetric = nx.allclose(C1, C1.T, rtol=1e-10, atol=1e-10) and nx.allclose(C2, C2.T, rtol=1e-10, atol=1e-10)
 
     for cpt in range(max_iter):
-        index0 = generator.choice(len_p, size=nb_samples_grad_p, p=p, replace=False)
+        index0 = generator.choice(
+            len_p, size=nb_samples_grad_p, p=nx.to_numpy(p), replace=False
+        )
         Lik = 0
         for i, index0_i in enumerate(index0):
-            index1 = generator.choice(len_q,
-                                      size=nb_samples_grad_q,
-                                      p=T[index0_i, :] / nx.sum(T[index0_i, :]),
-                                      replace=False)
+            index1 = generator.choice(
+                len_q, size=nb_samples_grad_q,
+                p=nx.to_numpy(T[index0_i, :] / nx.sum(T[index0_i, :])),
+                replace=False
+            )
             # If the matrices C are not symmetric, the gradient has 2 terms, thus the term is chosen randomly.
             if (not C_are_symmetric) and generator.rand(1) > 0.5:
                 Lik += nx.mean(loss_fun(
@@ -1359,6 +1368,8 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
     -------
     C : array-like, shape (`N`, `N`)
         Similarity matrix in the barycenter space (permutated arbitrarily)
+    log : dict
+        Log dictionary of the errors recorded during iterations. Returned only if `log=True`.
 
     References
     ----------
@@ -1392,7 +1403,7 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
         Cprev = C
 
         T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon,
-                                         max_iter, 1e-4, verbose, log) for s in range(S)]
+                                         max_iter, 1e-4, verbose, log=False) for s in range(S)]
         if loss_fun == 'square_loss':
             C = update_square_loss(p, lambdas, T, Cs)
 
@@ -1405,9 +1416,6 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
             err = nx.norm(C - Cprev)
             error.append(err)
 
-            if log:
-                log['err'].append(err)
-
             if verbose:
                 if cpt % 200 == 0:
                     print('{:5s}|{:12s}'.format(
@@ -1416,7 +1424,10 @@ def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
 
         cpt += 1
 
-    return C
+    if log:
+        return C, {"err": error}
+    else:
+        return C
 
 
 def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
@@ -1470,6 +1481,8 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
     -------
     C : array-like, shape (`N`, `N`)
         Similarity matrix in the barycenter space (permutated arbitrarily)
+    log : dict
+        Log dictionary of the errors recorded during iterations. Returned only if `log=True`.
 
     References
     ----------
@@ -1504,7 +1517,7 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
         Cprev = C
 
         T = [gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun,
-                                numItermax=max_iter, stopThr=1e-5, verbose=verbose, log=log) for s in range(S)]
+                                numItermax=max_iter, stopThr=1e-5, verbose=verbose, log=False) for s in range(S)]
         if loss_fun == 'square_loss':
             C = update_square_loss(p, lambdas, T, Cs)
 
@@ -1517,9 +1530,6 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
             err = nx.norm(C - Cprev)
             error.append(err)
 
-            if log:
-                log['err'].append(err)
-
             if verbose:
                 if cpt % 200 == 0:
                     print('{:5s}|{:12s}'.format(
@@ -1528,7 +1538,10 @@ def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
 
         cpt += 1
 
-    return C
+    if log:
+        return C, {"err": error}
+    else:
+        return C
 
 
 def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False,
diff --git a/ot/lp/solver_1d.py b/ot/lp/solver_1d.py
index 8b4d0c3..43763a9 100644
--- a/ot/lp/solver_1d.py
+++ b/ot/lp/solver_1d.py
@@ -100,11 +100,11 @@ def wasserstein_1d(u_values, v_values, u_weights=None, v_weights=None, p=1, requ
     m = v_values.shape[0]
 
     if u_weights is None:
-        u_weights = nx.full(u_values.shape, 1. / n)
+        u_weights = nx.full(u_values.shape, 1. / n, type_as=u_values)
     elif u_weights.ndim != u_values.ndim:
         u_weights = nx.repeat(u_weights[..., None], u_values.shape[-1], -1)
     if v_weights is None:
-        v_weights = nx.full(v_values.shape, 1. / m)
+        v_weights = nx.full(v_values.shape, 1. / m, type_as=v_values)
     elif v_weights.ndim != v_values.ndim:
         v_weights = nx.repeat(v_weights[..., None], v_values.shape[-1], -1)
 
diff --git a/ot/optim.py b/ot/optim.py
index bd8ca26..f25e2c9 100644
--- a/ot/optim.py
+++ b/ot/optim.py
@@ -18,8 +18,10 @@ from .backend import get_backend
 # The corresponding scipy function does not work for matrices
 
 
-def line_search_armijo(f, xk, pk, gfk, old_fval,
-                       args=(), c1=1e-4, alpha0=0.99):
+def line_search_armijo(
+    f, xk, pk, gfk, old_fval, args=(), c1=1e-4,
+    alpha0=0.99, alpha_min=None, alpha_max=None
+):
     r"""
     Armijo linesearch function that works with matrices
 
@@ -44,6 +46,10 @@ def line_search_armijo(f, xk, pk, gfk, old_fval,
         :math:`c_1` const in armijo rule (>0)
     alpha0 : float, optional
         initial step (>0)
+    alpha_min : float, optional
+        minimum value for alpha
+    alpha_max : float, optional
+        maximum value for alpha
 
     Returns
     -------
@@ -77,14 +83,18 @@ def line_search_armijo(f, xk, pk, gfk, old_fval,
     alpha, phi1 = scalar_search_armijo(
         phi, phi0, derphi0, c1=c1, alpha0=alpha0)
 
-    # scalar_search_armijo can return alpha > 1
-    if alpha is not None:
-        alpha = min(1, alpha)
-    return alpha, fc[0], phi1
+    if alpha is None:
+        return 0., fc[0], phi0
+    else:
+        if alpha_min is not None or alpha_max is not None:
+            alpha = np.clip(alpha, alpha_min, alpha_max)
+        return float(alpha), fc[0], phi1
 
 
-def solve_linesearch(cost, G, deltaG, Mi, f_val,
-                     armijo=True, C1=None, C2=None, reg=None, Gc=None, constC=None, M=None):
+def solve_linesearch(
+    cost, G, deltaG, Mi, f_val, armijo=True, C1=None, C2=None,
+    reg=None, Gc=None, constC=None, M=None, alpha_min=None, alpha_max=None
+):
     """
     Solve the linesearch in the FW iterations
 
@@ -115,6 +125,10 @@ def solve_linesearch(cost, G, deltaG, Mi, f_val,
         Constant for the gromov cost. See :ref:`[24] <references-solve-linesearch>`. Only used and necessary when armijo=False
     M : array-like (ns,nt), optional
         Cost matrix between the features. Only used and necessary when armijo=False
+    alpha_min : float, optional
+        Minimum value for alpha
+    alpha_max : float, optional
+        Maximum value for alpha
 
     Returns
     -------
@@ -134,7 +148,9 @@ def solve_linesearch(cost, G, deltaG, Mi, f_val,
         International Conference on Machine Learning (ICML). 2019.
     """
     if armijo:
-        alpha, fc, f_val = line_search_armijo(cost, G, deltaG, Mi, f_val)
+        alpha, fc, f_val = line_search_armijo(
+            cost, G, deltaG, Mi, f_val, alpha_min=alpha_min, alpha_max=alpha_max
+        )
     else:  # requires symetric matrices
         G, deltaG, C1, C2, constC, M = list_to_array(G, deltaG, C1, C2, constC, M)
         if isinstance(M, int) or isinstance(M, float):
@@ -148,6 +164,8 @@ def solve_linesearch(cost, G, deltaG, Mi, f_val,
         c = cost(G)
 
         alpha = solve_1d_linesearch_quad(a, b, c)
+        if alpha_min is not None or alpha_max is not None:
+            alpha = np.clip(alpha, alpha_min, alpha_max)
         fc = None
         f_val = cost(G + alpha * deltaG)
 
@@ -272,9 +290,10 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, numItermaxEmd=100000,
         deltaG = Gc - G
 
         # line search
-        alpha, fc, f_val = solve_linesearch(cost, G, deltaG, Mi, f_val, reg=reg, M=M, Gc=Gc, **kwargs)
-        if alpha is None:
-            alpha = 0.0
+        alpha, fc, f_val = solve_linesearch(
+            cost, G, deltaG, Mi, f_val, reg=reg, M=M, Gc=Gc,
+            alpha_min=0., alpha_max=1., **kwargs
+        )
 
         G = G + alpha * deltaG
 
@@ -420,7 +439,9 @@ def gcg(a, b, M, reg1, reg2, f, df, G0=None, numItermax=10,
 
         # line search
         dcost = Mi + reg1 * (1 + nx.log(G))  # ??
-        alpha, fc, f_val = line_search_armijo(cost, G, deltaG, dcost, f_val)
+        alpha, fc, f_val = line_search_armijo(
+            cost, G, deltaG, dcost, f_val, alpha_min=0., alpha_max=1.
+        )
 
         G = G + alpha * deltaG
 
diff --git a/ot/plot.py b/ot/plot.py
index 3e3bed7..2208c90 100644
--- a/ot/plot.py
+++ b/ot/plot.py
@@ -18,7 +18,7 @@ from matplotlib import gridspec
 
 
 def plot1D_mat(a, b, M, title=''):
-    """ Plot matrix :math:`\mathbf{M}`  with the source and target 1D distribution
+    r""" Plot matrix :math:`\mathbf{M}`  with the source and target 1D distribution
 
     Creates a subplot with the source distribution :math:`\mathbf{a}` on the left and
     target distribution :math:`\mathbf{b}` on the top. The matrix :math:`\mathbf{M}` is shown in between.
@@ -61,7 +61,7 @@ def plot1D_mat(a, b, M, title=''):
 
 
 def plot2D_samples_mat(xs, xt, G, thr=1e-8, **kwargs):
-    """ Plot matrix :math:`\mathbf{G}` in 2D with lines using alpha values
+    r""" Plot matrix :math:`\mathbf{G}` in 2D with lines using alpha values
 
     Plot lines between source and target 2D samples with a color
     proportional to the value of the matrix :math:`\mathbf{G}` between samples.
diff --git a/ot/utils.py b/ot/utils.py
index c878563..e6c93c8 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -182,7 +182,7 @@ def euclidean_distances(X, Y, squared=False):
     return c
 
 
-def dist(x1, x2=None, metric='sqeuclidean', p=2):
+def dist(x1, x2=None, metric='sqeuclidean', p=2, w=None):
     r"""Compute distance between samples in :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}`
 
     .. note:: This function is backend-compatible and will work on arrays
@@ -202,6 +202,10 @@ def dist(x1, x2=None, metric='sqeuclidean', p=2):
         'euclidean', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
         'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
         'sokalmichener', 'sokalsneath', 'sqeuclidean', 'wminkowski', 'yule'.
+    p : float, optional
+        p-norm for the Minkowski and the Weighted Minkowski metrics. Default value is 2.
+    w : array-like, rank 1, optional
+        Weights for the weighted metrics. Default value is None.
 
 
     Returns
@@ -221,7 +225,9 @@ def dist(x1, x2=None, metric='sqeuclidean', p=2):
         if not get_backend(x1, x2).__name__ == 'numpy':
             raise NotImplementedError()
         else:
-            return cdist(x1, x2, metric=metric, p=p)
+            if metric.endswith("minkowski"):
+                return cdist(x1, x2, metric=metric, p=p, w=w)
+            return cdist(x1, x2, metric=metric, w=w)
 
 
 def dist0(n, method='lin_square'):