Diffstat (limited to 'docs/source')
-rw-r--r--  docs/source/_templates/versions.html           43
-rw-r--r--  docs/source/all.rst                              2
-rw-r--r--  docs/source/auto_examples/images/bak.png       bin 0 -> 304669 bytes
-rw-r--r--  docs/source/auto_examples/images/sinkhorn.png  bin 0 -> 37204 bytes
-rw-r--r--  docs/source/conf.py                              8
-rw-r--r--  docs/source/quickstart.rst                     522
-rw-r--r--  docs/source/readme.rst                         189
-rw-r--r--  docs/source/releases.rst                       134
8 files changed, 690 insertions, 208 deletions
diff --git a/docs/source/_templates/versions.html b/docs/source/_templates/versions.html
new file mode 100644
index 0000000..10d60d7
--- /dev/null
+++ b/docs/source/_templates/versions.html
@@ -0,0 +1,43 @@
+<div class="rst-versions shift-up" data-toggle="rst-versions" role="note" aria-label="versions">
+ <span class="rst-current-version" data-toggle="rst-current-version">
+ <span class="fa fa-book"> Python Optimal Transport</span>
+ versions
+ <span class="fa fa-caret-down"></span>
+ </span>
+ <div class="rst-other-versions"><!-- Inserted RTD Footer -->
+
+<div class="injected">
+
+
+
+ <dl>
+ <dt>Versions</dt>
+
+ <dd><a href="https://pythonot.github.io/master">latest</a></dd>
+
+ <dd><a href="https://pythonot.github.io/">stable</a></dd>
+
+ </dl>
+
+
+
+
+ <dl>
+ <dt>On GitHub</dt>
+ <dd>
+ <a href="https://github.com/PythonOT/POT">Code on Github</a>
+ </dd>
+
+ </dl>
+
+
+
+
+
+ <hr>
+
+
+
+</div>
+</div>
+ </div>
\ No newline at end of file
diff --git a/docs/source/all.rst b/docs/source/all.rst
index d7b878f..6a07599 100644
--- a/docs/source/all.rst
+++ b/docs/source/all.rst
@@ -14,6 +14,7 @@ API and modules
:template: module.rst
lp
+ backend
bregman
smooth
gromov
@@ -27,6 +28,7 @@ API and modules
stochastic
unbalanced
partial
+ sliced
.. autosummary::
:toctree: ../modules/generated/
diff --git a/docs/source/auto_examples/images/bak.png b/docs/source/auto_examples/images/bak.png
new file mode 100644
index 0000000..25e7e8e
--- /dev/null
+++ b/docs/source/auto_examples/images/bak.png
Binary files differ
diff --git a/docs/source/auto_examples/images/sinkhorn.png b/docs/source/auto_examples/images/sinkhorn.png
new file mode 100644
index 0000000..e003e13
--- /dev/null
+++ b/docs/source/auto_examples/images/sinkhorn.png
Binary files differ
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 384bf40..9b5a719 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -92,7 +92,7 @@ master_doc = 'index'
# General information about the project.
project = u'POT Python Optimal Transport'
-copyright = u'2016-2020, Rémi Flamary, Nicolas Courty'
+copyright = u'2016-2021, Rémi Flamary, Nicolas Courty'
author = u'Rémi Flamary, Nicolas Courty'
# The version info for the project you're documenting, acts as replacement for
@@ -162,7 +162,7 @@ html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
@@ -337,7 +337,8 @@ texinfo_documents = [
intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None),
- 'matplotlib': ('http://matplotlib.org/', None)}
+ 'matplotlib': ('http://matplotlib.org/', None),
+ 'torch': ('https://pytorch.org/docs/stable/', None)}
sphinx_gallery_conf = {
'examples_dirs': ['../../examples', '../../examples/da'],
@@ -345,6 +346,7 @@ sphinx_gallery_conf = {
'backreferences_dir': 'gen_modules/backreferences',
'inspect_global_variables' : True,
'doc_module' : ('ot','numpy','scipy','pylab'),
+ 'matplotlib_animations': True,
'reference_url': {
'ot': None}
}
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
index d56f812..232df7b 100644
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -7,19 +7,175 @@ to use for different problems related to optimal transport (OT) and machine
learning. We refer when we can to concrete examples in the documentation that
are also available as notebooks on the POT Github.
-This document is not a tutorial on numerical optimal transport. For this we strongly
-recommend to read the very nice book [15]_ .
+.. note::
+
+ For a good introduction to numerical optimal transport we refer the reader
+ to `the book <https://arxiv.org/pdf/1803.00567.pdf>`_ by Peyré and Cuturi
+ [15]_. For more detailed introduction to OT and how it can be used
+ in ML applications we refer the reader to the following `OTML tutorial
+ <https://remi.flamary.com/cours/tuto_otml.html>`_.
+
+.. note::
+
+ Since version 0.8, POT provides a backend to automatically solve some OT
+ problems independently from the toolbox used by the user (numpy/torch/jax).
+ We discuss which functions are compatible in the
+ `Backend section <#solving-ot-with-multiple-backends>`_ below.
+
+
+Why Optimal Transport?
+-----------------------
+
+
+When to use OT
+^^^^^^^^^^^^^^
+
+Optimal Transport (OT) is a mathematical problem introduced by Gaspard Monge in
+1781 that aims at finding the most efficient way to move mass between
+distributions. The cost of moving a unit of mass between two positions is called
+the ground cost and the objective is to minimize the overall cost of moving one
+mass distribution onto another one. The optimization problem can be expressed
+for two distributions :math:`\mu_s` and :math:`\mu_t` as
+
+.. math::
+ \min_{m, m \# \mu_s = \mu_t} \int c(x,m(x))d\mu_s(x) ,
+
+where :math:`c(\cdot,\cdot)` is the ground cost and the constraint
+:math:`m \# \mu_s = \mu_t` ensures that :math:`\mu_s` is completely transported to :math:`\mu_t`.
+This problem is particularly difficult to solve because of this constraint and
+has been replaced in practice (on discrete distributions) by an easier-to-solve
+linear program. It corresponds to the Kantorovitch formulation, where the
+Monge mapping :math:`m` is replaced by a joint distribution
+(the OT matrix expressed in the next section; see :ref:`kantorovitch_solve`).
+
+From the optimization problem above we can see that there are two main aspects
+to the OT solution that can be used in practical applications:
+
+- The optimal value (Wasserstein distance): Measures similarity between distributions.
+- The optimal mapping (Monge mapping, OT matrix): Finds correspondences between distributions.
+
+
+In the first case, OT can be used to measure similarity between distributions
+(or datasets), in this case the Wasserstein distance (the optimal value of the
+problem) is used. In the second case one can be interested in the way the mass
+is moved between the distributions (the mapping). This mapping can then be used
+to transfer knowledge between distributions.
+
+
+Wasserstein distance between distributions
+""""""""""""""""""""""""""""""""""""""""""
+
+OT is often used to measure similarity between distributions, especially
+when they do not share the same support. When the supports of the
+distributions are disjoint, OT-based Wasserstein distances compare favorably to
+popular f-divergences such as the Kullback-Leibler and Jensen-Shannon
+divergences and the Total Variation distance. What is particularly interesting
+for data science applications is that one can compute meaningful sub-gradients
+of the Wasserstein distance. For these reasons it became a very efficient tool
+for machine learning applications that need to measure and optimize similarity
+between empirical distributions.
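+
+As a hedged sketch (assuming POT >= 0.8 with the Pytorch backend discussed
+below; data and sizes are illustrative), one can differentiate the Wasserstein
+distance with respect to the sample positions:
+
+.. code:: python
+
+    import torch
+    import ot
+
+    xs = torch.randn(50, 2, requires_grad=True)  # source samples
+    xt = torch.randn(60, 2)                      # target samples
+    a = torch.ones(50) / 50                      # uniform weights
+    b = torch.ones(60) / 60
+
+    M = ot.dist(xs, xt)      # squared Euclidean ground cost
+    loss = ot.emd2(a, b, M)  # OT value between the empirical measures
+    loss.backward()          # sub-gradient ends up in xs.grad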
+
+
+Numerous contributions make use of this approach in the machine learning (ML)
+literature. For example OT was used for training `Generative
+Adversarial Networks (GANs) <https://arxiv.org/pdf/1701.07875.pdf>`_
+in order to overcome the vanishing gradient problem. It has also
+been used to find `discriminant <https://arxiv.org/pdf/1608.08063.pdf>`_ or
+`robust <https://arxiv.org/pdf/1901.08949.pdf>`_ subspaces for a dataset. The
+Wasserstein distance has also been used to measure `similarity between word
+embeddings of documents <http://proceedings.mlr.press/v37/kusnerb15.pdf>`_ or
+between `signals
+<https://www.math.ucdavis.edu/~saito/data/acha.read.s19/kolouri-etal_optimal-mass-transport.pdf>`_
+or `spectra <https://arxiv.org/pdf/1609.09799.pdf>`_.
+
+
+
+OT for mapping estimation
+"""""""""""""""""""""""""
+
+A very interesting aspect of the OT problem is the OT mapping in itself. When
+computing optimal transport between discrete distributions, one output is the OT
+matrix that provides correspondences between the samples in each
+distribution.
+
+
+This correspondence is estimated with respect to the OT criterion and is found
+in an unsupervised way, which makes it very interesting for transfer problems
+between datasets. It has been used to perform
+`color transfer between images <https://arxiv.org/pdf/1307.5551.pdf>`_ or in
+the context of `domain adaptation <https://arxiv.org/pdf/1507.00504.pdf>`_.
+More recent applications include the use of an extension of OT (Gromov-Wasserstein)
+to find correspondences between languages in `word embeddings
+<https://arxiv.org/pdf/1809.00013.pdf>`_.
+
+
+When to use POT
+^^^^^^^^^^^^^^^
+
+
+The main objective of POT is to provide OT solvers for the rapidly growing area
+of OT in the context of machine learning. To this end we implement a number of
+solvers that have been proposed in research papers. Doing so we aim to promote
+reproducible research and foster novel developments.
+
+
+One very important aspect of POT is its ability to be easily extended. For
+instance we provide a very generic OT solver :any:`ot.optim.cg` that can solve
+OT problems with any smooth/continuous regularization term, making it
+particularly practical for research purposes. Note that this generic solver has
+been used to solve both graph Laplacian regularization OT and Gromov
+Wasserstein [30]_.
+
+
+.. note::
+
+ POT was originally designed to solve OT problems with a Numpy interface.
+ Since version 0.8, the backend described below allows using the most common
+ solvers directly with Pytorch or Jax arrays.
+
+
+When not to use POT
+"""""""""""""""""""
+
+While POT provides, to the best of our knowledge, some of the most efficient exact OT
+solvers, it has not been designed to handle large-scale OT problems. For
+instance, the memory cost for an OT problem is always :math:`\mathcal{O}(n^2)`
+because the cost matrix has to be computed. The exact solver is of time
+complexity :math:`\mathcal{O}(n^3\log(n))` and the Sinkhorn solver has been
+proven to be nearly :math:`\mathcal{O}(n^2)`, which is still too costly for very
+large-scale problems.
+
+
+If you need to solve OT with a large number of samples, we recommend using
+entropic regularization and the memory-efficient implementation of Sinkhorn
+proposed in `GeomLoss <https://www.kernel-operations.io/geomloss/>`_. This
+implementation is compatible with Pytorch and can handle a large number of
+samples. Another approach to estimate the Wasserstein distance for a very large
+number of samples is to use the trick from `Wasserstein GAN
+<https://arxiv.org/pdf/1701.07875.pdf>`_ that solves the problem
+in the dual with a neural network estimating the dual variable. Note that in this
+case you are only solving an approximation of the Wasserstein distance because
+the 1-Lipschitz constraint on the dual cannot be enforced exactly (it is
+approximated through weight clipping or regularization). Finally note that, in
+order to avoid solving large-scale OT problems, a number of recent approaches
+minimize the expected Wasserstein distance on minibatches, which is different
+from the Wasserstein distance but has better computational and
+`statistical properties <https://arxiv.org/pdf/1910.04091.pdf>`_.
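+
+A minimal sketch of this minibatch estimator (the helper below is
+illustrative, not a POT API):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    def minibatch_ot_loss(xs, xt, batch_size=128, n_batches=10, seed=0):
+        # average the exact OT loss over random minibatch pairs
+        rng = np.random.default_rng(seed)
+        a = np.ones(batch_size) / batch_size  # uniform minibatch weights
+        losses = []
+        for _ in range(n_batches):
+            i = rng.choice(len(xs), batch_size, replace=False)
+            j = rng.choice(len(xt), batch_size, replace=False)
+            M = ot.dist(xs[i], xt[j])
+            losses.append(ot.emd2(a, a, M))
+        return np.mean(losses)
+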
Optimal transport and Wasserstein distance
------------------------------------------
.. note::
+
In POT, most functions that solve OT or regularized OT problems have two
versions that return the OT matrix or the value of the optimal solution. For
- instance :any:`ot.emd` return the OT matrix and :any:`ot.emd2` return the
+ instance :any:`ot.emd` returns the OT matrix and :any:`ot.emd2` returns the
Wasserstein distance. This approach has been implemented in practice for all
- solvers that return an OT matrix (even Gromov-Wasserstsein)
+ solvers that return an OT matrix (even Gromov-Wasserstein).
+
+.. _kantorovitch_solve:
Solving optimal transport
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -28,30 +184,31 @@ The optimal transport problem between discrete distributions is often expressed
as
.. math::
- \gamma^* = arg\min_\gamma \quad \sum_{i,j}\gamma_{i,j}M_{i,j}
+ \gamma^* = arg\min_{\gamma \in \mathbb{R}_+^{m\times n}} \quad \sum_{i,j}\gamma_{i,j}M_{i,j}
s.t. \gamma 1 = a; \gamma^T 1= b; \gamma\geq 0
-where :
+where:
-- :math:`M\in\mathbb{R}_+^{m\times n}` is the metric cost matrix defining the cost to move mass from bin :math:`a_i` to bin :math:`b_j`.
-- :math:`a` and :math:`b` are histograms on the simplex (positive, sum to 1) that represent the
-weights of each samples in the source an target distributions.
+ - :math:`M\in\mathbb{R}_+^{m\times n}` is the metric cost matrix defining the cost to move mass from bin :math:`a_i` to bin :math:`b_j`.
+
+ - :math:`a` and :math:`b` are histograms on the simplex (positive, sum to 1) that represent the weights of the samples in the source and target distributions.
Solving the linear program above can be done using the function :any:`ot.emd`
that will return the optimal transport matrix :math:`\gamma^*`:
.. code:: python
- # a,b are 1D histograms (sum to 1 and positive)
+ # a and b are 1D histograms (sum to 1 and positive)
# M is the ground cost matrix
- T=ot.emd(a,b,M) # exact linear program
+ T = ot.emd(a, b, M) # exact linear program
-The method implemented for solving the OT problem is the network simplex, it is
-implemented in C from [1]_. It has a complexity of :math:`O(n^3)` but the
+The method implemented for solving the OT problem is the network simplex. It is
+implemented in C from [1]_. It has a complexity of :math:`O(n^3)` but the
solver is quite efficient and uses sparsity of the solution.
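+
+As a slightly more complete sketch (the data here is synthetic), the inputs
+can be built directly from samples:
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    xs = np.random.randn(30, 2)       # source samples
+    xt = np.random.randn(40, 2) + 2.  # target samples
+    a = np.ones(30) / 30              # uniform source weights
+    b = np.ones(40) / 40              # uniform target weights
+
+    M = ot.dist(xs, xt)               # squared Euclidean cost by default
+    T = ot.emd(a, b, M)               # OT matrix of shape (30, 40)
+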
.. hint::
+
Examples of use for :any:`ot.emd` are available in :
- :any:`auto_examples/plot_OT_2D_samples`
@@ -62,10 +219,11 @@ solver is quite efficient and uses sparsity of the solution.
Computing Wasserstein distance
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-The value of the OT solution is often more of interest than the OT matrix :
+The value of the OT solution is often more interesting than the OT matrix:
.. math::
- OT(a,b)=\min_\gamma \quad \sum_{i,j}\gamma_{i,j}M_{i,j}
+
+ OT(a,b) = \min_{\gamma \in \mathbb{R}_+^{m\times n}} \quad \sum_{i,j}\gamma_{i,j}M_{i,j}
s.t. \gamma 1 = a; \gamma^T 1= b; \gamma\geq 0
@@ -75,9 +233,9 @@ It can be computed from an already estimated OT matrix with
.. code:: python
- # a,b are 1D histograms (sum to 1 and positive)
+ # a and b are 1D histograms (sum to 1 and positive)
# M is the ground cost matrix
- W=ot.emd2(a,b,M) # Wasserstein distance / EMD value
+ W = ot.emd2(a, b, M) # Wasserstein distance / EMD value
Note that the well known `Wasserstein distance
<https://en.wikipedia.org/wiki/Wasserstein_metric>`_ between distributions a and
@@ -86,19 +244,19 @@ b is defined as
.. math::
- W_p(a,b)=(\min_\gamma \sum_{i,j}\gamma_{i,j}\|x_i-y_j\|_p)^\frac{1}{p}
+ W_p(a,b)=(\min_{\gamma \in \mathbb{R}_+^{m\times n}} \sum_{i,j}\gamma_{i,j}\|x_i-y_j\|_p)^\frac{1}{p}
s.t. \gamma 1 = a; \gamma^T 1= b; \gamma\geq 0
This means that if you want to compute the :math:`W_2` you need to compute the
square root of :any:`ot.emd2` when providing
-:code:`M=ot.dist(xs,xt)` that use the squared euclidean distance by default. Computing
-the :math:`W_1` wasserstein distance can be done directly with :any:`ot.emd2`
-when providing :code:`M=ot.dist(xs,xt, metric='euclidean')` to use the euclidean
+:code:`M = ot.dist(xs, xt)`, which uses the squared Euclidean distance by default. Computing
+the :math:`W_1` Wasserstein distance can be done directly with :any:`ot.emd2`
+when providing :code:`M = ot.dist(xs, xt, metric='euclidean')` to use the Euclidean
distance.
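+
+A short sketch of the two cases above (:code:`xs`, :code:`xt` are samples and
+:code:`a`, :code:`b` their uniform weights, as in the previous sketch):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    # W_2: square root of ot.emd2 with the default squared Euclidean cost
+    W2 = np.sqrt(ot.emd2(a, b, ot.dist(xs, xt)))
+    # W_1: ot.emd2 with the Euclidean ground cost
+    W1 = ot.emd2(a, b, ot.dist(xs, xt, metric='euclidean'))
+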
-
.. hint::
+
An example of use for :any:`ot.emd2` is available in :
- :any:`auto_examples/plot_compute_emd`
@@ -123,9 +281,9 @@ Another special case for estimating OT and Monge mapping is between Gaussian
distributions. In this case there exists a closed-form solution given in Remark
2.29 in [15]_ and the Monge mapping is an affine function and can be
also computed from the covariances and means of the source and target
-distributions. In the case when the finite sample dataset is supposed gaussian, we provide
-:any:`ot.da.OT_mapping_linear` that returns the parameters for the Monge
-mapping.
+distributions. When the finite-sample dataset is assumed to be Gaussian,
+we provide :any:`ot.da.OT_mapping_linear` that returns the parameters of the
+Monge mapping.
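+
+A hedged sketch of the Gaussian case (the data is synthetic and, following the
+POT examples, the mapping is applied to row vectors):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    xs = np.random.randn(200, 2)                   # roughly Gaussian source
+    xt = xs @ np.array([[2., 0.], [0., .5]]) + 4.  # affinely transformed target
+
+    A, b = ot.da.OT_mapping_linear(xs, xt)  # parameters of T(x) = Ax + b
+    xs_mapped = xs @ A + b                  # apply the estimated Monge mapping
+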
Regularized Optimal Transport
@@ -136,7 +294,7 @@ computational and statistical properties.
We address in this section the regularized OT problems that can be expressed as
.. math::
- \gamma^* = arg\min_\gamma \quad \sum_{i,j}\gamma_{i,j}M_{i,j} + \lambda\Omega(\gamma)
+ \gamma^* = arg\min_{\gamma \in \mathbb{R}_+^{m\times n}} \quad \sum_{i,j}\gamma_{i,j}M_{i,j} + \lambda\Omega(\gamma)
s.t. \gamma 1 = a; \gamma^T 1= b; \gamma\geq 0
@@ -175,8 +333,8 @@ solution of the resulting optimization problem can be expressed as:
where :math:`u` and :math:`v` are vectors and :math:`K=\exp(-M/\lambda)` where
the :math:`\exp` is taken component-wise. In order to solve the optimization
-problem, on can use an alternative projection algorithm called Sinkhorn-Knopp that can be very
-efficient for large values if regularization.
+problem, one can use an alternating projection algorithm called Sinkhorn-Knopp
+that can be very efficient for large values of regularization.
The Sinkhorn-Knopp algorithm is implemented in :any:`ot.sinkhorn` and
:any:`ot.sinkhorn2` that return respectively the OT matrix and the value of the
@@ -184,10 +342,10 @@ linear term. Note that the regularization parameter :math:`\lambda` in the
equation above is given to those functions with the parameter :code:`reg`.
>>> import ot
- >>> a=[.5,.5]
- >>> b=[.5,.5]
- >>> M=[[0.,1.],[1.,0.]]
- >>> ot.sinkhorn(a,b,M,1)
+ >>> a = [.5, .5]
+ >>> b = [.5, .5]
+ >>> M = [[0., 1.], [1., 0.]]
+ >>> ot.sinkhorn(a, b, M, 1)
array([[ 0.36552929, 0.13447071],
[ 0.13447071, 0.36552929]])
@@ -195,20 +353,27 @@ More details about the algorithms used are given in the following note.
.. note::
The main function to solve entropic regularized OT is :any:`ot.sinkhorn`.
- This function is a wrapper and the parameter :code:`method` help you select
+ This function is a wrapper and the parameter :code:`method` allows you to select
the actual algorithm used to solve the problem:
+ :code:`method='sinkhorn'` calls :any:`ot.bregman.sinkhorn_knopp` the
classic algorithm [2]_.
+ + :code:`method='sinkhorn_log'` calls :any:`ot.bregman.sinkhorn_log` the
+ Sinkhorn algorithm in log space [2]_ that is more stable but can be
+ slower in numpy since :code:`logsumexp` is not implemented in parallel.
+ It is the recommended solver for applications that require
+ differentiability with a small number of iterations.
+ :code:`method='sinkhorn_stabilized'` calls :any:`ot.bregman.sinkhorn_stabilized` the
log stabilized version of the algorithm [9]_.
+ :code:`method='sinkhorn_epsilon_scaling'` calls
:any:`ot.bregman.sinkhorn_epsilon_scaling` the epsilon scaling version
of the algorithm [9]_.
+ :code:`method='greenkhorn'` calls :any:`ot.bregman.greenkhorn` the
- greedy sinkhorn verison of the algorithm [22]_.
+ greedy Sinkhorn version of the algorithm [22]_.
+ + :code:`method='screenkhorn'` calls :any:`ot.bregman.screenkhorn` the
+ screening Sinkhorn version of the algorithm [26]_.
- In addition to all those variants of sinkhorn, we have another
+ In addition to all those variants of Sinkhorn, we have another
implementation solving the problem in the smooth dual or semi-dual in
:any:`ot.smooth`. This solver uses the :any:`scipy.optimize.minimize`
function to solve the smooth problem with the :code:`L-BFGS-B` algorithm. To use
@@ -216,12 +381,31 @@ More details about the algorithms used are given in the following note.
:any:`ot.smooth.smooth_ot_semi_dual` with parameter :code:`reg_type='kl'` to
choose entropic/Kullback-Leibler regularization.
+ **Choosing a Sinkhorn solver**
+
+ By default, and when using a regularization parameter that is not too small,
+ the default Sinkhorn solver should be enough. If you need to use a small
+ regularization to get sharper OT matrices, you should use the
+ :any:`ot.bregman.sinkhorn_stabilized` solver that will avoid numerical
+ errors. This last solver can be very slow in practice and might not even
+ converge to a reasonable OT matrix in a finite time. This is why
+ :any:`ot.bregman.sinkhorn_epsilon_scaling`, which relies on iterating the
+ value of the regularization (and using warm starts), sometimes leads to
+ better solutions. Note that the greedy version of Sinkhorn,
+ :any:`ot.bregman.greenkhorn`, can also lead to a speedup, and the screening
+ version, :any:`ot.bregman.screenkhorn`, aims at providing a fast
+ approximation of the Sinkhorn problem. For GPU use and gradient computation
+ with a small number of iterations we strongly recommend the
+ :any:`ot.bregman.sinkhorn_log` solver that does not need to check for
+ numerical problems.
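+
+As a hedged illustration, the variant is selected through the :code:`method`
+parameter (reusing :code:`a`, :code:`b`, :code:`M` from the example above; the
+values of :code:`reg` are arbitrary):
+
+.. code:: python
+
+    T = ot.sinkhorn(a, b, M, reg=1., method='sinkhorn')        # classic solver
+    T = ot.sinkhorn(a, b, M, reg=1e-2, method='sinkhorn_log')  # stable in log space
+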
-Recently [23]_ introduced the sinkhorn divergence that build from entropic
+
+
+Recently Genevay et al. [23]_ introduced the Sinkhorn divergence, which builds on entropic
regularization to compute fast and differentiable geometric divergence between
-empirical distributions. Note that we provide a function that compute directly
-(with no need to pre compute the :code:`M` matrix)
-the sinkhorn divergence for empirical distributions in
+empirical distributions. Note that we provide a function that computes directly
+(with no need to precompute the :code:`M` matrix)
+the Sinkhorn divergence for empirical distributions in
:any:`ot.bregman.empirical_sinkhorn_divergence`. Similarly one can compute the
OT matrix and loss for empirical distributions with respectively
:any:`ot.bregman.empirical_sinkhorn` and :any:`ot.bregman.empirical_sinkhorn2`.
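+
+A minimal sketch (synthetic data; the value of :code:`reg` is arbitrary):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    xs = np.random.randn(100, 3)       # source samples
+    xt = np.random.randn(100, 3) + 1.  # target samples
+
+    # Sinkhorn divergence directly from samples, no cost matrix needed
+    d = ot.bregman.empirical_sinkhorn_divergence(xs, xt, reg=1.)
+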
@@ -229,7 +413,7 @@ OT matrix and loss for empirical distributions with respectively
Finally note that we also provide in :any:`ot.stochastic` several implementation
of stochastic solvers for entropic regularized OT [18]_ [19]_. Those pure Python
-implementations are not optimized for speed but provide a roust implementation
+implementations are not optimized for speed but provide a robust implementation
of algorithms in [18]_ [19]_.
.. hint::
@@ -244,11 +428,11 @@ of algorithms in [18]_ [19]_.
Other regularization
^^^^^^^^^^^^^^^^^^^^
-While entropic OT is the most common and favored in practice, there exist other
-kind of regularization. We provide in POT two specific solvers for other
-regularization terms, namely quadratic regularization and group lasso
-regularization. But we also provide in :any:`ot.optim` two generic solvers that allows solving any
-smooth regularization in practice.
+While entropic OT is the most common and favored in practice, there exist other
+kinds of regularization. We provide in POT two specific solvers for other
+regularization terms, namely quadratic regularization and group Lasso
+regularization. But we also provide in :any:`ot.optim` two generic solvers
+that allow solving any smooth regularization in practice.
Quadratic regularization
""""""""""""""""""""""""
@@ -259,8 +443,8 @@ regularization of the form
.. math::
\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}^2
-this regularization term has a similar effect to entropic regularization in
-densifying the OT matrix but it keeps some sort of sparsity that is lost with
+This regularization term has an effect similar to entropic regularization by
+densifying the OT matrix, yet it keeps some sort of sparsity that is lost with
entropic regularization as soon as :math:`\lambda>0` [17]_. This problem can be
solved with POT using solvers from :any:`ot.smooth`, more specifically
functions :any:`ot.smooth.smooth_ot_dual` or
@@ -278,30 +462,29 @@ choose the quadratic regularization.
Group Lasso regularization
""""""""""""""""""""""""""
-Another regularization that has been used in recent years [5]_ is the group lasso
+Another regularization that has been used in recent years [5]_ is the group Lasso
regularization
.. math::
\Omega(\gamma)=\sum_{j,G\in\mathcal{G}} \|\gamma_{G,j}\|_q^p
-where :math:`\mathcal{G}` contains non overlapping groups of lines in the OT
-matrix. This regularization proposed in [5]_ will promote sparsity at the group level and for
+where :math:`\mathcal{G}` contains non-overlapping groups of lines in the OT
+matrix. This regularization proposed in [5]_ promotes sparsity at the group level and for
instance will force target samples to get mass from a small number of groups.
Note that the exact OT solution is already sparse so this regularization does
-not make sens if it is not combined with entropic regularization. Depending on
+not make sense if it is not combined with entropic regularization. Depending on
the choice of :code:`p` and :code:`q`, the problem can be solved with different
-approaches. When :code:`q=1` and :code:`p<1` the problem is non convex but can
+approaches. When :code:`q=1` and :code:`p<1` the problem is non-convex but can
be solved using an efficient majorization-minimization approach with
:any:`ot.sinkhorn_lpl1_mm`. When :code:`q=2` and :code:`p=1` we recover the
convex group Lasso and we provide a solver using a generalized conditional
-gradient algorithm [7]_ in function
-:any:`ot.da.sinkhorn_l1l2_gl`.
+gradient algorithm [7]_ in function :any:`ot.da.sinkhorn_l1l2_gl`.
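+
+A hedged sketch (:code:`labels_a` is assumed to be an integer array of source
+class labels; the values of :code:`reg` and :code:`eta` are arbitrary):
+
+.. code:: python
+
+    # entropic OT + group Lasso: target samples get mass from few classes
+    T = ot.da.sinkhorn_l1l2_gl(a, labels_a, b, M, reg=1e-1, eta=1e-1)
+    # non-convex variant solved by majorization-minimization
+    T = ot.da.sinkhorn_lpl1_mm(a, labels_a, b, M, reg=1e-1, eta=1e-1)
+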
.. hint::
- Examples of group Lasso regularization are available in :
+ Examples of group Lasso regularization are available in:
- - :any:`auto_examples/plot_otda_classes`
- - :any:`auto_examples/plot_otda_d2`
+ - :any:`auto_examples/domain-adaptation/plot_otda_classes`
+ - :any:`auto_examples/domain-adaptation/plot_otda_d2`
Generic solvers
@@ -322,11 +505,10 @@ you can use function :any:`ot.optim.cg` that will use a conditional gradient as
proposed in [6]_ . You need to provide the regularization function as parameter
``f`` and its gradient as parameter ``df``. Note that the conditional gradient relies on
iterative solving of a linearization of the problem using the exact
-:any:`ot.emd` so it can be slow in practice. But, being an interior point
-algorithm, it always returns a
-transport matrix that does not violates the marginals.
+:any:`ot.emd` so it can be quite slow in practice. However, being an interior point
+algorithm, it always returns a transport matrix that does not violate the marginals.
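+
+A hedged sketch with a quadratic regularizer (:code:`reg` is arbitrary; the
+solver only needs the regularizer :code:`f` and its gradient :code:`df`):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    def f(G):
+        # squared Frobenius norm of the OT matrix
+        return 0.5 * np.sum(G ** 2)
+
+    def df(G):
+        # gradient of f
+        return G
+
+    T = ot.optim.cg(a, b, M, reg=1e-1, f=f, df=df)
+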
-Another generic solver is proposed to solve the problem
+Another generic solver is proposed to solve the problem:
.. math::
\gamma^* = arg\min_\gamma \quad \sum_{i,j}\gamma_{i,j}M_{i,j}+ \lambda_e\Omega_e(\gamma) + \lambda\Omega(\gamma)
@@ -347,7 +529,7 @@ relies on :any:`ot.sinkhorn` for its iterations.
Wasserstein Barycenters
-----------------------
-A Wasserstein barycenter is a distribution that minimize its Wasserstein
+A Wasserstein barycenter is a distribution that minimizes its Wasserstein
distance with respect to other distributions [16]_. It corresponds to minimizing the
following problem by searching a distribution :math:`\mu` such that
@@ -378,18 +560,18 @@ be expressed as
where :math:`b_k` are also weights in the simplex. In the non-regularized case,
the problem above is a classical linear program. In this case we propose a
-solver :any:`ot.lp.barycenter` that rely on generic LP solvers. By default the
+solver :meth:`ot.lp.barycenter` that relies on generic LP solvers. By default the
function uses :any:`scipy.optimize.linprog`, but more efficient LP solvers from
cvxopt can also be used by changing the parameter :code:`solver`. Note that this problem
requires solving a very large linear program and can be very slow in
practice.
Similarly to the OT problem, OT barycenters can be computed in the regularized
-case. When using entropic regularization is used, the problem can be solved with a
-generalization of the sinkhorn algorithm based on bregman projections [3]_. This
+case. When entropic regularization is used, the problem can be solved with a
+generalization of the Sinkhorn algorithm based on Bregman projections [3]_. This
algorithm is provided in function :any:`ot.bregman.barycenter` also available as
:any:`ot.barycenter`. In this case, the algorithm scales better to large
-distributions and rely only on matrix multiplications that can be performed in
+distributions and relies only on matrix multiplications that can be performed in
parallel.
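+
+A short sketch of the entropic barycenter of two 1D histograms (grid size,
+Gaussian parameters and :code:`reg` are arbitrary):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    n = 50
+    a1 = ot.datasets.make_1D_gauss(n, m=15, s=5)  # first histogram
+    a2 = ot.datasets.make_1D_gauss(n, m=35, s=5)  # second histogram
+    A = np.vstack([a1, a2]).T                     # histograms as columns
+
+    M = ot.utils.dist0(n)                         # cost matrix on the 1D grid
+    M /= M.max()
+    bary = ot.barycenter(A, M, reg=1e-2)          # entropic barycenter
+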
In addition to the speedup brought by regularization, one can also greatly
@@ -400,18 +582,18 @@ operators. We provide an implementation of this algorithm in function
:any:`ot.bregman.convolutional_barycenter2d`.
.. hint::
- Examples of Wasserstein (:any:`ot.lp.barycenter`) and regularized Wasserstein
+ Examples of Wasserstein (:meth:`ot.lp.barycenter`) and regularized Wasserstein
barycenter (:any:`ot.bregman.barycenter`) computation are available in :
- - :any:`auto_examples/plot_barycenter_1D`
- - :any:`auto_examples/plot_barycenter_lp_vs_entropic`
+ - :any:`auto_examples/barycenters/plot_barycenter_1D`
+ - :any:`auto_examples/barycenters/plot_barycenter_lp_vs_entropic`
An example of convolutional barycenter
(:any:`ot.bregman.convolutional_barycenter2d`) computation
for 2D images is available
in :
- - :any:`auto_examples/plot_convolutional_barycenter`
+ - :any:`auto_examples/barycenters/plot_convolutional_barycenter`
@@ -419,7 +601,7 @@ Barycenters with free support
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Estimating the Wasserstein barycenter with free support but fixed weights
-corresponds to solving the following optimization problem:
+corresponds to solving the following optimization problem:
.. math::
\min_{\{x_i\}} \quad \sum_{k} w_kW(\mu,\mu_k)
@@ -436,7 +618,7 @@ return a locally optimal support :math:`\{x_i\}` for uniform or given weights
An example of the free support barycenter estimation is available
in :
- - :any:`auto_examples/plot_free_support_barycenter`
+ - :any:`auto_examples/barycenters/plot_free_support_barycenter`
@@ -444,7 +626,7 @@ return a locally optimal support :math:`\{x_i\}` for uniform or given weights
Monge mapping and Domain adaptation
-----------------------------------
-The original transport problem investigated by Gaspard Monge was seeking for a
+The original transport problem investigated by Gaspard Monge sought a
mapping function that maps (or transports) between a source and target
distribution while minimizing the transport loss. The existence and uniqueness of this
optimal mapping is still an open problem in the general case but has been proven
@@ -462,24 +644,24 @@ approximate a Monge mapping from finite distributions.
First note that when the source and target distributions are supposed to be Gaussian
distributions, there exists a closed-form solution for the mapping and it is an
affine function [14]_ of the form :math:`T(x)=Ax+b` . In this case we provide the function
-:any:`ot.da.OT_mapping_linear` that return the operator :math:`A` and vector
+:any:`ot.da.OT_mapping_linear` that returns the operator :math:`A` and vector
:math:`b`. Note that if the number of samples is too small there is a parameter
-:code:`reg` that provide a regularization for the covariance matrix estimation.
+:code:`reg` that provides a regularization for the covariance matrix estimation.
For a more general mapping estimation we also provide the barycentric mapping
-proposed in [6]_ . It is implemented in the class :any:`ot.da.EMDTransport` and
-other transport based classes in :any:`ot.da` . Those classes are discussed more
-in the following but follow an interface similar to sklearn classes. Finally a
+proposed in [6]_. It is implemented in the class :any:`ot.da.EMDTransport` and
+other transport-based classes in :any:`ot.da`. Those classes are discussed more
+in the following but follow an interface similar to scikit-learn classes. Finally, a
method proposed in [8]_ that estimates a continuous mapping approximating the
barycentric mapping is provided in :any:`ot.da.joint_OT_mapping_linear` for
-linear mapping and :any:`ot.da.joint_OT_mapping_kernel` for non linear mapping.
+linear mapping and :any:`ot.da.joint_OT_mapping_kernel` for non-linear mapping.
.. hint::
An example of the linear Monge mapping estimation is available
in :
- - :any:`auto_examples/plot_otda_linear_mapping`
+ - :any:`auto_examples/domain-adaptation/plot_otda_linear_mapping`
Domain adaptation classes
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -491,21 +673,19 @@ transport labeled source samples onto the target distribution with no labels.
We provide several classes based on :any:`ot.da.BaseTransport` that provide
several OT and mapping estimations. The interface of those classes is similar to
-classifiers in sklearn toolbox. At initialization, several parameters such as
- regularization parameter value can be set. Then one needs to estimate the
+classifiers in scikit-learn. At initialization, several parameters such as
+regularization parameter value can be set. Then one needs to estimate the
mapping with function :any:`ot.da.BaseTransport.fit`. Finally one can map the
samples from source to target with :any:`ot.da.BaseTransport.transform` and
from target to source with :any:`ot.da.BaseTransport.inverse_transform`.
-Here is
-an example for class :any:`ot.da.EMDTransport` :
+Here is an example for class :any:`ot.da.EMDTransport`:
.. code::
ot_emd = ot.da.EMDTransport()
ot_emd.fit(Xs=Xs, Xt=Xt)
-
- Mapped_Xs= ot_emd.transform(Xs=Xs)
+ Xs_mapped = ot_emd.transform(Xs=Xs)
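+
+The reverse mapping follows the same pattern:
+
+.. code::
+
+    Xt_mapped = ot_emd.inverse_transform(Xt=Xt)
+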
A list of the provided implementations is given in the following note.
@@ -514,24 +694,24 @@ A list of the provided implementation is given in the following note.
Here is a list of the OT mapping classes inheriting from
:any:`ot.da.BaseTransport`
- * :any:`ot.da.EMDTransport` : Barycentric mapping with EMD transport
- * :any:`ot.da.SinkhornTransport` : Barycentric mapping with Sinkhorn transport
- * :any:`ot.da.SinkhornL1l2Transport` : Barycentric mapping with Sinkhorn +
+ * :any:`ot.da.EMDTransport`: Barycentric mapping with EMD transport
+ * :any:`ot.da.SinkhornTransport`: Barycentric mapping with Sinkhorn transport
+ * :any:`ot.da.SinkhornL1l2Transport`: Barycentric mapping with Sinkhorn +
group Lasso regularization [5]_
- * :any:`ot.da.SinkhornLpl1Transport` : Barycentric mapping with Sinkhorn +
+ * :any:`ot.da.SinkhornLpl1Transport`: Barycentric mapping with Sinkhorn +
non convex group Lasso regularization [5]_
- * :any:`ot.da.LinearTransport` : Linear mapping estimation between Gaussians
+ * :any:`ot.da.LinearTransport`: Linear mapping estimation between Gaussians
[14]_
- * :any:`ot.da.MappingTransport` : Nonlinear mapping estimation [8]_
+ * :any:`ot.da.MappingTransport`: Nonlinear mapping estimation [8]_
.. hint::
- Example of the use of OTDA classes are available in :
+ Examples of the use of OTDA classes are available in:
- - :any:`auto_examples/plot_otda_color_images`
- - :any:`auto_examples/plot_otda_mapping`
- - :any:`auto_examples/plot_otda_mapping_colors_images`
- - :any:`auto_examples/plot_otda_semi_supervised`
+ - :any:`auto_examples/domain-adaptation/plot_otda_color_images`
+ - :any:`auto_examples/domain-adaptation/plot_otda_mapping`
+ - :any:`auto_examples/domain-adaptation/plot_otda_mapping_colors_images`
+ - :any:`auto_examples/domain-adaptation/plot_otda_semi_supervised`
Other applications
------------------
@@ -545,14 +725,14 @@ Wasserstein Discriminant Analysis
Wasserstein Discriminant Analysis [11]_ is a generalization of `Fisher Linear Discriminant
Analysis <https://en.wikipedia.org/wiki/Linear_discriminant_analysis>`__ that
allows discrimination between classes that are not linearly separable. It
-consist in finding a linear projector optimizing the following criterion
+consists in finding a linear projector optimizing the following criterion
.. math::
P = \text{arg}\min_P \frac{\sum_i OT_e(\mu_i\#P,\mu_i\#P)}{\sum_{i,j\neq i}
OT_e(\mu_i\#P,\mu_j\#P)}
where :math:`\#` is the push-forward operator, :math:`OT_e` is the entropic OT
-loss and :math:`\mu_i` is the
+loss and :math:`\mu_i` is the
distribution of samples from class :math:`i`. :math:`P` is also constrained to
be in the Stiefel manifold. WDA can be solved in POT using function
:any:`ot.dr.wda`. It requires having installed :code:`pymanopt` and
@@ -561,6 +741,7 @@ respectively. Note that we also provide the Fisher discriminant estimator in
:any:`ot.dr.fda` for easy comparison.
.. warning::
+
Note that due to the hard dependency on :code:`pymanopt` and
:code:`autograd`, :any:`ot.dr` is not imported by default. If you want to
use it you have to specifically import it with :code:`import ot.dr` .
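+
+A hedged sketch of a WDA call (:code:`X` and :code:`y` are assumed given; the
+values of :code:`p`, :code:`reg` and :code:`k` are arbitrary):
+
+.. code:: python
+
+    import ot.dr  # not imported by default, see the warning above
+
+    # X: (n, d) samples, y: (n,) integer class labels
+    P, proj = ot.dr.wda(X, y, p=2, reg=1., k=10)  # projector and projection map
+    X_proj = proj(X)                              # project the samples
+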
@@ -569,7 +750,7 @@ respectively. Note that we also provide the Fisher discriminant estimator in
An example of the use of WDA is available in :
- - :any:`auto_examples/plot_WDA`
+ - :any:`auto_examples/others/plot_WDA`
Unbalanced optimal transport
@@ -610,7 +791,7 @@ linear term.
Examples of the use of :any:`ot.sinkhorn_unbalanced` are available in :
- - :any:`auto_examples/plot_UOT_1D`
+ - :any:`auto_examples/unbalanced-partial/plot_UOT_1D`
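+
+A minimal hedged sketch (:code:`reg` weights the entropic term and
+:code:`reg_m` the marginal relaxation term; the values are arbitrary):
+
+.. code:: python
+
+    T = ot.sinkhorn_unbalanced(a, b, M, reg=1e-1, reg_m=1.)   # UOT matrix
+    w = ot.sinkhorn_unbalanced2(a, b, M, reg=1e-1, reg_m=1.)  # UOT value
+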
Unbalanced Barycenters
@@ -622,17 +803,17 @@ histograms with different masses as a Fréchet Mean:
.. math::
\min_{\mu} \quad \sum_{k} w_kW_u(\mu,\mu_k)
-Where :math:`W_u` is the unbalanced Wasserstein metric defined above. This problem
+where :math:`W_u` is the unbalanced Wasserstein metric defined above. This problem
can also be solved using a generalized version of Sinkhorn's algorithm and it is
implemented in the main function :any:`ot.barycenter_unbalanced`.
.. note::
The main function to compute UOT barycenters is :any:`ot.barycenter_unbalanced`.
- This function is a wrapper and the parameter :code:`method` help you select
+ This function is a wrapper and the parameter :code:`method` helps you select
the actual algorithm used to solve the problem:
- + :code:`method='sinkhorn'` calls :any:`ot.unbalanced.barycenter_unbalanced_sinkhorn_unbalanced`
+ + :code:`method='sinkhorn'` calls :meth:`ot.unbalanced.barycenter_unbalanced_sinkhorn_unbalanced`
the generalized Sinkhorn algorithm [10]_.
+ :code:`method='sinkhorn_stabilized'` calls :any:`ot.unbalanced.barycenter_unbalanced_stabilized`
the log stabilized version of the algorithm [10]_.
@@ -642,7 +823,7 @@ implemented the main function :any:`ot.barycenter_unbalanced`.
Examples of the use of :any:`ot.barycenter_unbalanced` are available in :
- - :any:`auto_examples/plot_UOT_barycenter_1D`
+ - :any:`auto_examples/unbalanced-partial/plot_UOT_barycenter_1D`
Partial optimal transport
@@ -686,9 +867,9 @@ regularization of the problem.
.. hint::
- Examples of the use of :any:`ot.partial` are available in :
+ Examples of the use of :any:`ot.partial` are available in:
- - :any:`auto_examples/plot_partial`
+ - :any:`auto_examples/unbalanced-partial/plot_partial_wass_and_gromov`
@@ -699,7 +880,7 @@ Gromov Wasserstein (GW) is a generalization of OT to distributions that do not lie in
the same space [13]_. In this case one cannot compute distance between samples
from the two distributions. [13]_ proposed instead to realign the metric spaces
by computing a transport between distance matrices. The Gromov-Wasserstein
-alignement between two distributions can be expressed as the one minimizing:
+alignment between two distributions can be expressed as the one minimizing:
.. math::
GW = \min_\gamma \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\gamma_{i,j}\gamma_{k,l}
@@ -731,8 +912,8 @@ positive matrix. We provide a block coordinate optimization procedure in
barycenters respectively.
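+
+A hedged sketch of a GW computation from two sets of samples living in
+different spaces (shapes and data are illustrative):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    xs = np.random.randn(30, 2)  # samples in a 2D space
+    xt = np.random.randn(40, 5)  # samples in a 5D space
+
+    C1 = ot.dist(xs, xs)         # intra-domain distance matrices
+    C2 = ot.dist(xt, xt)
+    p = np.ones(30) / 30
+    q = np.ones(40) / 40
+
+    T = ot.gromov.gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss')
+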
Finally note that recently a fusion between Wasserstein and GW, coined Fused
-Gromov-Wasserstein (FGW) has been proposed
-in [24]_. It allows to compute a similarity between objects that are only partly in
+Gromov-Wasserstein (FGW) has been proposed [24]_.
+It allows one to compute a similarity between objects that are only partly in
the same space. As such it can be used to measure similarity between labeled
graphs for instance and also provide computable barycenters.
The implementations of FGW and FGW barycenters are provided in functions
@@ -740,20 +921,27 @@ The implementations of FGW and FGW barycenters are provided in functions
.. hint::
- Examples of computation of GW, regularized G and FGW are available in :
+ Examples of computation of GW, regularized GW and FGW are available in:
- - :any:`auto_examples/plot_gromov`
- - :any:`auto_examples/plot_fgw`
+ - :any:`auto_examples/gromov/plot_gromov`
+ - :any:`auto_examples/gromov/plot_fgw`
- Examples of GW, regularized GW and FGW barycenters are available in :
+ Examples of GW, regularized GW and FGW barycenters are available in:
- - :any:`auto_examples/plot_gromov_barycenter`
- - :any:`auto_examples/plot_barycenter_fgw`
+ - :any:`auto_examples/gromov/plot_gromov_barycenter`
+ - :any:`auto_examples/gromov/plot_barycenter_fgw`
GPU acceleration
^^^^^^^^^^^^^^^^
+.. warning::
+
+ The :any:`ot.gpu` module has been deprecated since release 0.8 of POT and
+ should not be used. GPU implementations (in Pytorch for instance) can be
+ used with the new backends through the compatible functions of POT.
+
+
We provide several implementations of our OT solvers in :any:`ot.gpu`. Those
implementations use the :code:`cupy` toolbox that obviously needs to be installed.
@@ -764,28 +952,80 @@ implementations use the :code:`cupy` toolbox that obviously needs to be installed
algebra) have been implemented in :any:`ot.gpu`. Here is a short list on the
main entries:
- - :any:`ot.gpu.dist` : computation of distance matrix
- - :any:`ot.gpu.sinkhorn` : computation of sinkhorn
- - :any:`ot.gpu.sinkhorn_lpl1_mm` : computation of sinkhorn + group lasso
+ - :meth:`ot.gpu.dist`: computation of distance matrix
+ - :meth:`ot.gpu.sinkhorn`: computation of sinkhorn
+ - :meth:`ot.gpu.sinkhorn_lpl1_mm`: computation of sinkhorn + group lasso
Note that while the :any:`ot.gpu` module has been designed to be compatible with
-POT, calling its function with :any:`numpy` arrays will incur a large overhead due to
+POT, calling its functions with :any:`numpy` arrays will incur a large overhead due to
the memory copy of the array on GPU prior to computation and conversion of the
array after computation. To avoid this overhead, we provide functions
-:any:`ot.gpu.to_gpu` and :any:`ot.gpu.to_np` that perform the conversion
+:meth:`ot.gpu.to_gpu` and :meth:`ot.gpu.to_np` that perform the conversion
explicitly.
-
.. warning::
- Note that due to the hard dependency on :code:`cupy`, :any:`ot.gpu` is not
+
+ Note that due to the hard dependency on :code:`cupy`, :any:`ot.gpu` is not
imported by default. If you want to
use it you have to specifically import it with :code:`import ot.gpu` .
-FAQ
----
+Solving OT with Multiple backends
+---------------------------------
+
+.. _backends_section:
+
+Since version 0.8, POT provides a backend that allows writing solvers
+independently from the type of the input arrays. The idea is to provide the user
+with a package that works seamlessly and returns, for instance, a solution as
+Pytorch tensors when the function has Pytorch tensors as input.
+How it works
+^^^^^^^^^^^^
+
+The aim of the backend is to use the same function independently of the type of
+the input arrays.
+
+For instance, when executing the following code:
+
+.. code:: python
+
+ # a and b are 1D histograms (sum to 1 and positive)
+ # M is the ground cost matrix
+ T = ot.emd(a, b, M) # exact linear program
+ w = ot.emd2(a, b, M) # Wasserstein computation
+
+the functions :any:`ot.emd` and :any:`ot.emd2` can take inputs of type
+:any:`numpy.array`, :any:`torch.tensor` or :any:`jax.numpy.array`. The output of
+the function will have the same type as the inputs and be located on the same
+device. When possible, all computations are done on that device and the
+output is differentiable with respect to the inputs of the function.
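+
+As a hedged sketch, the Pytorch case looks as follows (gradients flow back to
+the cost matrix):
+
+.. code:: python
+
+    import torch
+    import ot
+
+    a = torch.tensor([.5, .5])
+    b = torch.tensor([.5, .5])
+    M = torch.tensor([[0., 1.], [1., 0.]], requires_grad=True)
+
+    w = ot.emd2(a, b, M)  # returned as a Pytorch scalar
+    w.backward()          # dw/dM is now in M.grad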
+
+
+
+List of compatible Backends
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- `Numpy <https://numpy.org/>`_ (all functions and solvers)
+- `Pytorch <https://pytorch.org/>`_ (all outputs differentiable w.r.t. inputs)
+- `Jax <https://github.com/google/jax>`_ (some functions are differentiable, some require a wrapper)
+
+List of compatible functions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This list will get longer with new releases and will hopefully disappear when POT
+becomes fully implemented with the backend.
+
+- :any:`ot.emd`
+- :any:`ot.emd2`
+- :any:`ot.sinkhorn`
+- :any:`ot.sinkhorn2`
+- :any:`ot.dist`
+
+
+FAQ
+---
1. **How to solve a discrete optimal transport problem?**
@@ -798,10 +1038,10 @@ FAQ
.. code:: python
- # a,b are 1D histograms (sum to 1 and positive)
+ # a and b are 1D histograms (sum to 1 and positive)
# M is the ground cost matrix
- T=ot.emd(a,b,M) # exact linear program
- T_reg=ot.sinkhorn(a,b,M,reg) # entropic regularized OT
+ T = ot.emd(a, b, M) # exact linear program
+ T_reg = ot.sinkhorn(a, b, M, reg) # entropic regularized OT
More detailed examples can be seen on this example:
:doc:`auto_examples/plot_OT_2D_samples`
@@ -823,15 +1063,15 @@ FAQ
3. **Why is Sinkhorn slower than EMD ?**
This might come from the choice of the regularization term. The speed of
- convergence of sinkhorn depends directly on this term [22]_ and when the
- regularization gets very small the problem try and approximate the exact OT
+ convergence of Sinkhorn depends directly on this term [22]_. When the
+ regularization gets very small the problem tries to approximate the exact OT
which leads to slow convergence in addition to numerical problems. In other
- words, for large regularization sinkhorn will be very fast to converge, for
+ words, for large regularization Sinkhorn will be very fast to converge, for
small regularization (when you need an OT matrix close to the true OT), it
might be quicker to use the EMD solver.
- Also note that the numpy implementation of the sinkhorn can use parallel
- computation depending on the configuration of your system but very important
+ Also note that the numpy implementation of Sinkhorn can use parallel
+ computation depending on the configuration of your system, yet very important
speedup can be obtained by using a GPU implementation since all operations
are matrix/vector products.
@@ -863,11 +1103,6 @@ References
problems <https://arxiv.org/pdf/1412.5154.pdf>`__. SIAM Journal on
Scientific Computing, 37(2), A1111-A1138.
-.. [4] S. Nakhostin, N. Courty, R. Flamary, D. Tuia, T. Corpetti,
- `Supervised planetary unmixing with optimal
- transport <https://hal.archives-ouvertes.fr/hal-01377236/document>`__,
- Whorkshop on Hyperspectral Image and Signal Processing : Evolution in
- Remote Sensing (WHISPERS), 2016.
.. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, `Optimal Transport
for Domain Adaptation <https://arxiv.org/pdf/1507.00504.pdf>`__, in IEEE
@@ -955,7 +1190,7 @@ References
iteration <https://papers.nips.cc/paper/6792-near-linear-time-approximation-algorithms-for-optimal-transport-via-sinkhorn-iteration.pdf>`__,
Advances in Neural Information Processing Systems (NIPS) 31
-.. [23] Aude, G., Peyré, G., Cuturi, M., `Learning Generative Models with
+.. [23] Genevay, A., Peyré, G., Cuturi, M., `Learning Generative Models with
Sinkhorn Divergences <https://arxiv.org/abs/1706.00292>`__, Proceedings
of the Twenty-First International Conference on Artificial Intelligence
and Statistics, (AISTATS) 21, 2018
@@ -972,11 +1207,6 @@ References
.. [26] Alaya M. Z., Bérar M., Gasso G., Rakotomamonjy A. (2019). Screening Sinkhorn
Algorithm for Regularized Optimal Transport <https://papers.nips.cc/paper/9386-screening-sinkhorn-algorithm-for-regularized-optimal-transport>,
Advances in Neural Information Processing Systems 33 (NeurIPS).
-
-.. [27] Redko I., Courty N., Flamary R., Tuia D. (2019). Optimal Transport for Multi-source
- Domain Adaptation under Target Shift <http://proceedings.mlr.press/v89/redko19a.html>,
- Proceedings of the Twenty-Second International Conference on Artificial Intelligence
- and Statistics (AISTATS) 22, 2019.
.. [28] Caffarelli, L. A., McCann, R. J. (2020). Free boundaries in optimal transport and
Monge-Ampere obstacle problems <http://www.math.toronto.edu/~mccann/papers/annals2010.pdf>,
@@ -985,3 +1215,7 @@ References
.. [29] Chapel, L., Alaya, M., Gasso, G. (2019). Partial Gromov-Wasserstein with
Applications on Positive-Unlabeled Learning <https://arxiv.org/abs/2002.08276>,
arXiv preprint arXiv:2002.08276.
+
+.. [30] Flamary, Rémi, et al. "Optimal transport with Laplacian regularization:
+ Applications to domain adaptation and shape matching." NIPS Workshop on Optimal
+ Transport and Machine Learning OTML. 2014.
diff --git a/docs/source/readme.rst b/docs/source/readme.rst
index b8cb48c..a8f1bc0 100644
--- a/docs/source/readme.rst
+++ b/docs/source/readme.rst
@@ -24,10 +24,9 @@ POT provides the following generic OT solvers (links to examples):
for regularized OT [7].
- Entropic regularization OT solver with `Sinkhorn Knopp
Algorithm <auto_examples/plot_OT_1D.html>`__
- [2] , stabilized version [9] [10], greedy Sinkhorn [22] and
+ [2], stabilized version [9] [10] [34], greedy Sinkhorn [22] and
`Screening Sinkhorn
- [26] <auto_examples/plot_screenkhorn_1D.html>`__
- with optional GPU implementation (requires cupy).
+ [26] <auto_examples/plot_screenkhorn_1D.html>`__.
- Bregman projections for `Wasserstein
barycenter <auto_examples/barycenters/plot_barycenter_lp_vs_entropic.html>`__
[3], `convolutional
@@ -35,6 +34,9 @@ POT provides the following generic OT solvers (links to examples):
[21] and unmixing [4].
- Sinkhorn divergence [23] and entropic regularization OT from
empirical data.
+- Debiased Sinkhorn barycenters `Sinkhorn divergence
+ barycenter <auto_examples/barycenters/plot_debiased_barycenter.html>`__
+ [37]
- `Smooth optimal transport
solvers <auto_examples/plot_OT_1D_smooth.html>`__
(dual and semi-dual) for KL and squared L2 regularizations [17].
@@ -45,7 +47,8 @@ POT provides the following generic OT solvers (links to examples):
distances <auto_examples/gromov/plot_gromov.html>`__
and `GW
barycenters <auto_examples/gromov/plot_gromov_barycenter.html>`__
- (exact [13] and regularized [12])
+ (exact [13] and regularized [12]), differentiable using gradients
+ from
- `Fused-Gromov-Wasserstein distances
solver <auto_examples/gromov/plot_fgw.html#sphx-glr-auto-examples-plot-fgw-py>`__
and `FGW
@@ -55,6 +58,9 @@ POT provides the following generic OT solvers (links to examples):
solver <auto_examples/plot_stochastic.html>`__
for Large-scale Optimal Transport (semi-dual problem [18] and dual
problem [19])
+- `Stochastic solver of Gromov
+ Wasserstein <auto_examples/gromov/plot_gromov.html>`__
+ for large-scale problems with any loss function [33]
- Non regularized `free support Wasserstein
barycenters <auto_examples/barycenters/plot_free_support_barycenter.html>`__
[20].
@@ -66,6 +72,15 @@ POT provides the following generic OT solvers (links to examples):
- `Partial Wasserstein and
Gromov-Wasserstein <auto_examples/unbalanced-partial/plot_partial_wass_and_gromov.html>`__
(exact [29] and entropic [3] formulations).
+- `Sliced
+ Wasserstein <auto_examples/sliced-wasserstein/plot_variance.html>`__
+ [31, 32] and Max-sliced Wasserstein [35] that can be used for
+ gradient flows [36].
+- `Several
+ backends <https://pythonot.github.io/quickstart.html#solving-ot-with-multiple-backends>`__
+ for easy use of POT with
+ `Pytorch <https://pytorch.org/>`__/`jax <https://github.com/google/jax>`__/`Numpy <https://numpy.org/>`__
+ arrays.
POT provides the following Machine Learning related solvers:
@@ -96,22 +111,29 @@ Using and citing the toolbox
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If you use this toolbox in your research and find it useful, please cite
-POT using the following reference:
+POT using the following reference from our `JMLR
+paper <https://jmlr.org/papers/v22/20-451.html>`__:
::
- Rémi Flamary and Nicolas Courty, POT Python Optimal Transport library,
- Website: https://pythonot.github.io/, 2017
+ Rémi Flamary, Nicolas Courty, Alexandre Gramfort, Mokhtar Z. Alaya, Aurélie Boisbunon, Stanislas Chambon, Laetitia Chapel, Adrien Corenflos, Kilian Fatras, Nemo Fournier, Léo Gautheron, Nathalie T.H. Gayraud, Hicham Janati, Alain Rakotomamonjy, Ievgen Redko, Antoine Rolet, Antony Schutz, Vivien Seguy, Danica J. Sutherland, Romain Tavenard, Alexander Tong, Titouan Vayer,
+ POT Python Optimal Transport library,
+ Journal of Machine Learning Research, 22(78):1−8, 2021.
+ Website: https://pythonot.github.io/
In Bibtex format:
-::
-
- @misc{flamary2017pot,
- title={POT Python Optimal Transport library},
- author={Flamary, R{'e}mi and Courty, Nicolas},
- url={https://pythonot.github.io/},
- year={2017}
+.. code:: bibtex
+
+ @article{flamary2021pot,
+ author = {R{\'e}mi Flamary and Nicolas Courty and Alexandre Gramfort and Mokhtar Z. Alaya and Aur{\'e}lie Boisbunon and Stanislas Chambon and Laetitia Chapel and Adrien Corenflos and Kilian Fatras and Nemo Fournier and L{\'e}o Gautheron and Nathalie T.H. Gayraud and Hicham Janati and Alain Rakotomamonjy and Ievgen Redko and Antoine Rolet and Antony Schutz and Vivien Seguy and Danica J. Sutherland and Romain Tavenard and Alexander Tong and Titouan Vayer},
+ title = {POT: Python Optimal Transport},
+ journal = {Journal of Machine Learning Research},
+ year = {2021},
+ volume = {22},
+ number = {78},
+ pages = {1-8},
+ url = {http://jmlr.org/papers/v22/20-451.html}
}
Installation
@@ -123,28 +145,21 @@ following Python modules:
- Numpy (>=1.16)
- Scipy (>=1.0)
-- Cython (>=0.23)
-- Matplotlib (>=1.5)
+- Cython (>=0.23) (build only, not necessary when installing from pip
+ or conda)
Pip installation
^^^^^^^^^^^^^^^^
-Note that due to a limitation of pip, ``cython`` and ``numpy`` need to
-be installed prior to installing POT. This can be done easily with
-
-::
-
- pip install numpy cython
-
You can install the toolbox through PyPI with:
-::
+.. code:: console
pip install POT
or get the very latest version by running:
-::
+.. code:: console
pip install -U https://github.com/PythonOT/POT/archive/master.zip # with --user for user install (no root)
@@ -155,7 +170,7 @@ If you use the Anaconda python distribution, POT is available in
`conda-forge <https://conda-forge.org>`__. To install it and the
required dependencies:
-::
+.. code:: console
conda install -c conda-forge pot
@@ -169,7 +184,8 @@ without errors:
import ot
-Note that for easier access the module is name ot instead of pot.
+Note that for easier access the module is named ``ot`` instead of
+``pot``.
Dependencies
~~~~~~~~~~~~
@@ -180,15 +196,17 @@ below
- **ot.dr** (Wasserstein dimensionality reduction) depends on autograd
and pymanopt that can be installed with:
- ::
+.. code:: shell
- pip install pymanopt autograd
+ pip install pymanopt autograd
- **ot.gpu** (GPU accelerated OT) depends on cupy that has to be
installed following instructions on `this
page <https://docs-cupy.chainer.org/en/stable/install.html>`__.
-
-obviously you need CUDA installed and a compatible GPU.
+ Obviously you will need CUDA installed and a compatible GPU. Note
+ that this module is deprecated since version 0.8 and will be deleted
+ in the future. GPU is now handled automatically through the backends
+ and several solvers can already run on GPU using the Pytorch backend.
Examples
--------
@@ -198,36 +216,36 @@ Short examples
- Import the toolbox
- .. code:: python
+.. code:: python
- import ot
+ import ot
- Compute Wasserstein distances
- .. code:: python
+.. code:: python
- # a,b are 1D histograms (sum to 1 and positive)
- # M is the ground cost matrix
- Wd=ot.emd2(a,b,M) # exact linear program
- Wd_reg=ot.sinkhorn2(a,b,M,reg) # entropic regularized OT
- # if b is a matrix compute all distances to a and return a vector
+ # a and b are 1D histograms (sum to 1 and positive)
+ # M is the ground cost matrix
+ Wd = ot.emd2(a, b, M) # exact linear program
+ Wd_reg = ot.sinkhorn2(a, b, M, reg) # entropic regularized OT
+ # if b is a matrix compute all distances to a and return a vector
- Compute OT matrix
- .. code:: python
+.. code:: python
- # a,b are 1D histograms (sum to 1 and positive)
- # M is the ground cost matrix
- T=ot.emd(a,b,M) # exact linear program
- T_reg=ot.sinkhorn(a,b,M,reg) # entropic regularized OT
+ # a and b are 1D histograms (sum to 1 and positive)
+ # M is the ground cost matrix
+ T = ot.emd(a, b, M) # exact linear program
+ T_reg = ot.sinkhorn(a, b, M, reg) # entropic regularized OT
- Compute Wasserstein barycenter
- .. code:: python
+.. code:: python
- # A is a n*d matrix containing d 1D histograms
- # M is the ground cost matrix
- ba=ot.barycenter(A,M,reg) # reg is regularization parameter
+ # A is a n*d matrix containing d 1D histograms
+ # M is the ground cost matrix
+ ba = ot.barycenter(A, M, reg) # reg is regularization parameter
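+
+Putting the snippets above together, here is a complete, runnable
+sketch (the random sample data and the helpers ``ot.unif`` and
+``ot.dist`` are only used to build a toy problem):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    n = 50
+    x = np.random.randn(n, 2)  # source samples
+    y = np.random.randn(n, 2)  # target samples
+
+    a, b = ot.unif(n), ot.unif(n)  # uniform histograms (sum to 1)
+    M = ot.dist(x, y)              # squared Euclidean ground cost
+
+    Wd = ot.emd2(a, b, M)               # exact Wasserstein loss
+    T_reg = ot.sinkhorn(a, b, M, 1e-1)  # entropic regularized OT plan
+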
Examples and Notebooks
~~~~~~~~~~~~~~~~~~~~~~
@@ -265,10 +283,17 @@ The contributors to this library are
Rakotomamonjy <https://sites.google.com/site/alainrakotomamonjy/home>`__
- `Vayer Titouan <https://tvayer.github.io/>`__ (Gromov-Wasserstein,
  Fused-Gromov-Wasserstein)
-- `Hicham Janati <https://hichamjanati.github.io/>`__ (Unbalanced OT)
+- `Hicham Janati <https://hichamjanati.github.io/>`__ (Unbalanced OT,
+ Debiased barycenters)
- `Romain Tavenard <https://rtavenar.github.io/>`__ (1d Wasserstein)
- `Mokhtar Z. Alaya <http://mzalaya.github.io/>`__ (Screenkhorn)
- `Ievgen Redko <https://ievred.github.io/>`__ (Laplacian DA, JCPOT)
+- `Adrien Corenflos <https://adriencorenflos.github.io/>`__ (Sliced
+ Wasserstein Distance)
+- `Tanguy Kerdoncuff <https://hv0nnus.github.io/>`__ (Sampled Gromov
+ Wasserstein)
+- `Minhui Huang <https://mhhuang95.github.io>`__ (Projection Robust
+ Wasserstein Distance)
This toolbox benefits a lot from open source research, and we would
like to thank the following persons for providing some code (in various
@@ -276,6 +301,8 @@ languages):
- `Gabriel Peyré <http://gpeyre.github.io/>`__ (Wasserstein Barycenters
in Matlab)
+- `Mathieu Blondel <https://mblondel.org/>`__ (original implementation
+  of smooth OT)
- `Nicolas Bonneel <http://liris.cnrs.fr/~nbonneel/>`__ (C++ code for
EMD)
- `Marco Cuturi <http://marcocuturi.net/>`__ (Sinkhorn Knopp in
@@ -285,20 +312,21 @@ Contributions and code of conduct
---------------------------------
Every contribution is welcome and should respect the `contribution
-guidelines <CONTRIBUTING.md>`__. Each member of the project is expected
-to follow the `code of conduct <CODE_OF_CONDUCT.md>`__.
+guidelines <.github/CONTRIBUTING.md>`__. Each member of the project is
+expected to follow the `code of conduct <.github/CODE_OF_CONDUCT.md>`__.
Support
-------
You can ask questions and join the development discussion:
-- On the `POT Slack channel <https://pot-toolbox.slack.com>`__
+- On the POT `Slack channel <https://pot-toolbox.slack.com>`__
+- On the POT `Gitter channel <https://gitter.im/PythonOT/community>`__
- On the POT `mailing
list <https://mail.python.org/mm3/mailman3/lists/pot.python.org/>`__
You can also post bug reports and feature requests in Github issues.
-Make sure to read our `guidelines <CONTRIBUTING.md>`__ first.
+Make sure to read our `guidelines <.github/CONTRIBUTING.md>`__ first.
References
----------
@@ -439,10 +467,10 @@ optimal transport and Monge-Ampere obstacle
problems <http://www.math.toronto.edu/~mccann/papers/annals2010.pdf>`__,
Annals of mathematics, 673-730.
-[29] Chapel, L., Alaya, M., Gasso, G. (2019). `Partial
-Gromov-Wasserstein with Applications on Positive-Unlabeled
-Learning <https://arxiv.org/abs/2002.08276>`__, arXiv preprint
-arXiv:2002.08276.
+[29] Chapel, L., Alaya, M., Gasso, G. (2020). `Partial Optimal Transport
+with Applications on Positive-Unlabeled
+Learning <https://arxiv.org/abs/2002.08276>`__, Advances in Neural
+Information Processing Systems (NeurIPS), 2020.
[30] Flamary R., Courty N., Tuia D., Rakotomamonjy A. (2014). `Optimal
transport with Laplacian regularization: Applications to domain
@@ -450,11 +478,56 @@ adaptation and shape
matching <https://remi.flamary.com/biblio/flamary2014optlaplace.pdf>`__,
NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014.
+[31] Bonneel, Nicolas, et al. `Sliced and Radon Wasserstein barycenters
+of
+measures <https://perso.liris.cnrs.fr/nicolas.bonneel/WassersteinSliced-JMIV.pdf>`__,
+Journal of Mathematical Imaging and Vision 51.1 (2015): 22-45
+
+[32] Huang, M., Ma S., Lai, L. (2021). `A Riemannian Block Coordinate
+Descent Method for Computing the Projection Robust Wasserstein
+Distance <http://proceedings.mlr.press/v139/huang21e.html>`__,
+Proceedings of the 38th International Conference on Machine Learning
+(ICML).
+
+[33] Kerdoncuff T., Emonet R., Sebban M. `Sampled Gromov
+Wasserstein <https://hal.archives-ouvertes.fr/hal-03232509/document>`__,
+Machine Learning Journal (MLJ), 2021
+
+[34] Feydy, J., Séjourné, T., Vialard, F. X., Amari, S. I., Trouvé, A.,
+& Peyré, G. (2019, April). `Interpolating between optimal transport and
+MMD using Sinkhorn
+divergences <http://proceedings.mlr.press/v89/feydy19a/feydy19a.pdf>`__.
+In The 22nd International Conference on Artificial Intelligence and
+Statistics (pp. 2681-2690). PMLR.
+
+[35] Deshpande, I., Hu, Y. T., Sun, R., Pyrros, A., Siddiqui, N.,
+Koyejo, S., ... & Schwing, A. G. (2019). `Max-sliced wasserstein
+distance and its use for
+gans <https://openaccess.thecvf.com/content_CVPR_2019/papers/Deshpande_Max-Sliced_Wasserstein_Distance_and_Its_Use_for_GANs_CVPR_2019_paper.pdf>`__.
+In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern
+Recognition (pp. 10648-10656).
+
+[36] Liutkus, A., Simsekli, U., Majewski, S., Durmus, A., & Stöter, F.
+R. (2019, May). `Sliced-Wasserstein flows: Nonparametric generative
+modeling via optimal transport and
+diffusions <http://proceedings.mlr.press/v97/liutkus19a/liutkus19a.pdf>`__.
+In International Conference on Machine Learning (pp. 4104-4113). PMLR.
+
+[37] Janati, H., Cuturi, M., Gramfort, A. `Debiased sinkhorn
+barycenters <http://proceedings.mlr.press/v119/janati20a/janati20a.pdf>`__
+Proceedings of the 37th International Conference on Machine Learning,
+PMLR 119:4692-4701, 2020
+
+[38] C. Vincent-Cuaz, T. Vayer, R. Flamary, M. Corneli, N. Courty,
+`Online Graph Dictionary
+Learning <https://arxiv.org/pdf/2102.06555.pdf>`__, International
+Conference on Machine Learning (ICML), 2021.
+
.. |PyPI version| image:: https://badge.fury.io/py/POT.svg
:target: https://badge.fury.io/py/POT
.. |Anaconda Cloud| image:: https://anaconda.org/conda-forge/pot/badges/version.svg
:target: https://anaconda.org/conda-forge/pot
-.. |Build Status| image:: https://github.com/PythonOT/POT/workflows/build/badge.svg
+.. |Build Status| image:: https://github.com/PythonOT/POT/workflows/build/badge.svg?branch=master&event=push
:target: https://github.com/PythonOT/POT/actions
.. |Codecov Status| image:: https://codecov.io/gh/PythonOT/POT/branch/master/graph/badge.svg
:target: https://codecov.io/gh/PythonOT/POT
diff --git a/docs/source/releases.rst b/docs/source/releases.rst
index 5a357f3..aa06105 100644
--- a/docs/source/releases.rst
+++ b/docs/source/releases.rst
@@ -1,6 +1,132 @@
Releases
========
+0.8.0
+-----
+
+*November 2021*
+
+This new stable release introduces several important features.
+
+First, we now have an OpenMP-compatible exact OT solver in ``ot.emd``.
+The OpenMP version is used when the parameter ``numThreads`` is greater
+than one and can lead to nice speedups on multi-core machines.
+
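+As an illustration, here is a minimal sketch of the multithreaded call
+(the random sample data and the helpers ``ot.unif`` and ``ot.dist`` are
+only used to build a toy problem):
+
+.. code:: python
+
+    import numpy as np
+    import ot
+
+    n = 1000
+    x = np.random.randn(n, 2)      # source samples
+    y = np.random.randn(n, 2)      # target samples
+    a, b = ot.unif(n), ot.unif(n)  # uniform histograms
+    M = ot.dist(x, y)              # squared Euclidean ground cost
+
+    # exact OT plan, solved with 4 OpenMP threads
+    T = ot.emd(a, b, M, numThreads=4)
+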
+| Second, we have introduced a backend mechanism that allows standard
+ POT functions to be used seamlessly on Numpy, Pytorch and Jax arrays.
+ Other backends are coming, but right now POT can already be used for
+ training neural networks in Pytorch. Notably we propose the first
+ differentiable computation of the exact OT loss with ``ot.emd2`` (can
+ be differentiated w.r.t. both cost matrix and sample weights), but
+ also for the classical Sinkhorn loss with ``ot.sinkhorn2``, the
+ Wasserstein distance in 1D with ``ot.wasserstein_1d``, sliced
+ Wasserstein with ``ot.sliced_wasserstein_distance`` and
+ Gromov-Wasserstein with ``ot.gromov_wasserstein2``. Examples of how
+ this new feature can be used are now available in the documentation
+ where the Pytorch backend is used to estimate a `minimal Wasserstein
+ estimator <https://PythonOT.github.io/auto_examples/backends/plot_unmix_optim_torch.html>`__,
+ train a `Generative Network
+ (GAN) <https://PythonOT.github.io/auto_examples/backends/plot_wass2_gan_torch.html>`__,
+ compute a `sliced Wasserstein gradient
+ flow <https://PythonOT.github.io/auto_examples/backends/plot_sliced_wass_grad_flow_pytorch.html>`__
+ and `optimize the Gromov-Wasserstein
+ distance <https://PythonOT.github.io/auto_examples/backends/plot_optim_gromov_pytorch.html>`__.
+ Note that the Jax backend is still in early development and quite
+ slow at the moment; we strongly recommend that Jax users use the `OTT
+ toolbox <https://github.com/google-research/ott>`__ when possible.
+| As a result of this new feature, the old ``ot.gpu`` submodule is now
+ deprecated, since GPU implementations can be done using GPU arrays on
+ the torch backend.
+
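+As a short sketch of the new differentiable exact OT loss (assuming
+Pytorch is installed; the sample data is made up):
+
+.. code:: python
+
+    import torch
+    import ot
+
+    x = torch.randn(30, 2, requires_grad=True)         # source samples
+    y = torch.randn(30, 2)                             # target samples
+    a = torch.full((30,), 1 / 30, requires_grad=True)  # source weights
+    b = torch.full((30,), 1 / 30)                      # target weights
+
+    M = ot.dist(x, y)        # differentiable ground cost matrix
+    loss = ot.emd2(a, b, M)  # exact OT loss through the torch backend
+    loss.backward()          # gradients w.r.t. x and a
+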
+Other novel features include implementations of `Sampled Gromov
+Wasserstein and Pointwise Gromov
+Wasserstein <https://PythonOT.github.io/auto_examples/gromov/plot_gromov.html#compute-gw-with-a-scalable-stochastic-method-with-any-loss-function>`__,
+Sinkhorn in log space with ``method='sinkhorn_log'``, `Projection Robust
+Wasserstein <https://PythonOT.github.io/gen_modules/ot.dr.html?highlight=robust#ot.dr.projection_robust_wasserstein>`__,
+and `debiased Sinkhorn
+barycenters <https://PythonOT.github.io/auto_examples/barycenters/plot_debiased_barycenter.html>`__.
+
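+For instance, the log-domain solver is selected purely through the
+``method`` argument (``a``, ``b``, ``M`` and ``reg`` as in the usual
+Sinkhorn call); it is a useful fallback when small regularization
+values make the standard iterations underflow:
+
+.. code:: python
+
+    # numerically stable Sinkhorn in log space
+    T = ot.sinkhorn(a, b, M, reg=1e-3, method='sinkhorn_log')
+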
+This release also simplifies the installation process. We now have a
+``pyproject.toml`` that defines the build dependencies, and POT should
+now build even when cython is not installed yet. We also provide
+pre-compiled wheels for linux ``aarch64``, which is used on Raspberry
+Pi and Android phones, and for MacOS on ARM processors.
+
+Finally POT was accepted for publication in the Journal of Machine
+Learning Research (JMLR) open source software track and we ask POT
+users to cite `this
+paper <https://www.jmlr.org/papers/v22/20-451.html>`__ from now on. The
+documentation has been improved in particular by adding a "Why OT?"
+section to the quick start guide and several new examples illustrating
+the new features. The documentation now has two versions: the stable
+version https://pythonot.github.io/ corresponding to the last release
+and the master version https://pythonot.github.io/master that
+corresponds to the current master branch on GitHub.
+
+As usual, we want to thank all the POT contributors (now 37 people have
+contributed to the toolbox). But for this release we thank in particular
+Nathan Cassereau and Kamel Guerda from the AI support team at
+`IDRIS <http://www.idris.fr/>`__ for their support in the development
+of the backend and OpenMP implementations.
+
+New features
+^^^^^^^^^^^^
+
+- OpenMP support for exact OT solvers (PR #260)
+- Backend for running POT in numpy/torch + exact solver (PR #249)
+- Backend implementation of most functions in ``ot.bregman`` (PR #280)
+- Backend implementation of most functions in ``ot.optim`` (PR #282)
+- Backend implementation of most functions in ``ot.gromov`` (PR #294,
+ PR #302)
+- Test for arrays of different type and device (CPU/GPU) (PR #304,
+ #303)
+- Implementation of Sinkhorn in log space with
+ ``method='sinkhorn_log'`` (PR #290)
+- Implementation of regularization path for L2 Unbalanced OT (PR #274)
+- Implementation of Projection Robust Wasserstein (PR #267)
+- Implementation of Debiased Sinkhorn Barycenters (PR #291)
+- Implementation of Sampled Gromov Wasserstein and Pointwise Gromov
+ Wasserstein (PR #275)
+- Add ``pyproject.toml`` and build POT without installing cython first
+ (PR #293)
+- Lazy implementation in log space for sinkhorn on samples (PR #259)
+- Documentation cleanup (PR #298)
+- Two up-to-date documentations `for stable
+ release <https://PythonOT.github.io/>`__ and for `master
+ branch <https://pythonot.github.io/master/>`__.
+- Building wheels on ARM for Raspberry Pi and smartphones (PR #238)
+- Update build wheels to new version and new pythons (PR #236, #253)
+- Implementation of sliced Wasserstein distance (Issue #202, PR #203)
+- Add minimal build to CI and perform pep8 test separately (PR #210)
+- Speedup of tests and return run time (PR #262)
+- Add "Why OT" discussion to the documentation (PR #220)
+- New introductory example to discrete OT in the documentation (PR
+ #191)
+- Add templates for Issues/PR on Github (PR #181)
+
+Closed issues
+^^^^^^^^^^^^^
+
+- Debug Memory leak in GAN example (#254)
+- Debug GPU bug (Issue #284, #287, PR #288)
+- set\_gradients method for JAX backend (PR #278)
+- Quicker GAN example for CircleCI build (PR #258)
+- Better formatting in Readme (PR #234)
+- Debug CI tests (PR #240, #241, #242)
+- Bug in Partial OT solver dummy points (PR #215)
+- Bug in Armijo linesearch (Issue #184, #198, #281, PR #189, #199,
+ #286)
+- Bug in Barycenter Sinkhorn (Issue #134, PR #195)
+- Infeasible solution in exact OT (Issues #126, #93, PR #217)
+- Doc for Support Barycenters (Issue #200, PR #201)
+- Fix labels transport in BaseTransport (Issue #207, PR #208)
+- Bug in ``emd_1d``, non respected bounds (Issue #169, PR #170)
+- Removed Python 2.7 support and update codecov file (PR #178)
+- Add normalization for WDA and test it (PR #172, #296)
+- Cleanup code for new version of ``flake8`` (PR #176)
+- Fixed requirements in ``setup.py`` (PR #174)
+- Removed specific MacOS flags (PR #175)
+
0.7.0
-----
@@ -50,7 +176,7 @@ problems.
This release is also the moment to thank all the POT contributors (old
and new) for helping making POT such a nice toolbox. A lot of changes
-(also in the API) are comming for the next versions.
+(also in the API) are coming for the next versions.
Features
^^^^^^^^
@@ -72,6 +198,8 @@ Features
Closed issues
^^^^^^^^^^^^^
+- Add JMLR paper to the readme and Mathieu Blondel to the
+  Acknowledgments (PR #231, #232)
- Bug in Unbalanced OT example (Issue #127)
- Clean Cython output when calling setup.py clean (Issue #122)
- Various Macosx compilation problems (Issue #113, Issue #118, PR#130)
@@ -103,8 +231,8 @@ mathematical problems and research but with the new contributions we now
implement algorithms and solvers from 24 scientific papers (listed in
the README.md file). New features include a direct implementation of the
`empirical Sinkhorn
-divergence <all.html#ot.bregman.empirical_sinkhorn_divergence>`__
-, a new efficient (Cython implementation) solver for `EMD in
+divergence <all.html#ot.bregman.empirical_sinkhorn_divergence>`__,
+a new efficient (Cython implementation) solver for `EMD in
1D <all.html#ot.lp.emd_1d>`__ and
corresponding `Wasserstein
1D <all.html#ot.lp.wasserstein_1d>`__.