Merge pull request #89 from rtavenar/master

[MRG] EMD and Wasserstein 1D
author: Rémi Flamary <remi.flamary@gmail.com> 2019-06-27 14:08:21 +0200
committer: GitHub <noreply@github.com> 2019-06-27 14:08:21 +0200
commit: a9b8af146648ee2ae50baf46e69e6281f6b279e4 (patch)
tree: c97a7359bdf7de19d7a1cc325304852622a8f580
parent: 2364d56aad650d501753cc93a69ea1b8ddf28b0a (diff)
parent: 362a7f8fa20cf7ae6f2e36d7e47c7ca9f81d3c51 (diff)
5 files changed, 445 insertions, 9 deletions
diff --git a/README.md b/README.md
index d24d8b9..84148f8 100644
--- a/README.md
+++ b/README.md
@@ -167,6 +167,7 @@ The contributors to this library are:
 * [Alain Rakotomamonjy](https://sites.google.com/site/alainrakotomamonjy/home)
 * [Vayer Titouan](https://tvayer.github.io/)
 * [Hicham Janati](https://hichamjanati.github.io/) (Unbalanced OT)
+* [Romain Tavenard](https://rtavenar.github.io/) (1d Wasserstein)
 
 This toolbox benefit a lot from open source research and we would like to thank the following persons for providing some code (in various languages):
 
diff --git a/ot/__init__.py b/ot/__init__.py
index 3892d1d..1b3c2fb 100644
--- a/ot/__init__.py
+++ b/ot/__init__.py
@@ -23,7 +23,7 @@ from . import stochastic
 from . import unbalanced
 
 # OT functions
-from .lp import emd, emd2
+from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d
 from .bregman import sinkhorn, sinkhorn2, barycenter
 from .unbalanced import sinkhorn_unbalanced, barycenter_unbalanced
 from .da import sinkhorn_lpl1_mm
@@ -33,7 +33,8 @@ from .utils import dist, unif, tic, toc, toq
 
 __version__ = "0.5.1"
 
-__all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets',
+__all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets',
            'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov',
+           'emd_1d', 'emd2_1d', 'wasserstein_1d',
            'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim',
            'sinkhorn_unbalanced', "barycenter_unbalanced"]
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 02cbd8c..a3f5b8d 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -10,16 +10,18 @@ Solvers for the original linear program OT problem
 import multiprocessing
 
 import numpy as np
+from scipy.sparse import coo_matrix
 
 from .import cvx
 
 # import compiled emd
-from .emd_wrap import emd_c, check_result
+from .emd_wrap import emd_c, check_result, emd_1d_sorted
 from ..utils import parmap
 from .cvx import barycenter
 from ..utils import dist
 
-__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx']
+__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
+         'emd_1d', 'emd2_1d', 'wasserstein_1d']
 
 
 def emd(a, b, M, numItermax=100000, log=False):
@@ -94,7 +96,7 @@ def emd(a, b, M, numItermax=100000, log=False):
     b = np.asarray(b, dtype=np.float64)
     M = np.asarray(M, dtype=np.float64)
 
-    # if empty array given then use unifor distributions
+    # if empty array given then use uniform distributions
     if len(a) == 0:
         a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
     if len(b) == 0:
@@ -187,7 +189,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     b = np.asarray(b, dtype=np.float64)
     M = np.asarray(M, dtype=np.float64)
 
-    # if empty array given then use unifor distributions
+    # if empty array given then use uniform distributions
     if len(a) == 0:
         a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
     if len(b) == 0:
@@ -308,4 +310,289 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
         log_dict['displacement_square_norms'] = displacement_square_norms
         return X, log_dict
     else:
-        return X
-\ No newline at end of file
+        return X
+
+
+def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+           log=False):
+    """Solves the Earth Movers distance problem between 1d measures and returns
+    the OT matrix
+
+
+    .. math::
+        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+    where :
+
+    - d is the metric
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (ns, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    metric: str, optional (default='sqeuclidean')
+        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+        Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'cityblock'`,  or `'euclidean'` metrics are used.
+    p: float, optional (default=1.0)
+         The p-norm to apply for if metric='minkowski'
+    dense: boolean, optional (default=True)
+        If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+        Otherwise returns a sparse representation using scipy's `coo_matrix`
+        format. Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
+        are used.
+    log: boolean, optional (default=False)
+        If True, returns a dictionary containing the cost.
+        Otherwise returns only the optimal transportation matrix.
+
+    Returns
+    -------
+    gamma: (ns, nt) ndarray
+        Optimal transportation matrix for the given parameters
+    log: dict
+        If input log is True, a dictionary containing the cost
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function emd_1d accepts lists and
+    performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.emd_1d(x_a, x_b, a, b)
+    array([[0. ,  0.5],
+           [0.5,  0. ]])
+    >>> ot.emd_1d(x_a, x_b)
+    array([[0. ,  0.5],
+           [0.5,  0. ]])
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd : EMD for multidimensional distributions
+    ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the
+        transportation matrix)
+    """
+    a = np.asarray(a, dtype=np.float64)
+    b = np.asarray(b, dtype=np.float64)
+    x_a = np.asarray(x_a, dtype=np.float64)
+    x_b = np.asarray(x_b, dtype=np.float64)
+
+    assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \
+        "emd_1d should only be used with monodimensional data"
+    assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \
+        "emd_1d should only be used with monodimensional data"
+
+    # if empty array given then use uniform distributions
+    if a.ndim == 0 or len(a) == 0:
+        a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0]
+    if b.ndim == 0 or len(b) == 0:
+        b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]
+
+    x_a_1d = x_a.reshape((-1, ))
+    x_b_1d = x_b.reshape((-1, ))
+    perm_a = np.argsort(x_a_1d)
+    perm_b = np.argsort(x_b_1d)
+
+    G_sorted, indices, cost = emd_1d_sorted(a, b,
+                                            x_a_1d[perm_a], x_b_1d[perm_b],
+                                            metric=metric, p=p)
+    G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
+                   shape=(a.shape[0], b.shape[0]))
+    if dense:
+        G = G.toarray()
+    if log:
+        log = {'cost': cost}
+        return G, log
+    return G
+
+
+def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+            log=False):
+    """Solves the Earth Movers distance problem between 1d measures and returns
+    the loss
+
+
+    .. math::
+        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+    where :
+
+    - d is the metric
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (ns, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    metric: str, optional (default='sqeuclidean')
+        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+        Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
+        are used.
+    p: float, optional (default=1.0)
+         The p-norm to apply for if metric='minkowski'
+    dense: boolean, optional (default=True)
+        If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+        Otherwise returns a sparse representation using scipy's `coo_matrix`
+        format. Only used if log is set to True. Due to implementation details,
+        this function runs faster when dense is set to False.
+    log: boolean, optional (default=False)
+        If True, returns a dictionary containing the transportation matrix.
+        Otherwise returns only the loss.
+
+    Returns
+    -------
+    loss: float
+        Cost associated to the optimal transportation
+    log: dict
+        If input log is True, a dictionary containing the Optimal transportation
+        matrix for the given parameters
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function emd2_1d accepts lists and
+    performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.emd2_1d(x_a, x_b, a, b)
+    0.5
+    >>> ot.emd2_1d(x_a, x_b)
+    0.5
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd2 : EMD for multidimensional distributions
+    ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix
+        instead of the cost)
+    """
+    # If we do not return G (log==False), then we should not to cast it to dense
+    # (useless overhead)
+    G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p,
+                        dense=dense and log, log=True)
+    cost = log_emd['cost']
+    if log:
+        log_emd = {'G': G}
+        return cost, log_emd
+    return cost
+
+
+def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.):
+    """Solves the p-Wasserstein distance problem between 1d measures and returns
+    the distance
+
+
+    .. math::
+        \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij}
+            |x_a[i] - x_b[j]|^p \\right)^{1/p}
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+    where :
+
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (ns, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    p: float, optional (default=1.0)
+         The order of the p-Wasserstein distance to be computed
+
+    Returns
+    -------
+    dist: float
+        p-Wasserstein distance
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function wasserstein_1d accepts
+    lists and performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.wasserstein_1d(x_a, x_b, a, b)
+    0.5
+    >>> ot.wasserstein_1d(x_a, x_b)
+    0.5
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd_1d : EMD for 1d distributions
+    """
+    cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+                       dense=False, log=False)
+    return np.power(cost_emd, 1. / p)
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 83ee6aa..8a4aec9 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -10,7 +10,10 @@ Cython linker with C solver
 import numpy as np
 cimport numpy as np
 
+from ..utils import dist
+
 cimport cython
+cimport libc.math as math
 
 import warnings
 
@@ -93,3 +96,89 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
     cdef int result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
 
     return G, cost, alpha, beta, result_code
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
+                  np.ndarray[double, ndim=1, mode="c"] v_weights,
+                  np.ndarray[double, ndim=1, mode="c"] u,
+                  np.ndarray[double, ndim=1, mode="c"] v,
+                  str metric='sqeuclidean',
+                  double p=1.):
+    r"""
+    Solves the Earth Movers distance problem between sorted 1d measures and
+    returns the OT matrix and the associated cost
+
+    Parameters
+    ----------
+    u_weights : (ns,) ndarray, float64
+        Source histogram
+    v_weights : (nt,) ndarray, float64
+        Target histogram
+    u : (ns,) ndarray, float64
+        Source dirac locations (on the real line)
+    v : (nt,) ndarray, float64
+        Target dirac locations (on the real line)
+    metric: str, optional (default='sqeuclidean')
+        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+        Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
+        are used.
+    p: float, optional (default=1.0)
+         The p-norm to apply for if metric='minkowski'
+
+    Returns
+    -------
+    gamma: (n, ) ndarray, float64
+        Values in the Optimal transportation matrix
+    indices: (n, 2) ndarray, int64
+        Indices of the values stored in gamma for the Optimal transportation
+        matrix
+    cost
+        cost associated to the optimal transportation
+    """
+    cdef double cost = 0.
+    cdef int n = u_weights.shape[0]
+    cdef int m = v_weights.shape[0]
+
+    cdef int i = 0
+    cdef double w_i = u_weights[0]
+    cdef int j = 0
+    cdef double w_j = v_weights[0]
+
+    cdef double m_ij = 0.
+
+    cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ),
+                                                           dtype=np.float64)
+    cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2),
+                                                              dtype=np.int)
+    cdef int cur_idx = 0
+    while i < n and j < m:
+        if metric == 'sqeuclidean':
+            m_ij = (u[i] - v[j]) * (u[i] - v[j])
+        elif metric == 'cityblock' or metric == 'euclidean':
+            m_ij = math.fabs(u[i] - v[j])
+        elif metric == 'minkowski':
+            m_ij = math.pow(math.fabs(u[i] - v[j]), p)
+        else:
+            m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)),
+                        metric=metric)[0, 0]
+        if w_i < w_j or j == m - 1:
+            cost += m_ij * w_i
+            G[cur_idx] = w_i
+            indices[cur_idx, 0] = i
+            indices[cur_idx, 1] = j
+            i += 1
+            w_j -= w_i
+            w_i = u_weights[i]
+        else:
+            cost += m_ij * w_j
+            G[cur_idx] = w_j
+            indices[cur_idx, 0] = i
+            indices[cur_idx, 1] = j
+            j += 1
+            w_i -= w_j
+            w_j = v_weights[j]
+        cur_idx += 1
+    return G[:cur_idx], indices[:cur_idx], cost
diff --git a/test/test_ot.py b/test/test_ot.py
index 7652394..3c4ac11 100644
--- a/test/test_ot.py
+++ b/test/test_ot.py
@@ -7,6 +7,7 @@
 import warnings
 
 import numpy as np
+from scipy.stats import wasserstein_distance
 
 import ot
 from ot.datasets import make_1D_gauss as gauss
@@ -37,7 +38,7 @@ def test_emd_emd2():
 
     # check G is identity
     np.testing.assert_allclose(G, np.eye(n) / n)
-    # check constratints
+    # check constraints
     np.testing.assert_allclose(u, G.sum(1))  # cf convergence sinkhorn
     np.testing.assert_allclose(u, G.sum(0))  # cf convergence sinkhorn
 
@@ -46,6 +47,63 @@ def test_emd_emd2():
     np.testing.assert_allclose(w, 0)
 
 
+def test_emd_1d_emd2_1d():
+    # test emd1d gives similar results as emd
+    n = 20
+    m = 30
+    rng = np.random.RandomState(0)
+    u = rng.randn(n, 1)
+    v = rng.randn(m, 1)
+
+    M = ot.dist(u, v, metric='sqeuclidean')
+
+    G, log = ot.emd([], [], M, log=True)
+    wass = log["cost"]
+    G_1d, log = ot.emd_1d(u, v, [], [], metric='sqeuclidean', log=True)
+    wass1d = log["cost"]
+    wass1d_emd2 = ot.emd2_1d(u, v, [], [], metric='sqeuclidean', log=False)
+    wass1d_euc = ot.emd2_1d(u, v, [], [], metric='euclidean', log=False)
+
+    # check loss is similar
+    np.testing.assert_allclose(wass, wass1d)
+    np.testing.assert_allclose(wass, wass1d_emd2)
+
+    # check loss is similar to scipy's implementation for Euclidean metric
+    wass_sp = wasserstein_distance(u.reshape((-1, )), v.reshape((-1, )))
+    np.testing.assert_allclose(wass_sp, wass1d_euc)
+
+    # check constraints
+    np.testing.assert_allclose(np.ones((n, )) / n, G.sum(1))
+    np.testing.assert_allclose(np.ones((m, )) / m, G.sum(0))
+
+    # check G is similar
+    np.testing.assert_allclose(G, G_1d)
+
+    # check AssertionError is raised if called on non 1d arrays
+    u = np.random.randn(n, 2)
+    v = np.random.randn(m, 2)
+    np.testing.assert_raises(AssertionError, ot.emd_1d, u, v, [], [])
+
+
+def test_wass_1d():
+    # test emd1d gives similar results as emd
+    n = 20
+    m = 30
+    rng = np.random.RandomState(0)
+    u = rng.randn(n, 1)
+    v = rng.randn(m, 1)
+
+    M = ot.dist(u, v, metric='sqeuclidean')
+
+    G, log = ot.emd([], [], M, log=True)
+    wass = log["cost"]
+
+    wass1d = ot.wasserstein_1d(u, v, [], [], p=2.)
+
+    # check loss is similar
+    np.testing.assert_allclose(np.sqrt(wass), wass1d)
+
+
 def test_emd_empty():
     # test emd and emd2 for simple identity
     n = 100
@@ -60,7 +118,7 @@ def test_emd_empty():
 
     # check G is identity
     np.testing.assert_allclose(G, np.eye(n) / n)
-    # check constratints
+    # check constraints
     np.testing.assert_allclose(u, G.sum(1))  # cf convergence sinkhorn
     np.testing.assert_allclose(u, G.sum(0))  # cf convergence sinkhorn
author	Rémi Flamary <remi.flamary@gmail.com>	2019-06-27 14:08:21 +0200
committer	GitHub <noreply@github.com>	2019-06-27 14:08:21 +0200
commit	a9b8af146648ee2ae50baf46e69e6281f6b279e4 (patch)
tree	c97a7359bdf7de19d7a1cc325304852622a8f580
parent	2364d56aad650d501753cc93a69ea1b8ddf28b0a (diff)
parent	362a7f8fa20cf7ae6f2e36d7e47c7ca9f81d3c51 (diff)