From f63f34f8adb6943b6410f8b773b4b4d8f1c7b4ba Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 20 Jun 2019 14:29:56 +0200 Subject: EMD 1d without doc --- ot/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'ot/__init__.py') diff --git a/ot/__init__.py b/ot/__init__.py index b74b924..5d5b700 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . import stochastic # OT functions -from .lp import emd, emd2 +from .lp import emd, emd2, emd_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -31,6 +31,6 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" -__all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets', +__all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] -- cgit v1.2.3 From 18502d6861a4977cbade957f2e48eeb8dbb55414 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 11:21:08 +0200 Subject: Sparse G matrix for EMD1d + standard metrics computed without cdist --- ot/__init__.py | 4 ++-- ot/lp/emd_wrap.pyx | 29 +++++++++++++++++++++-------- test/test_ot.py | 23 ++++++++++++++++++----- 3 files changed, 41 insertions(+), 15 deletions(-) (limited to 'ot/__init__.py') diff --git a/ot/__init__.py b/ot/__init__.py index 5d5b700..f0e526c 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . import stochastic # OT functions -from .lp import emd, emd2, emd_1d +from .lp import emd, emd2, emd_1d, emd2_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -32,5 +32,5 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', - 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', + 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'emd_1d', 'emd2_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 2966206..ab88d7f 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -101,8 +101,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod @cython.wraparound(False) def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v_weights, - np.ndarray[double, ndim=2, mode="c"] u, - np.ndarray[double, ndim=2, mode="c"] v, + np.ndarray[double, ndim=1, mode="c"] u, + np.ndarray[double, ndim=1, mode="c"] v, str metric='sqeuclidean'): r""" Roro's stuff @@ -118,21 +118,34 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, cdef double m_ij = 0. 
- cdef np.ndarray[double, ndim=2, mode="c"] G = np.zeros((n, m), + cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ), dtype=np.float64) + cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2), + dtype=np.int) + cdef int cur_idx = 0 while i < n and j < m: - m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), - metric=metric)[0, 0] + if metric == 'sqeuclidean': + m_ij = (u[i] - v[j]) ** 2 + elif metric == 'cityblock' or metric == 'euclidean': + m_ij = np.abs(u[i] - v[j]) + else: + m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), + metric=metric)[0, 0] if w_i < w_j or j == m - 1: cost += m_ij * w_i - G[i, j] = w_i + G[cur_idx] = w_i + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j i += 1 w_j -= w_i w_i = u_weights[i] else: cost += m_ij * w_j - G[i, j] = w_j + G[cur_idx] = w_j + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j j += 1 w_i -= w_j w_j = v_weights[j] - return G, cost \ No newline at end of file + cur_idx += 1 + return G[:cur_idx], indices[:cur_idx], cost diff --git a/test/test_ot.py b/test/test_ot.py index 7008002..2a2e0a5 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -7,6 +7,7 @@ import warnings import numpy as np +from scipy.stats import wasserstein_distance import ot from ot.datasets import make_1D_gauss as gauss @@ -37,7 +38,7 @@ def test_emd_emd2(): # check G is identity np.testing.assert_allclose(G, np.eye(n) / n) - # check constratints + # check constraints np.testing.assert_allclose(u, G.sum(1)) # cf convergence sinkhorn np.testing.assert_allclose(u, G.sum(0)) # cf convergence sinkhorn @@ -46,12 +47,13 @@ def test_emd_emd2(): np.testing.assert_allclose(w, 0) -def test_emd1d(): +def test_emd_1d_emd2_1d(): # test emd1d gives similar results as emd n = 20 m = 30 - u = np.random.randn(n, 1) - v = np.random.randn(m, 1) + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) M = ot.dist(u, v, metric='sqeuclidean') @@ -59,9 +61,20 @@ def test_emd1d(): wass = log["cost"] G_1d, log = ot.emd_1d([], [], u, v, metric='sqeuclidean', log=True) wass1d = log["cost"] + wass1d_emd2 = ot.emd2_1d([], [], u, v, metric='sqeuclidean', log=False) + wass1d_euc = ot.emd2_1d([], [], u, v, metric='euclidean', log=False) # check loss is similar np.testing.assert_allclose(wass, wass1d) + np.testing.assert_allclose(wass, wass1d_emd2) + + # check loss is similar to scipy's implementation for Euclidean metric + wass_sp = wasserstein_distance(u.reshape((-1, )), v.reshape((-1, ))) + np.testing.assert_allclose(wass_sp, wass1d_euc) + + # check constraints + np.testing.assert_allclose(np.ones((n, )) / n, G.sum(1)) + np.testing.assert_allclose(np.ones((m, )) / m, G.sum(0)) # check G is similar np.testing.assert_allclose(G, G_1d) @@ -86,7 +99,7 @@ def test_emd_empty(): # check G is identity np.testing.assert_allclose(G, np.eye(n) / n) - # check constratints + # check constraints np.testing.assert_allclose(u, G.sum(1)) # cf convergence sinkhorn np.testing.assert_allclose(u, G.sum(0)) # cf convergence sinkhorn -- cgit v1.2.3 From 0d333e004636f5d25edea6bb195e8e4d9a95ba98 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 10:23:32 +0200 Subject: Improved tests and docs for wasserstein_1d --- ot/__init__.py | 5 +++-- ot/lp/__init__.py | 13 ++++++------- ot/lp/emd_wrap.pyx | 3 ++- test/test_ot.py | 23 +++++++++++++++++++++++ 4 files changed, 34 insertions(+), 10 deletions(-) (limited to 'ot/__init__.py') diff --git a/ot/__init__.py b/ot/__init__.py index f0e526c..5bd9bb3 100644 --- a/ot/__init__.py +++ 
b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . import stochastic # OT functions -from .lp import emd, emd2, emd_1d, emd2_1d +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d, wasserstein2_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -32,5 +32,6 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', - 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'emd_1d', 'emd2_1d', + 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 719032b..76c9ec0 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -530,13 +530,13 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): - """Solves the Wasserstein distance problem between 1d measures and returns + """Solves the p-Wasserstein distance problem between 1d measures and returns the OT matrix .. math:: - \gamma = arg\min_\gamma \left(\sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \right)^{1/p} + \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} + |x_a[i] - x_b[j]|^p \\right)^{1/p} s.t. \gamma 1 = a, \gamma^T 1= b, @@ -617,15 +617,14 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): dense=dense, log=log) -def wasserstein2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., - dense=True, log=False): - """Solves the Wasserstein distance problem between 1d measures and returns +def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): + """Solves the p-Wasserstein distance problem between 1d measures and returns the loss .. math:: \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \right)^{1/p} + |x_a[i] - x_b[j]|^p \\right)^{1/p} s.t. 
\gamma 1 = a, \gamma^T 1= b, diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 7134136..42b848f 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -13,6 +13,7 @@ cimport numpy as np from ..utils import dist cimport cython +cimport libc.math as math import warnings @@ -159,7 +160,7 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, elif metric == 'cityblock' or metric == 'euclidean': m_ij = abs(u[i] - v[j]) elif metric == 'minkowski': - m_ij = abs(u[i] - v[j]) ** p + m_ij = math.pow(abs(u[i] - v[j]), p) else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] diff --git a/test/test_ot.py b/test/test_ot.py index 6d6ea26..48423e7 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -85,6 +85,29 @@ def test_emd_1d_emd2_1d(): np.testing.assert_raises(AssertionError, ot.emd_1d, u, v, [], []) +def test_wass_1d(): + # test emd1d gives similar results as emd + n = 20 + m = 30 + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) + + M = ot.dist(u, v, metric='sqeuclidean') + + G, log = ot.emd([], [], M, log=True) + wass = log["cost"] + + G_1d, log = ot.wasserstein_1d(u, v, [], [], p=2., log=True) + wass1d = log["cost"] + + # check loss is similar + np.testing.assert_allclose(np.sqrt(wass), wass1d) + + # check G is similar + np.testing.assert_allclose(G, G_1d) + + def test_emd_empty(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From c92e595009ad5e2ae6d4b2c040556cffb6316847 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 11:08:15 +0200 Subject: Wasserstein defined as the cost itself (do not return transportation matrix) --- ot/__init__.py | 4 +- ot/lp/__init__.py | 125 +++++------------------------------------------------- test/test_ot.py | 6 +-- 3 files changed, 13 insertions(+), 122 deletions(-) (limited to 'ot/__init__.py') diff --git a/ot/__init__.py b/ot/__init__.py index 730aa4f..1b3c2fb 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -23,7 +23,7 @@ from . import stochastic from . import unbalanced # OT functions -from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d, wasserstein2_1d +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .unbalanced import sinkhorn_unbalanced, barycenter_unbalanced from .da import sinkhorn_lpl1_mm @@ -35,6 +35,6 @@ __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', - 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d', + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim', 'sinkhorn_unbalanced', "barycenter_unbalanced"] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 76c9ec0..a3f5b8d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -21,7 +21,7 @@ from .cvx import barycenter from ..utils import dist __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', - 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d'] + 'emd_1d', 'emd2_1d', 'wasserstein_1d'] def emd(a, b, M, numItermax=100000, log=False): @@ -529,9 +529,9 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, return cost -def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): +def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.): """Solves the p-Wasserstein distance problem between 1d measures and returns - the OT matrix + the distance .. 
math:: @@ -560,22 +560,11 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): Target histogram (default is uniform weight) p: float, optional (default=1.0) The order of the p-Wasserstein distance to be computed - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. Due to implementation details, this function runs faster when - `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics - are used. - log: boolean, optional (default=False) - If True, returns a dictionary containing the cost. - Otherwise returns only the optimal transportation matrix. Returns ------- - gamma: (ns, nt) ndarray - Optimal transportation matrix for the given parameters - log: dict - If input log is True, a dictionary containing the cost + dist: float + p-Wasserstein distance Examples @@ -590,96 +579,8 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): >>> x_a = [2., 0.] >>> x_b = [0., 3.] >>> ot.wasserstein_1d(x_a, x_b, a, b) - array([[0. , 0.5], - [0.5, 0. ]]) - >>> ot.wasserstein_1d(x_a, x_b) - array([[0. , 0.5], - [0.5, 0. ]]) - - References - ---------- - - .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal - Transport", 2018. - - See Also - -------- - ot.lp.emd_1d : EMD for 1d distributions - ot.lp.wasserstein2_1d : Wasserstein for 1d distributions (returns the cost - instead of the transportation matrix) - """ - if log: - G, log = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - log['cost'] = np.power(log['cost'], 1. / p) - return G, log - return emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - - -def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): - """Solves the p-Wasserstein distance problem between 1d measures and returns - the loss - - - .. math:: - \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \\right)^{1/p} - - s.t. \gamma 1 = a, - \gamma^T 1= b, - \gamma\geq 0 - where : - - - x_a and x_b are the samples - - a and b are the sample weights - - Uses the algorithm detailed in [1]_ - - Parameters - ---------- - x_a : (ns,) or (ns, 1) ndarray, float64 - Source dirac locations (on the real line) - x_b : (nt,) or (ns, 1) ndarray, float64 - Target dirac locations (on the real line) - a : (ns,) ndarray, float64, optional - Source histogram (default is uniform weight) - b : (nt,) ndarray, float64, optional - Target histogram (default is uniform weight) - p: float, optional (default=1.0) - The order of the p-Wasserstein distance to be computed - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. Only used if log is set to True. Due to implementation details, - this function runs faster when dense is set to False. - log: boolean, optional (default=False) - If True, returns a dictionary containing the transportation matrix. - Otherwise returns only the loss. - - Returns - ------- - loss: float - Cost associated to the optimal transportation - log: dict - If input log is True, a dictionary containing the Optimal transportation - matrix for the given parameters - - - Examples - -------- - - Simple example with obvious solution. 
The function wasserstein2_1d accepts - lists and performs automatic conversion to numpy arrays - - >>> import ot - >>> a=[.5, .5] - >>> b=[.5, .5] - >>> x_a = [2., 0.] - >>> x_b = [0., 3.] - >>> ot.wasserstein2_1d(x_a, x_b, a, b) 0.5 - >>> ot.wasserstein2_1d(x_a, x_b) + >>> ot.wasserstein_1d(x_a, x_b) 0.5 References @@ -690,14 +591,8 @@ def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): See Also -------- - ot.lp.emd2_1d : EMD for 1d distributions - ot.lp.wasserstein_1d : Wasserstein for 1d distributions (returns the - transportation matrix instead of the cost) + ot.lp.emd_1d : EMD for 1d distributions """ - if log: - cost, log = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - cost = np.power(cost, 1. / p) - return cost, log - return np.power(emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log), 1. / p) \ No newline at end of file + cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=False, log=False) + return np.power(cost_emd, 1. / p) diff --git a/test/test_ot.py b/test/test_ot.py index 48423e7..3c4ac11 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -98,15 +98,11 @@ def test_wass_1d(): G, log = ot.emd([], [], M, log=True) wass = log["cost"] - G_1d, log = ot.wasserstein_1d(u, v, [], [], p=2., log=True) - wass1d = log["cost"] + wass1d = ot.wasserstein_1d(u, v, [], [], p=2.) # check loss is similar np.testing.assert_allclose(np.sqrt(wass), wass1d) - # check G is similar - np.testing.assert_allclose(G, G_1d) - def test_emd_empty(): # test emd and emd2 for simple identity -- cgit v1.2.3
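The sweep implemented in emd_1d_sorted above is the classical closed-form solver for one-dimensional optimal transport: once both supports are sorted, mass is pushed greedily between the two current atoms, so at most n + m - 1 entries of the plan are non-zero. The plain-NumPy sketch below mirrors that Cython routine for reference only; the helper name emd_1d_sorted_py, the explicit bounds guards and the reduced metric handling are illustrative assumptions rather than part of the POT API, and it expects sorted supports with weight vectors that each sum to one.

    # Illustrative pure-NumPy sketch of the sorted-sweep procedure used by
    # emd_1d_sorted in the patches above.  Not part of the POT API.
    import numpy as np
    from scipy.stats import wasserstein_distance  # also used in the patched tests


    def emd_1d_sorted_py(u_weights, v_weights, u, v, metric="sqeuclidean", p=1.0):
        n, m = len(u), len(v)
        # At most n + m - 1 entries of the plan are non-zero, which is why the
        # second patch stores (flow, indices) instead of a dense (n, m) matrix G.
        flow = np.zeros(n + m - 1)
        indices = np.zeros((n + m - 1, 2), dtype=int)
        cost = 0.0
        i = j = cur = 0
        w_i, w_j = u_weights[0], v_weights[0]
        while i < n and j < m:
            # Ground cost between the current pair of support points.
            if metric == "sqeuclidean":
                m_ij = (u[i] - v[j]) ** 2
            elif metric in ("cityblock", "euclidean"):
                m_ij = abs(u[i] - v[j])
            elif metric == "minkowski":
                m_ij = abs(u[i] - v[j]) ** p
            else:
                raise ValueError("metric not covered by this sketch")
            # Move as much mass as possible between the two current atoms,
            # then advance on the side whose remaining mass is exhausted.
            if w_i < w_j or j == m - 1:
                cost += m_ij * w_i
                flow[cur] = w_i
                indices[cur] = (i, j)
                i += 1
                w_j -= w_i
                w_i = u_weights[i] if i < n else 0.0
            else:
                cost += m_ij * w_j
                flow[cur] = w_j
                indices[cur] = (i, j)
                j += 1
                w_i -= w_j
                w_j = v_weights[j] if j < m else 0.0
            cur += 1
        return flow[:cur], indices[:cur], cost


    # Quick consistency check in the spirit of test_emd_1d_emd2_1d: with the
    # 'euclidean' ground cost the sweep cost equals scipy's 1-Wasserstein distance.
    rng = np.random.RandomState(0)
    u = np.sort(rng.randn(20))
    v = np.sort(rng.randn(30))
    a = np.full(20, 1.0 / 20)
    b = np.full(30, 1.0 / 30)
    _, _, cost = emd_1d_sorted_py(a, b, u, v, metric="euclidean")
    assert np.allclose(cost, wasserstein_distance(u, v))

The (flow, indices) pair returned here is the kind of triplet representation that can be handed to scipy's coo_matrix to recover the sparse plan referred to in the "Sparse G matrix" commit, and raising the 'minkowski' cost to the power 1/p afterwards is what the final wasserstein_1d wrapper does via emd2_1d to obtain the p-Wasserstein distance.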