From f63f34f8adb6943b6410f8b773b4b4d8f1c7b4ba Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 20 Jun 2019 14:29:56 +0200 Subject: EMD 1d without doc --- ot/__init__.py | 4 ++-- ot/lp/__init__.py | 43 ++++++++++++++++++++++++++++++++++++++----- ot/lp/emd_wrap.pyx | 35 +++++++++++++++++++++++++++++++++++ test/test_ot.py | 26 ++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 7 deletions(-) diff --git a/ot/__init__.py b/ot/__init__.py index b74b924..5d5b700 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . import stochastic # OT functions -from .lp import emd, emd2 +from .lp import emd, emd2, emd_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -31,6 +31,6 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" -__all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets', +__all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 02cbd8c..49ded5b 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -14,12 +14,12 @@ import numpy as np from .import cvx # import compiled emd -from .emd_wrap import emd_c, check_result +from .emd_wrap import emd_c, check_result, emd_1d_sorted from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx'] +__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d_sorted'] def emd(a, b, M, numItermax=100000, log=False): @@ -94,7 +94,7 @@ def emd(a, b, M, numItermax=100000, log=False): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use unifor distributions + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] if len(b) == 0: @@ -187,7 +187,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use unifor distributions + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] if len(b) == 0: @@ -308,4 +308,37 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None log_dict['displacement_square_norms'] = displacement_square_norms return X, log_dict else: - return X \ No newline at end of file + return X + + +def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): + """Solves the Earth Movers distance problem between 1d measures and returns + the OT matrix + + """ + assert x_a.shape[1] == x_b.shape[1] == 1, "emd_1d should only be used " + \ + "with monodimensional data" + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + + # if empty array given then use uniform distributions + if len(a) == 0: + a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0] + if len(b) == 0: + b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] + + perm_a = np.argsort(x_a.reshape((-1, ))) + perm_b = np.argsort(x_b.reshape((-1, ))) + inv_perm_a = np.argsort(perm_a) + inv_perm_b = np.argsort(perm_b) + + M = dist(x_a[perm_a], x_b[perm_b], metric=metric) + + G_sorted, cost = emd_1d_sorted(a, b, M) + G = G_sorted[inv_perm_a, :][:, inv_perm_b] + if log: + log = {} + log['cost'] = cost + return G, log + return G \ No newline at end of file diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 83ee6aa..a3d189d 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -93,3 +93,38 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod cdef int result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) return G, cost, alpha, beta, result_code + + +@cython.boundscheck(False) +@cython.wraparound(False) +def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, + np.ndarray[double, ndim=1, mode="c"] v_weights, + np.ndarray[double, ndim=2, mode="c"] M): + r""" + Roro's stuff + """ + cdef double cost = 0. + cdef int n = u_weights.shape[0] + cdef int m = v_weights.shape[0] + + cdef int i = 0 + cdef double w_i = u_weights[0] + cdef int j = 0 + cdef double w_j = v_weights[0] + + cdef np.ndarray[double, ndim=2, mode="c"] G = np.zeros((n, m), + dtype=np.float64) + while i < n and j < m: + if w_i < w_j or j == m - 1: + cost += M[i, j] * w_i + G[i, j] = w_i + i += 1 + w_j -= w_i + w_i = u_weights[i] + else: + cost += M[i, j] * w_j + G[i, j] = w_j + j += 1 + w_i -= w_j + w_j = v_weights[j] + return G, cost \ No newline at end of file diff --git a/test/test_ot.py b/test/test_ot.py index 7652394..7008002 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -46,6 +46,32 @@ def test_emd_emd2(): np.testing.assert_allclose(w, 0) +def test_emd1d(): + # test emd1d gives similar results as emd + n = 20 + m = 30 + u = np.random.randn(n, 1) + v = np.random.randn(m, 1) + + M = ot.dist(u, v, metric='sqeuclidean') + + G, log = ot.emd([], [], M, log=True) + wass = log["cost"] + G_1d, log = ot.emd_1d([], [], u, v, metric='sqeuclidean', log=True) + wass1d = log["cost"] + + # check loss is similar + np.testing.assert_allclose(wass, wass1d) + + # check G is similar + np.testing.assert_allclose(G, G_1d) + + # check AssertionError is raised if called on non 1d arrays + u = np.random.randn(n, 2) + v = np.random.randn(m, 2) + np.testing.assert_raises(AssertionError, ot.emd_1d, [], [], u, v) + + def test_emd_empty(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From 15b21611a3a93043d30c4eaaf9d622200453a884 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 20 Jun 2019 14:52:23 +0200 Subject: EMD 1d without doc made faster --- ot/lp/__init__.py | 5 ++--- ot/lp/emd_wrap.pyx | 14 +++++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 49ded5b..c4457dc 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -333,9 +333,8 @@ def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): inv_perm_a = np.argsort(perm_a) inv_perm_b = np.argsort(perm_b) - M = dist(x_a[perm_a], x_b[perm_b], metric=metric) - - G_sorted, cost = emd_1d_sorted(a, b, M) + G_sorted, cost = emd_1d_sorted(a, b, x_a[perm_a], x_b[perm_b], + metric=metric) G = G_sorted[inv_perm_a, :][:, inv_perm_b] if log: log = {} diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index a3d189d..2966206 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -10,6 +10,8 @@ Cython linker with C solver import numpy as np cimport numpy as np +from ..utils import dist + cimport cython import warnings @@ -99,7 +101,9 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod @cython.wraparound(False) def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v_weights, - np.ndarray[double, ndim=2, mode="c"] M): + np.ndarray[double, ndim=2, mode="c"] u, + np.ndarray[double, ndim=2, mode="c"] v, + str metric='sqeuclidean'): r""" Roro's stuff """ @@ -112,17 +116,21 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, cdef int j = 0 cdef double w_j = v_weights[0] + cdef double m_ij = 0. + cdef np.ndarray[double, ndim=2, mode="c"] G = np.zeros((n, m), dtype=np.float64) while i < n and j < m: + m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), + metric=metric)[0, 0] if w_i < w_j or j == m - 1: - cost += M[i, j] * w_i + cost += m_ij * w_i G[i, j] = w_i i += 1 w_j -= w_i w_i = u_weights[i] else: - cost += M[i, j] * w_j + cost += m_ij * w_j G[i, j] = w_j j += 1 w_i -= w_j -- cgit v1.2.3 From cada9a3019997e8efb95d96c86985110f1e937b9 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 11:15:42 +0200 Subject: Sparse G matrix for EMD1d + standard metrics computed without cdist --- ot/lp/__init__.py | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index c4457dc..decff29 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -10,6 +10,7 @@ Solvers for the original linear program OT problem import multiprocessing import numpy as np +from scipy.sparse import coo_matrix from .import cvx @@ -19,7 +20,8 @@ from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d_sorted'] +__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', + 'emd_1d', 'emd2_1d'] def emd(a, b, M, numItermax=100000, log=False): @@ -311,16 +313,20 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): +def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix """ - assert x_a.shape[1] == x_b.shape[1] == 1, "emd_1d should only be used " + \ - "with monodimensional data" - a = np.asarray(a, dtype=np.float64) b = np.asarray(b, dtype=np.float64) + x_a = np.asarray(x_a, dtype=np.float64) + x_b = np.asarray(x_b, dtype=np.float64) + + assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \ + "emd_1d should only be used with monodimensional data" + assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \ + "emd_1d should only be used with monodimensional data" # if empty array given then use uniform distributions if len(a) == 0: @@ -328,16 +334,36 @@ def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): if len(b) == 0: b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] - perm_a = np.argsort(x_a.reshape((-1, ))) - perm_b = np.argsort(x_b.reshape((-1, ))) - inv_perm_a = np.argsort(perm_a) - inv_perm_b = np.argsort(perm_b) - - G_sorted, cost = emd_1d_sorted(a, b, x_a[perm_a], x_b[perm_b], - metric=metric) - G = G_sorted[inv_perm_a, :][:, inv_perm_b] + x_a_1d = x_a.reshape((-1, )) + x_b_1d = x_b.reshape((-1, )) + perm_a = np.argsort(x_a_1d) + perm_b = np.argsort(x_b_1d) + + G_sorted, indices, cost = emd_1d_sorted(a, b, + x_a_1d[perm_a], x_b_1d[perm_b], + metric=metric) + G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), + shape=(a.shape[0], b.shape[0])) + if dense: + G = G.todense() if log: log = {} log['cost'] = cost return G, log - return G \ No newline at end of file + return G + + +def emd2_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): + """Solves the Earth Movers distance problem between 1d measures and returns + the loss + + """ + # If we do not return G (log==False), then we should not to cast it to dense + # (useless overhead) + G, log_emd = emd_1d(a=a, b=b, x_a=x_a, x_b=x_b, metric=metric, + dense=dense and log, log=True) + cost = log_emd['cost'] + if log: + log_emd = {'G': G} + return cost, log_emd + return cost \ No newline at end of file -- cgit v1.2.3 From 18502d6861a4977cbade957f2e48eeb8dbb55414 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 11:21:08 +0200 Subject: Sparse G matrix for EMD1d + standard metrics computed without cdist --- ot/__init__.py | 4 ++-- ot/lp/emd_wrap.pyx | 29 +++++++++++++++++++++-------- test/test_ot.py | 23 ++++++++++++++++++----- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/ot/__init__.py b/ot/__init__.py index 5d5b700..f0e526c 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . import stochastic # OT functions -from .lp import emd, emd2, emd_1d +from .lp import emd, emd2, emd_1d, emd2_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -32,5 +32,5 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', - 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', + 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'emd_1d', 'emd2_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 2966206..ab88d7f 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -101,8 +101,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod @cython.wraparound(False) def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v_weights, - np.ndarray[double, ndim=2, mode="c"] u, - np.ndarray[double, ndim=2, mode="c"] v, + np.ndarray[double, ndim=1, mode="c"] u, + np.ndarray[double, ndim=1, mode="c"] v, str metric='sqeuclidean'): r""" Roro's stuff @@ -118,21 +118,34 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, cdef double m_ij = 0. - cdef np.ndarray[double, ndim=2, mode="c"] G = np.zeros((n, m), + cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ), dtype=np.float64) + cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2), + dtype=np.int) + cdef int cur_idx = 0 while i < n and j < m: - m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), - metric=metric)[0, 0] + if metric == 'sqeuclidean': + m_ij = (u[i] - v[j]) ** 2 + elif metric == 'cityblock' or metric == 'euclidean': + m_ij = np.abs(u[i] - v[j]) + else: + m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), + metric=metric)[0, 0] if w_i < w_j or j == m - 1: cost += m_ij * w_i - G[i, j] = w_i + G[cur_idx] = w_i + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j i += 1 w_j -= w_i w_i = u_weights[i] else: cost += m_ij * w_j - G[i, j] = w_j + G[cur_idx] = w_j + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j j += 1 w_i -= w_j w_j = v_weights[j] - return G, cost \ No newline at end of file + cur_idx += 1 + return G[:cur_idx], indices[:cur_idx], cost diff --git a/test/test_ot.py b/test/test_ot.py index 7008002..2a2e0a5 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -7,6 +7,7 @@ import warnings import numpy as np +from scipy.stats import wasserstein_distance import ot from ot.datasets import make_1D_gauss as gauss @@ -37,7 +38,7 @@ def test_emd_emd2(): # check G is identity np.testing.assert_allclose(G, np.eye(n) / n) - # check constratints + # check constraints np.testing.assert_allclose(u, G.sum(1)) # cf convergence sinkhorn np.testing.assert_allclose(u, G.sum(0)) # cf convergence sinkhorn @@ -46,12 +47,13 @@ def test_emd_emd2(): np.testing.assert_allclose(w, 0) -def test_emd1d(): +def test_emd_1d_emd2_1d(): # test emd1d gives similar results as emd n = 20 m = 30 - u = np.random.randn(n, 1) - v = np.random.randn(m, 1) + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) M = ot.dist(u, v, metric='sqeuclidean') @@ -59,9 +61,20 @@ def test_emd1d(): wass = log["cost"] G_1d, log = ot.emd_1d([], [], u, v, metric='sqeuclidean', log=True) wass1d = log["cost"] + wass1d_emd2 = ot.emd2_1d([], [], u, v, metric='sqeuclidean', log=False) + wass1d_euc = ot.emd2_1d([], [], u, v, metric='euclidean', log=False) # check loss is similar np.testing.assert_allclose(wass, wass1d) + np.testing.assert_allclose(wass, wass1d_emd2) + + # check loss is similar to scipy's implementation for Euclidean metric + wass_sp = wasserstein_distance(u.reshape((-1, )), v.reshape((-1, ))) + np.testing.assert_allclose(wass_sp, wass1d_euc) + + # check constraints + np.testing.assert_allclose(np.ones((n, )) / n, G.sum(1)) + np.testing.assert_allclose(np.ones((m, )) / m, G.sum(0)) # check G is similar np.testing.assert_allclose(G, G_1d) @@ -86,7 +99,7 @@ def test_emd_empty(): # check G is identity np.testing.assert_allclose(G, np.eye(n) / n) - # check constratints + # check constraints np.testing.assert_allclose(u, G.sum(1)) # cf convergence sinkhorn np.testing.assert_allclose(u, G.sum(0)) # cf convergence sinkhorn -- cgit v1.2.3 From 67d3bd4bf0f593aa611d6bf09bbd3a9c883299ba Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 13:37:14 +0200 Subject: Removed np.abs in Cython code --- ot/lp/emd_wrap.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index ab88d7f..5e055fb 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -127,7 +127,7 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, if metric == 'sqeuclidean': m_ij = (u[i] - v[j]) ** 2 elif metric == 'cityblock' or metric == 'euclidean': - m_ij = np.abs(u[i] - v[j]) + m_ij = abs(u[i] - v[j]) else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] -- cgit v1.2.3 From 9e1d74f44473deb1f4766329bb0d1c8af4dfdd73 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 18:27:42 +0200 Subject: Started documenting --- ot/lp/__init__.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- test/test_ot.py | 8 +++--- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index decff29..e9635a1 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -313,10 +313,83 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): +def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix + + .. math:: + \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) + + s.t. \gamma 1 = a + \gamma^T 1= b + \gamma\geq 0 + where : + + - d is the metric + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm proposed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source histogram (uniform weight if empty list) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target histogram (uniform weight if empty list) + a : (ns,) ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) ndarray, float64 + Target histogram (uniform weight if empty list) + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. + Due to implementation details, this function runs faster when + dense is set to False. + metric: str, optional (default='sqeuclidean') + Metric to be used. Has to be a string. + Due to implementation details, this function runs faster when + `'sqeuclidean'` or `'euclidean'` metrics are used. + log: boolean, optional (default=False) + If True, returns a dictionary containing the cost. + Otherwise returns only the optimal transportation matrix. + + Returns + ------- + gamma: (ns, nt) ndarray + Optimal transportation matrix for the given parameters + log: dict + If input log is True, a dictionary containing the cost + + + Examples + -------- + + Simple example with obvious solution. The function emd_1d accepts lists and + perform automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [0., 2.] + >>> x_b = [0., 3.] + >>> ot.emd_1d(a, b, x_a, x_b) + array([[ 0.5, 0. ], + [ 0. , 0.5]]) + + References + ---------- + + .. [1] TODO + + See Also + -------- + ot.lp.emd : EMD for multidimensional distributions + ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the + transportation matrix) + """ a = np.asarray(a, dtype=np.float64) b = np.asarray(b, dtype=np.float64) @@ -353,7 +426,7 @@ def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): return G -def emd2_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): +def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss diff --git a/test/test_ot.py b/test/test_ot.py index 2a2e0a5..6d6ea26 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -59,10 +59,10 @@ def test_emd_1d_emd2_1d(): G, log = ot.emd([], [], M, log=True) wass = log["cost"] - G_1d, log = ot.emd_1d([], [], u, v, metric='sqeuclidean', log=True) + G_1d, log = ot.emd_1d(u, v, [], [], metric='sqeuclidean', log=True) wass1d = log["cost"] - wass1d_emd2 = ot.emd2_1d([], [], u, v, metric='sqeuclidean', log=False) - wass1d_euc = ot.emd2_1d([], [], u, v, metric='euclidean', log=False) + wass1d_emd2 = ot.emd2_1d(u, v, [], [], metric='sqeuclidean', log=False) + wass1d_euc = ot.emd2_1d(u, v, [], [], metric='euclidean', log=False) # check loss is similar np.testing.assert_allclose(wass, wass1d) @@ -82,7 +82,7 @@ def test_emd_1d_emd2_1d(): # check AssertionError is raised if called on non 1d arrays u = np.random.randn(n, 2) v = np.random.randn(m, 2) - np.testing.assert_raises(AssertionError, ot.emd_1d, [], [], u, v) + np.testing.assert_raises(AssertionError, ot.emd_1d, u, v, [], []) def test_emd_empty(): -- cgit v1.2.3 From 71f9b5adfb8d8f4481948391f22e49f45494d071 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Mon, 24 Jun 2019 09:17:54 +0200 Subject: Added docstrings --- ot/lp/__init__.py | 114 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 22 deletions(-) diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index e9635a1..a350d60 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -321,8 +321,8 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): .. math:: \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) - s.t. \gamma 1 = a - \gamma^T 1= b + s.t. \gamma 1 = a, + \gamma^T 1= b, \gamma\geq 0 where : @@ -330,28 +330,27 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): - x_a and x_b are the samples - a and b are the sample weights - Uses the algorithm proposed in [1]_ + Uses the algorithm detailed in [1]_ Parameters ---------- x_a : (ns,) or (ns, 1) ndarray, float64 - Source histogram (uniform weight if empty list) + Source dirac locations (on the real line) x_b : (nt,) or (ns, 1) ndarray, float64 - Target histogram (uniform weight if empty list) + Target dirac locations (on the real line) a : (ns,) ndarray, float64 Source histogram (uniform weight if empty list) b : (nt,) ndarray, float64 Target histogram (uniform weight if empty list) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in ... are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` - format. - Due to implementation details, this function runs faster when + format. Due to implementation details, this function runs faster when dense is set to False. - metric: str, optional (default='sqeuclidean') - Metric to be used. Has to be a string. - Due to implementation details, this function runs faster when - `'sqeuclidean'` or `'euclidean'` metrics are used. log: boolean, optional (default=False) If True, returns a dictionary containing the cost. Otherwise returns only the optimal transportation matrix. @@ -368,28 +367,28 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): -------- Simple example with obvious solution. The function emd_1d accepts lists and - perform automatic conversion to numpy arrays + performs automatic conversion to numpy arrays >>> import ot >>> a=[.5, .5] >>> b=[.5, .5] - >>> x_a = [0., 2.] + >>> x_a = [2., 0.] >>> x_b = [0., 3.] - >>> ot.emd_1d(a, b, x_a, x_b) - array([[ 0.5, 0. ], - [ 0. , 0.5]]) + >>> ot.emd_1d(x_a, x_b, a, b) + array([[0. , 0.5], + [0.5, 0. ]]) References ---------- - .. [1] TODO + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. See Also -------- ot.lp.emd : EMD for multidimensional distributions ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the transportation matrix) - """ a = np.asarray(a, dtype=np.float64) b = np.asarray(b, dtype=np.float64) @@ -418,10 +417,9 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), shape=(a.shape[0], b.shape[0])) if dense: - G = G.todense() + G = G.toarray() if log: - log = {} - log['cost'] = cost + log = {'cost': cost} return G, log return G @@ -430,10 +428,82 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss + + .. math:: + \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - d is the metric + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) ndarray, float64 + Target histogram (uniform weight if empty list) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in ... are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Only used if log is set to True. Due to implementation details, + this function runs faster when dense is set to False. + log: boolean, optional (default=False) + If True, returns a dictionary containing the transportation matrix. + Otherwise returns only the loss. + + Returns + ------- + loss: float + Cost associated to the optimal transportation + log: dict + If input log is True, a dictionary containing the Optimal transportation + matrix for the given parameters + + + Examples + -------- + + Simple example with obvious solution. The function emd2_1d accepts lists and + performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.emd2_1d(x_a, x_b, a, b) + 0.5 + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd2 : EMD for multidimensional distributions + ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix + instead of the cost) """ # If we do not return G (log==False), then we should not to cast it to dense # (useless overhead) - G, log_emd = emd_1d(a=a, b=b, x_a=x_a, x_b=x_b, metric=metric, + G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, dense=dense and log, log=True) cost = log_emd['cost'] if log: -- cgit v1.2.3 From 77452dd92f607c3f18a6420cb8cd09fa5cd905a6 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Mon, 24 Jun 2019 13:09:36 +0200 Subject: Added more docstrings (Cython) + fixed link to ot.dist doc --- ot/lp/__init__.py | 4 ++-- ot/lp/emd_wrap.pyx | 28 +++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index a350d60..645ed8b 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -343,7 +343,7 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): b : (nt,) ndarray, float64 Target histogram (uniform weight if empty list) metric: str, optional (default='sqeuclidean') - Metric to be used. Only strings listed in ... are accepted. + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. dense: boolean, optional (default=True) @@ -454,7 +454,7 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): b : (nt,) ndarray, float64 Target histogram (uniform weight if empty list) metric: str, optional (default='sqeuclidean') - Metric to be used. Only strings listed in ... are accepted. + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. dense: boolean, optional (default=True) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 5e055fb..2825ba2 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -105,7 +105,33 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v, str metric='sqeuclidean'): r""" - Roro's stuff + Solves the Earth Movers distance problem between sorted 1d measures and + returns the OT matrix and the associated cost + + Parameters + ---------- + u_weights : (ns,) ndarray, float64 + Source histogram + v_weights : (nt,) ndarray, float64 + Target histogram + u : (ns,) ndarray, float64 + Source dirac locations (on the real line) + v : (nt,) ndarray, float64 + Target dirac locations (on the real line) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + + Returns + ------- + gamma: (n, ) ndarray, float64 + Values in the Optimal transportation matrix + indices: (n, 2) ndarray, int64 + Indices of the values stored in gamma for the Optimal transportation + matrix + cost + cost associated to the optimal transportation """ cdef double cost = 0. cdef int n = u_weights.shape[0] -- cgit v1.2.3 From 0a039eb07a3ca9ae3c5635cca1719428f62bf67d Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Mon, 24 Jun 2019 13:15:38 +0200 Subject: Made weight vectors optional to match scipy's wass1d API --- ot/lp/__init__.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 645ed8b..bf218d3 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -313,7 +313,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): +def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix @@ -338,10 +338,10 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): Source dirac locations (on the real line) x_b : (nt,) or (ns, 1) ndarray, float64 Target dirac locations (on the real line) - a : (ns,) ndarray, float64 - Source histogram (uniform weight if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weight if empty list) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when @@ -375,6 +375,9 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): >>> x_a = [2., 0.] >>> x_b = [0., 3.] >>> ot.emd_1d(x_a, x_b, a, b) + array([[0. , 0.5], + [0.5, 0. ]]) + >>> ot.emd_1d(x_a, x_b) array([[0. , 0.5], [0.5, 0. ]]) @@ -401,9 +404,9 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): "emd_1d should only be used with monodimensional data" # if empty array given then use uniform distributions - if len(a) == 0: + if a.ndim == 0 or len(a) == 0: a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0] - if len(b) == 0: + if b.ndim == 0 or len(b) == 0: b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] x_a_1d = x_a.reshape((-1, )) @@ -424,7 +427,7 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): return G -def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): +def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss @@ -449,10 +452,10 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): Source dirac locations (on the real line) x_b : (nt,) or (ns, 1) ndarray, float64 Target dirac locations (on the real line) - a : (ns,) ndarray, float64 - Source histogram (uniform weight if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weight if empty list) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when @@ -488,6 +491,8 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): >>> x_b = [0., 3.] >>> ot.emd2_1d(x_a, x_b, a, b) 0.5 + >>> ot.emd2_1d(x_a, x_b) + 0.5 References ---------- -- cgit v1.2.3 From 1140141938c678d267f688dbb9106d3422d633c5 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 10:04:35 +0200 Subject: Added minkowski variants and wasserstein_1d functions --- ot/lp/__init__.py | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++--- ot/lp/emd_wrap.pyx | 10 ++- 2 files changed, 203 insertions(+), 10 deletions(-) diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index bf218d3..719032b 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -21,7 +21,7 @@ from .cvx import barycenter from ..utils import dist __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', - 'emd_1d', 'emd2_1d'] + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d'] def emd(a, b, M, numItermax=100000, log=False): @@ -313,7 +313,8 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): +def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, + log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix @@ -330,6 +331,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False - x_a and x_b are the samples - a and b are the sample weights + When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`. + Uses the algorithm detailed in [1]_ Parameters @@ -346,11 +349,14 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` format. Due to implementation details, this function runs faster when - dense is set to False. + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. log: boolean, optional (default=False) If True, returns a dictionary containing the cost. Otherwise returns only the optimal transportation matrix. @@ -416,7 +422,7 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False G_sorted, indices, cost = emd_1d_sorted(a, b, x_a_1d[perm_a], x_b_1d[perm_b], - metric=metric) + metric=metric, p=p) G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), shape=(a.shape[0], b.shape[0])) if dense: @@ -427,7 +433,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False return G -def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): +def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, + log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss @@ -444,6 +451,8 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals - x_a and x_b are the samples - a and b are the sample weights + When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`. + Uses the algorithm detailed in [1]_ Parameters @@ -459,7 +468,10 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when - `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` @@ -508,10 +520,185 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals """ # If we do not return G (log==False), then we should not to cast it to dense # (useless overhead) - G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, + G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p, dense=dense and log, log=True) cost = log_emd['cost'] if log: log_emd = {'G': G} return cost, log_emd - return cost \ No newline at end of file + return cost + + +def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): + """Solves the Wasserstein distance problem between 1d measures and returns + the OT matrix + + + .. math:: + \gamma = arg\min_\gamma \left(\sum_i \sum_j \gamma_{ij} + |x_a[i] - x_b[j]|^p \right)^{1/p} + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) + p: float, optional (default=1.0) + The order of the p-Wasserstein distance to be computed + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + log: boolean, optional (default=False) + If True, returns a dictionary containing the cost. + Otherwise returns only the optimal transportation matrix. + + Returns + ------- + gamma: (ns, nt) ndarray + Optimal transportation matrix for the given parameters + log: dict + If input log is True, a dictionary containing the cost + + + Examples + -------- + + Simple example with obvious solution. The function wasserstein_1d accepts + lists and performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.wasserstein_1d(x_a, x_b, a, b) + array([[0. , 0.5], + [0.5, 0. ]]) + >>> ot.wasserstein_1d(x_a, x_b) + array([[0. , 0.5], + [0.5, 0. ]]) + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd_1d : EMD for 1d distributions + ot.lp.wasserstein2_1d : Wasserstein for 1d distributions (returns the cost + instead of the transportation matrix) + """ + if log: + G, log = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log) + log['cost'] = np.power(log['cost'], 1. / p) + return G, log + return emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log) + + +def wasserstein2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., + dense=True, log=False): + """Solves the Wasserstein distance problem between 1d measures and returns + the loss + + + .. math:: + \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} + |x_a[i] - x_b[j]|^p \right)^{1/p} + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) + p: float, optional (default=1.0) + The order of the p-Wasserstein distance to be computed + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Only used if log is set to True. Due to implementation details, + this function runs faster when dense is set to False. + log: boolean, optional (default=False) + If True, returns a dictionary containing the transportation matrix. + Otherwise returns only the loss. + + Returns + ------- + loss: float + Cost associated to the optimal transportation + log: dict + If input log is True, a dictionary containing the Optimal transportation + matrix for the given parameters + + + Examples + -------- + + Simple example with obvious solution. The function wasserstein2_1d accepts + lists and performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.wasserstein2_1d(x_a, x_b, a, b) + 0.5 + >>> ot.wasserstein2_1d(x_a, x_b) + 0.5 + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd2_1d : EMD for 1d distributions + ot.lp.wasserstein_1d : Wasserstein for 1d distributions (returns the + transportation matrix instead of the cost) + """ + if log: + cost, log = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log) + cost = np.power(cost, 1. / p) + return cost, log + return np.power(emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log), 1. / p) \ No newline at end of file diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 2825ba2..7134136 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -103,7 +103,8 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v_weights, np.ndarray[double, ndim=1, mode="c"] u, np.ndarray[double, ndim=1, mode="c"] v, - str metric='sqeuclidean'): + str metric='sqeuclidean', + double p=1.): r""" Solves the Earth Movers distance problem between sorted 1d measures and returns the OT matrix and the associated cost @@ -121,7 +122,10 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when - `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' Returns ------- @@ -154,6 +158,8 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, m_ij = (u[i] - v[j]) ** 2 elif metric == 'cityblock' or metric == 'euclidean': m_ij = abs(u[i] - v[j]) + elif metric == 'minkowski': + m_ij = abs(u[i] - v[j]) ** p else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] -- cgit v1.2.3 From 0d333e004636f5d25edea6bb195e8e4d9a95ba98 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 10:23:32 +0200 Subject: Improved tests and docs for wasserstein_1d --- ot/__init__.py | 5 +++-- ot/lp/__init__.py | 13 ++++++------- ot/lp/emd_wrap.pyx | 3 ++- test/test_ot.py | 23 +++++++++++++++++++++++ 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/ot/__init__.py b/ot/__init__.py index f0e526c..5bd9bb3 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . import stochastic # OT functions -from .lp import emd, emd2, emd_1d, emd2_1d +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d, wasserstein2_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -32,5 +32,6 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', - 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'emd_1d', 'emd2_1d', + 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 719032b..76c9ec0 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -530,13 +530,13 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): - """Solves the Wasserstein distance problem between 1d measures and returns + """Solves the p-Wasserstein distance problem between 1d measures and returns the OT matrix .. math:: - \gamma = arg\min_\gamma \left(\sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \right)^{1/p} + \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} + |x_a[i] - x_b[j]|^p \\right)^{1/p} s.t. \gamma 1 = a, \gamma^T 1= b, @@ -617,15 +617,14 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): dense=dense, log=log) -def wasserstein2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., - dense=True, log=False): - """Solves the Wasserstein distance problem between 1d measures and returns +def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): + """Solves the p-Wasserstein distance problem between 1d measures and returns the loss .. math:: \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \right)^{1/p} + |x_a[i] - x_b[j]|^p \\right)^{1/p} s.t. \gamma 1 = a, \gamma^T 1= b, diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 7134136..42b848f 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -13,6 +13,7 @@ cimport numpy as np from ..utils import dist cimport cython +cimport libc.math as math import warnings @@ -159,7 +160,7 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, elif metric == 'cityblock' or metric == 'euclidean': m_ij = abs(u[i] - v[j]) elif metric == 'minkowski': - m_ij = abs(u[i] - v[j]) ** p + m_ij = math.pow(abs(u[i] - v[j]), p) else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] diff --git a/test/test_ot.py b/test/test_ot.py index 6d6ea26..48423e7 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -85,6 +85,29 @@ def test_emd_1d_emd2_1d(): np.testing.assert_raises(AssertionError, ot.emd_1d, u, v, [], []) +def test_wass_1d(): + # test emd1d gives similar results as emd + n = 20 + m = 30 + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) + + M = ot.dist(u, v, metric='sqeuclidean') + + G, log = ot.emd([], [], M, log=True) + wass = log["cost"] + + G_1d, log = ot.wasserstein_1d(u, v, [], [], p=2., log=True) + wass1d = log["cost"] + + # check loss is similar + np.testing.assert_allclose(np.sqrt(wass), wass1d) + + # check G is similar + np.testing.assert_allclose(G, G_1d) + + def test_emd_empty(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From c92e595009ad5e2ae6d4b2c040556cffb6316847 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 11:08:15 +0200 Subject: Wasserstein defined as the cost itself (do not return transportation matrix) --- ot/__init__.py | 4 +- ot/lp/__init__.py | 125 +++++------------------------------------------------- test/test_ot.py | 6 +-- 3 files changed, 13 insertions(+), 122 deletions(-) diff --git a/ot/__init__.py b/ot/__init__.py index 730aa4f..1b3c2fb 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -23,7 +23,7 @@ from . import stochastic from . import unbalanced # OT functions -from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d, wasserstein2_1d +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .unbalanced import sinkhorn_unbalanced, barycenter_unbalanced from .da import sinkhorn_lpl1_mm @@ -35,6 +35,6 @@ __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', - 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d', + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim', 'sinkhorn_unbalanced', "barycenter_unbalanced"] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 76c9ec0..a3f5b8d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -21,7 +21,7 @@ from .cvx import barycenter from ..utils import dist __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', - 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d'] + 'emd_1d', 'emd2_1d', 'wasserstein_1d'] def emd(a, b, M, numItermax=100000, log=False): @@ -529,9 +529,9 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, return cost -def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): +def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.): """Solves the p-Wasserstein distance problem between 1d measures and returns - the OT matrix + the distance .. math:: @@ -560,22 +560,11 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): Target histogram (default is uniform weight) p: float, optional (default=1.0) The order of the p-Wasserstein distance to be computed - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. Due to implementation details, this function runs faster when - `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics - are used. - log: boolean, optional (default=False) - If True, returns a dictionary containing the cost. - Otherwise returns only the optimal transportation matrix. Returns ------- - gamma: (ns, nt) ndarray - Optimal transportation matrix for the given parameters - log: dict - If input log is True, a dictionary containing the cost + dist: float + p-Wasserstein distance Examples @@ -590,96 +579,8 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): >>> x_a = [2., 0.] >>> x_b = [0., 3.] >>> ot.wasserstein_1d(x_a, x_b, a, b) - array([[0. , 0.5], - [0.5, 0. ]]) - >>> ot.wasserstein_1d(x_a, x_b) - array([[0. , 0.5], - [0.5, 0. ]]) - - References - ---------- - - .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal - Transport", 2018. - - See Also - -------- - ot.lp.emd_1d : EMD for 1d distributions - ot.lp.wasserstein2_1d : Wasserstein for 1d distributions (returns the cost - instead of the transportation matrix) - """ - if log: - G, log = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - log['cost'] = np.power(log['cost'], 1. / p) - return G, log - return emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - - -def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): - """Solves the p-Wasserstein distance problem between 1d measures and returns - the loss - - - .. math:: - \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \\right)^{1/p} - - s.t. \gamma 1 = a, - \gamma^T 1= b, - \gamma\geq 0 - where : - - - x_a and x_b are the samples - - a and b are the sample weights - - Uses the algorithm detailed in [1]_ - - Parameters - ---------- - x_a : (ns,) or (ns, 1) ndarray, float64 - Source dirac locations (on the real line) - x_b : (nt,) or (ns, 1) ndarray, float64 - Target dirac locations (on the real line) - a : (ns,) ndarray, float64, optional - Source histogram (default is uniform weight) - b : (nt,) ndarray, float64, optional - Target histogram (default is uniform weight) - p: float, optional (default=1.0) - The order of the p-Wasserstein distance to be computed - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. Only used if log is set to True. Due to implementation details, - this function runs faster when dense is set to False. - log: boolean, optional (default=False) - If True, returns a dictionary containing the transportation matrix. - Otherwise returns only the loss. - - Returns - ------- - loss: float - Cost associated to the optimal transportation - log: dict - If input log is True, a dictionary containing the Optimal transportation - matrix for the given parameters - - - Examples - -------- - - Simple example with obvious solution. The function wasserstein2_1d accepts - lists and performs automatic conversion to numpy arrays - - >>> import ot - >>> a=[.5, .5] - >>> b=[.5, .5] - >>> x_a = [2., 0.] - >>> x_b = [0., 3.] - >>> ot.wasserstein2_1d(x_a, x_b, a, b) 0.5 - >>> ot.wasserstein2_1d(x_a, x_b) + >>> ot.wasserstein_1d(x_a, x_b) 0.5 References @@ -690,14 +591,8 @@ def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): See Also -------- - ot.lp.emd2_1d : EMD for 1d distributions - ot.lp.wasserstein_1d : Wasserstein for 1d distributions (returns the - transportation matrix instead of the cost) + ot.lp.emd_1d : EMD for 1d distributions """ - if log: - cost, log = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - cost = np.power(cost, 1. / p) - return cost, log - return np.power(emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log), 1. / p) \ No newline at end of file + cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=False, log=False) + return np.power(cost_emd, 1. / p) diff --git a/test/test_ot.py b/test/test_ot.py index 48423e7..3c4ac11 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -98,15 +98,11 @@ def test_wass_1d(): G, log = ot.emd([], [], M, log=True) wass = log["cost"] - G_1d, log = ot.wasserstein_1d(u, v, [], [], p=2., log=True) - wass1d = log["cost"] + wass1d = ot.wasserstein_1d(u, v, [], [], p=2.) # check loss is similar np.testing.assert_allclose(np.sqrt(wass), wass1d) - # check G is similar - np.testing.assert_allclose(G, G_1d) - def test_emd_empty(): # test emd and emd2 for simple identity -- cgit v1.2.3 From 362a7f8fa20cf7ae6f2e36d7e47c7ca9f81d3c51 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 13:29:19 +0200 Subject: Added RT as a contributor + "optimized" Cython math operations --- README.md | 1 + ot/lp/emd_wrap.pyx | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d24d8b9..84148f8 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,7 @@ The contributors to this library are: * [Alain Rakotomamonjy](https://sites.google.com/site/alainrakotomamonjy/home) * [Vayer Titouan](https://tvayer.github.io/) * [Hicham Janati](https://hichamjanati.github.io/) (Unbalanced OT) +* [Romain Tavenard](https://rtavenar.github.io/) (1d Wasserstein) This toolbox benefit a lot from open source research and we would like to thank the following persons for providing some code (in various languages): diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 42b848f..8a4aec9 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -156,11 +156,11 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, cdef int cur_idx = 0 while i < n and j < m: if metric == 'sqeuclidean': - m_ij = (u[i] - v[j]) ** 2 + m_ij = (u[i] - v[j]) * (u[i] - v[j]) elif metric == 'cityblock' or metric == 'euclidean': - m_ij = abs(u[i] - v[j]) + m_ij = math.fabs(u[i] - v[j]) elif metric == 'minkowski': - m_ij = math.pow(abs(u[i] - v[j]), p) + m_ij = math.pow(math.fabs(u[i] - v[j]), p) else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] -- cgit v1.2.3