From 1171f7e39742c207dad6ab5fd15f59ed62f8f4a5 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Thu, 6 Jun 2019 17:22:05 +0200 Subject: start documentation ot --- ot/lp/__init__.py | 47 +++++++++++++++++++++++++++++------------------ ot/lp/emd_wrap.pyx | 11 +++++++---- 2 files changed, 36 insertions(+), 22 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 02cbd8c..ed6fa52 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- """ Solvers for the original linear program OT problem + + + """ # Author: Remi Flamary @@ -37,26 +40,30 @@ def emd(a, b, M, numItermax=100000, log=False): - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order numpy.array in float64 + format. + Uses the algorithm proposed in [1]_ Parameters ---------- - a : (ns,) ndarray, float64 - Source histogram (uniform weigth if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weigth if empty list) - M : (ns,nt) ndarray, float64 - loss matrix + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) numItermax : int, optional (default=100000) The maximum number of iterations before stopping the optimization algorithm if it has not converged. - log: boolean, optional (default=False) + log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. Returns ------- - gamma: (ns x nt) ndarray + gamma: (ns x nt) numpy.ndarray Optimal transportation matrix for the given parameters log: dict If input log is true, a dictionary containing the cost and dual @@ -128,16 +135,20 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order numpy.array in float64 + format. + Uses the algorithm proposed in [1]_ Parameters ---------- - a : (ns,) ndarray, float64 - Source histogram (uniform weigth if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weigth if empty list) - M : (ns,nt) ndarray, float64 - loss matrix + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) numItermax : int, optional (default=100000) The maximum number of iterations before stopping the optimization algorithm if it has not converged. 
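As a usage note for the warning added above: the cost matrix must reach the solver as a C-order float64 array, which `ot.dist` already produces; for matrices built by hand (or obtained by transposing/slicing), `np.ascontiguousarray` is the generic fix. A minimal sketch, with the sizes and uniform weights chosen only for illustration:

```python
import numpy as np
import ot

x_s = np.random.randn(5, 2)
x_t = np.random.randn(6, 2)

# ot.dist returns a C-order float64 array, as required by emd/emd2
M = ot.dist(x_s, x_t, metric='sqeuclidean')

# a transposed or sliced matrix is generally no longer C-contiguous;
# np.ascontiguousarray restores the layout the solver expects
M_back = np.ascontiguousarray(M.T, dtype=np.float64)

G = ot.emd(ot.unif(5), ot.unif(6), M)  # optimal plan, shape (5, 6)
```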
@@ -151,7 +162,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), ------- gamma: (ns x nt) ndarray Optimal transportation matrix for the given parameters - log: dict + log: dictnp If input log is true, a dictionary containing the cost and dual variables and exit status @@ -231,9 +242,9 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None Parameters ---------- - measures_locations : list of (k_i,d) np.ndarray + measures_locations : list of (k_i,d) numpy.ndarray The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list) - measures_weights : list of (k_i,) np.ndarray + measures_weights : list of (k_i,) numpy.ndarray Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure X_init : (k,d) np.ndarray @@ -246,7 +257,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None numItermax : int, optional Max number of iterations stopThr : float, optional - Stop threshol on error (>0) + Stop threshold on error (>0) verbose : bool, optional Print information along iterations log : bool, optional diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 83ee6aa..edb5f7c 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -55,13 +55,16 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod - M is the metric cost matrix - a and b are the sample weights + .. warning:: + Note that the M matrix needs to be a C-order :py.cls:`numpy.array` + Parameters ---------- - a : (ns,) ndarray, float64 + a : (ns,) numpy.ndarray, float64 source histogram - b : (nt,) ndarray, float64 + b : (nt,) numpy.ndarray, float64 target histogram - M : (ns,nt) ndarray, float64 + M : (ns,nt) numpy.ndarray, float64 loss matrix max_iter : int The maximum number of iterations before stopping the optimization @@ -70,7 +73,7 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod Returns ------- - gamma: (ns x nt) ndarray + gamma: (ns x nt) numpy.ndarray Optimal transportation matrix for the given parameters """ -- cgit v1.2.3 From f63f34f8adb6943b6410f8b773b4b4d8f1c7b4ba Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 20 Jun 2019 14:29:56 +0200 Subject: EMD 1d without doc --- ot/__init__.py | 4 ++-- ot/lp/__init__.py | 43 ++++++++++++++++++++++++++++++++++++++----- ot/lp/emd_wrap.pyx | 35 +++++++++++++++++++++++++++++++++++ test/test_ot.py | 26 ++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 7 deletions(-) (limited to 'ot/lp') diff --git a/ot/__init__.py b/ot/__init__.py index b74b924..5d5b700 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . 
import stochastic # OT functions -from .lp import emd, emd2 +from .lp import emd, emd2, emd_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -31,6 +31,6 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" -__all__ = ["emd", "emd2", "sinkhorn", "sinkhorn2", "utils", 'datasets', +__all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 02cbd8c..49ded5b 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -14,12 +14,12 @@ import numpy as np from .import cvx # import compiled emd -from .emd_wrap import emd_c, check_result +from .emd_wrap import emd_c, check_result, emd_1d_sorted from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx'] +__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d_sorted'] def emd(a, b, M, numItermax=100000, log=False): @@ -94,7 +94,7 @@ def emd(a, b, M, numItermax=100000, log=False): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use unifor distributions + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] if len(b) == 0: @@ -187,7 +187,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use unifor distributions + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] if len(b) == 0: @@ -308,4 +308,37 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None log_dict['displacement_square_norms'] = displacement_square_norms return X, log_dict else: - return X \ No newline at end of file + return X + + +def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): + """Solves the Earth Movers distance problem between 1d measures and returns + the OT matrix + + """ + assert x_a.shape[1] == x_b.shape[1] == 1, "emd_1d should only be used " + \ + "with monodimensional data" + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + + # if empty array given then use uniform distributions + if len(a) == 0: + a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0] + if len(b) == 0: + b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] + + perm_a = np.argsort(x_a.reshape((-1, ))) + perm_b = np.argsort(x_b.reshape((-1, ))) + inv_perm_a = np.argsort(perm_a) + inv_perm_b = np.argsort(perm_b) + + M = dist(x_a[perm_a], x_b[perm_b], metric=metric) + + G_sorted, cost = emd_1d_sorted(a, b, M) + G = G_sorted[inv_perm_a, :][:, inv_perm_b] + if log: + log = {} + log['cost'] = cost + return G, log + return G \ No newline at end of file diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 83ee6aa..a3d189d 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -93,3 +93,38 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod cdef int result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) return G, cost, alpha, beta, result_code + + +@cython.boundscheck(False) +@cython.wraparound(False) +def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, + 
np.ndarray[double, ndim=1, mode="c"] v_weights, + np.ndarray[double, ndim=2, mode="c"] M): + r""" + Roro's stuff + """ + cdef double cost = 0. + cdef int n = u_weights.shape[0] + cdef int m = v_weights.shape[0] + + cdef int i = 0 + cdef double w_i = u_weights[0] + cdef int j = 0 + cdef double w_j = v_weights[0] + + cdef np.ndarray[double, ndim=2, mode="c"] G = np.zeros((n, m), + dtype=np.float64) + while i < n and j < m: + if w_i < w_j or j == m - 1: + cost += M[i, j] * w_i + G[i, j] = w_i + i += 1 + w_j -= w_i + w_i = u_weights[i] + else: + cost += M[i, j] * w_j + G[i, j] = w_j + j += 1 + w_i -= w_j + w_j = v_weights[j] + return G, cost \ No newline at end of file diff --git a/test/test_ot.py b/test/test_ot.py index 7652394..7008002 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -46,6 +46,32 @@ def test_emd_emd2(): np.testing.assert_allclose(w, 0) +def test_emd1d(): + # test emd1d gives similar results as emd + n = 20 + m = 30 + u = np.random.randn(n, 1) + v = np.random.randn(m, 1) + + M = ot.dist(u, v, metric='sqeuclidean') + + G, log = ot.emd([], [], M, log=True) + wass = log["cost"] + G_1d, log = ot.emd_1d([], [], u, v, metric='sqeuclidean', log=True) + wass1d = log["cost"] + + # check loss is similar + np.testing.assert_allclose(wass, wass1d) + + # check G is similar + np.testing.assert_allclose(G, G_1d) + + # check AssertionError is raised if called on non 1d arrays + u = np.random.randn(n, 2) + v = np.random.randn(m, 2) + np.testing.assert_raises(AssertionError, ot.emd_1d, [], [], u, v) + + def test_emd_empty(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From 15b21611a3a93043d30c4eaaf9d622200453a884 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 20 Jun 2019 14:52:23 +0200 Subject: EMD 1d without doc made faster --- ot/lp/__init__.py | 5 ++--- ot/lp/emd_wrap.pyx | 14 +++++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 49ded5b..c4457dc 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -333,9 +333,8 @@ def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): inv_perm_a = np.argsort(perm_a) inv_perm_b = np.argsort(perm_b) - M = dist(x_a[perm_a], x_b[perm_b], metric=metric) - - G_sorted, cost = emd_1d_sorted(a, b, M) + G_sorted, cost = emd_1d_sorted(a, b, x_a[perm_a], x_b[perm_b], + metric=metric) G = G_sorted[inv_perm_a, :][:, inv_perm_b] if log: log = {} diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index a3d189d..2966206 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -10,6 +10,8 @@ Cython linker with C solver import numpy as np cimport numpy as np +from ..utils import dist + cimport cython import warnings @@ -99,7 +101,9 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod @cython.wraparound(False) def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v_weights, - np.ndarray[double, ndim=2, mode="c"] M): + np.ndarray[double, ndim=2, mode="c"] u, + np.ndarray[double, ndim=2, mode="c"] v, + str metric='sqeuclidean'): r""" Roro's stuff """ @@ -112,17 +116,21 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, cdef int j = 0 cdef double w_j = v_weights[0] + cdef double m_ij = 0. 
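A plain-Python transcription of the greedy loop in `emd_1d_sorted` above can make the Cython easier to follow: both indices sweep the sorted supports once, always shipping the smaller of the two remaining masses. This is only a sketch, assuming sorted inputs, weights summing to one and the squared Euclidean ground cost; the explicit bounds guards replace the out-of-range reads that the boundscheck-disabled Cython tolerates.

```python
import numpy as np

def emd_1d_sorted_py(u_weights, v_weights, u, v):
    # u, v: sorted 1-d supports; u_weights, v_weights: matching histograms
    n, m = len(u_weights), len(v_weights)
    G = np.zeros((n, m))
    cost = 0.
    i, j = 0, 0
    w_i, w_j = u_weights[0], v_weights[0]
    while i < n and j < m:
        m_ij = (u[i] - v[j]) ** 2          # squared Euclidean ground cost
        if w_i < w_j or j == m - 1:
            # remaining mass at source i is the smaller one: ship it all to j
            cost += m_ij * w_i
            G[i, j] = w_i
            w_j -= w_i
            i += 1
            w_i = u_weights[i] if i < n else 0.
        else:
            # remaining capacity at target j is the smaller one: fill it from i
            cost += m_ij * w_j
            G[i, j] = w_j
            w_i -= w_j
            j += 1
            w_j = v_weights[j] if j < m else 0.
    return G, cost
```

On the two-point example used in the docstrings further down (weights [.5, .5], supports [0., 2.] and [0., 3.]) this returns the plan [[.5, 0.], [0., .5]] with cost 0.5.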
+ cdef np.ndarray[double, ndim=2, mode="c"] G = np.zeros((n, m), dtype=np.float64) while i < n and j < m: + m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), + metric=metric)[0, 0] if w_i < w_j or j == m - 1: - cost += M[i, j] * w_i + cost += m_ij * w_i G[i, j] = w_i i += 1 w_j -= w_i w_i = u_weights[i] else: - cost += M[i, j] * w_j + cost += m_ij * w_j G[i, j] = w_j j += 1 w_i -= w_j -- cgit v1.2.3 From cada9a3019997e8efb95d96c86985110f1e937b9 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 11:15:42 +0200 Subject: Sparse G matrix for EMD1d + standard metrics computed without cdist --- ot/lp/__init__.py | 54 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 14 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index c4457dc..decff29 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -10,6 +10,7 @@ Solvers for the original linear program OT problem import multiprocessing import numpy as np +from scipy.sparse import coo_matrix from .import cvx @@ -19,7 +20,8 @@ from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d_sorted'] +__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', + 'emd_1d', 'emd2_1d'] def emd(a, b, M, numItermax=100000, log=False): @@ -311,16 +313,20 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): +def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix """ - assert x_a.shape[1] == x_b.shape[1] == 1, "emd_1d should only be used " + \ - "with monodimensional data" - a = np.asarray(a, dtype=np.float64) b = np.asarray(b, dtype=np.float64) + x_a = np.asarray(x_a, dtype=np.float64) + x_b = np.asarray(x_b, dtype=np.float64) + + assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \ + "emd_1d should only be used with monodimensional data" + assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \ + "emd_1d should only be used with monodimensional data" # if empty array given then use uniform distributions if len(a) == 0: @@ -328,16 +334,36 @@ def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', log=False): if len(b) == 0: b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] - perm_a = np.argsort(x_a.reshape((-1, ))) - perm_b = np.argsort(x_b.reshape((-1, ))) - inv_perm_a = np.argsort(perm_a) - inv_perm_b = np.argsort(perm_b) - - G_sorted, cost = emd_1d_sorted(a, b, x_a[perm_a], x_b[perm_b], - metric=metric) - G = G_sorted[inv_perm_a, :][:, inv_perm_b] + x_a_1d = x_a.reshape((-1, )) + x_b_1d = x_b.reshape((-1, )) + perm_a = np.argsort(x_a_1d) + perm_b = np.argsort(x_b_1d) + + G_sorted, indices, cost = emd_1d_sorted(a, b, + x_a_1d[perm_a], x_b_1d[perm_b], + metric=metric) + G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), + shape=(a.shape[0], b.shape[0])) + if dense: + G = G.todense() if log: log = {} log['cost'] = cost return G, log - return G \ No newline at end of file + return G + + +def emd2_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): + """Solves the Earth Movers distance problem between 1d measures and returns + the loss + + """ + # If we do not return G (log==False), then we should not to cast it to dense + # (useless overhead) + G, log_emd = emd_1d(a=a, b=b, x_a=x_a, 
x_b=x_b, metric=metric, + dense=dense and log, log=True) + cost = log_emd['cost'] + if log: + log_emd = {'G': G} + return cost, log_emd + return cost \ No newline at end of file -- cgit v1.2.3 From 18502d6861a4977cbade957f2e48eeb8dbb55414 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 11:21:08 +0200 Subject: Sparse G matrix for EMD1d + standard metrics computed without cdist --- ot/__init__.py | 4 ++-- ot/lp/emd_wrap.pyx | 29 +++++++++++++++++++++-------- test/test_ot.py | 23 ++++++++++++++++++----- 3 files changed, 41 insertions(+), 15 deletions(-) (limited to 'ot/lp') diff --git a/ot/__init__.py b/ot/__init__.py index 5d5b700..f0e526c 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . import stochastic # OT functions -from .lp import emd, emd2, emd_1d +from .lp import emd, emd2, emd_1d, emd2_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -32,5 +32,5 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', - 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', + 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'emd_1d', 'emd2_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 2966206..ab88d7f 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -101,8 +101,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod @cython.wraparound(False) def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v_weights, - np.ndarray[double, ndim=2, mode="c"] u, - np.ndarray[double, ndim=2, mode="c"] v, + np.ndarray[double, ndim=1, mode="c"] u, + np.ndarray[double, ndim=1, mode="c"] v, str metric='sqeuclidean'): r""" Roro's stuff @@ -118,21 +118,34 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, cdef double m_ij = 0. 
- cdef np.ndarray[double, ndim=2, mode="c"] G = np.zeros((n, m), + cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ), dtype=np.float64) + cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2), + dtype=np.int) + cdef int cur_idx = 0 while i < n and j < m: - m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), - metric=metric)[0, 0] + if metric == 'sqeuclidean': + m_ij = (u[i] - v[j]) ** 2 + elif metric == 'cityblock' or metric == 'euclidean': + m_ij = np.abs(u[i] - v[j]) + else: + m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), + metric=metric)[0, 0] if w_i < w_j or j == m - 1: cost += m_ij * w_i - G[i, j] = w_i + G[cur_idx] = w_i + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j i += 1 w_j -= w_i w_i = u_weights[i] else: cost += m_ij * w_j - G[i, j] = w_j + G[cur_idx] = w_j + indices[cur_idx, 0] = i + indices[cur_idx, 1] = j j += 1 w_i -= w_j w_j = v_weights[j] - return G, cost \ No newline at end of file + cur_idx += 1 + return G[:cur_idx], indices[:cur_idx], cost diff --git a/test/test_ot.py b/test/test_ot.py index 7008002..2a2e0a5 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -7,6 +7,7 @@ import warnings import numpy as np +from scipy.stats import wasserstein_distance import ot from ot.datasets import make_1D_gauss as gauss @@ -37,7 +38,7 @@ def test_emd_emd2(): # check G is identity np.testing.assert_allclose(G, np.eye(n) / n) - # check constratints + # check constraints np.testing.assert_allclose(u, G.sum(1)) # cf convergence sinkhorn np.testing.assert_allclose(u, G.sum(0)) # cf convergence sinkhorn @@ -46,12 +47,13 @@ def test_emd_emd2(): np.testing.assert_allclose(w, 0) -def test_emd1d(): +def test_emd_1d_emd2_1d(): # test emd1d gives similar results as emd n = 20 m = 30 - u = np.random.randn(n, 1) - v = np.random.randn(m, 1) + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) M = ot.dist(u, v, metric='sqeuclidean') @@ -59,9 +61,20 @@ def test_emd1d(): wass = log["cost"] G_1d, log = ot.emd_1d([], [], u, v, metric='sqeuclidean', log=True) wass1d = log["cost"] + wass1d_emd2 = ot.emd2_1d([], [], u, v, metric='sqeuclidean', log=False) + wass1d_euc = ot.emd2_1d([], [], u, v, metric='euclidean', log=False) # check loss is similar np.testing.assert_allclose(wass, wass1d) + np.testing.assert_allclose(wass, wass1d_emd2) + + # check loss is similar to scipy's implementation for Euclidean metric + wass_sp = wasserstein_distance(u.reshape((-1, )), v.reshape((-1, ))) + np.testing.assert_allclose(wass_sp, wass1d_euc) + + # check constraints + np.testing.assert_allclose(np.ones((n, )) / n, G.sum(1)) + np.testing.assert_allclose(np.ones((m, )) / m, G.sum(0)) # check G is similar np.testing.assert_allclose(G, G_1d) @@ -86,7 +99,7 @@ def test_emd_empty(): # check G is identity np.testing.assert_allclose(G, np.eye(n) / n) - # check constratints + # check constraints np.testing.assert_allclose(u, G.sum(1)) # cf convergence sinkhorn np.testing.assert_allclose(u, G.sum(0)) # cf convergence sinkhorn -- cgit v1.2.3 From 67d3bd4bf0f593aa611d6bf09bbd3a9c883299ba Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 13:37:14 +0200 Subject: Removed np.abs in Cython code --- ot/lp/emd_wrap.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ot/lp') diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index ab88d7f..5e055fb 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -127,7 +127,7 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, if metric == 'sqeuclidean': 
m_ij = (u[i] - v[j]) ** 2 elif metric == 'cityblock' or metric == 'euclidean': - m_ij = np.abs(u[i] - v[j]) + m_ij = abs(u[i] - v[j]) else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] -- cgit v1.2.3 From 9e1d74f44473deb1f4766329bb0d1c8af4dfdd73 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Fri, 21 Jun 2019 18:27:42 +0200 Subject: Started documenting --- ot/lp/__init__.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- test/test_ot.py | 8 +++--- 2 files changed, 79 insertions(+), 6 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index decff29..e9635a1 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -313,10 +313,83 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): +def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix + + .. math:: + \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) + + s.t. \gamma 1 = a + \gamma^T 1= b + \gamma\geq 0 + where : + + - d is the metric + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm proposed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source histogram (uniform weight if empty list) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target histogram (uniform weight if empty list) + a : (ns,) ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) ndarray, float64 + Target histogram (uniform weight if empty list) + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. + Due to implementation details, this function runs faster when + dense is set to False. + metric: str, optional (default='sqeuclidean') + Metric to be used. Has to be a string. + Due to implementation details, this function runs faster when + `'sqeuclidean'` or `'euclidean'` metrics are used. + log: boolean, optional (default=False) + If True, returns a dictionary containing the cost. + Otherwise returns only the optimal transportation matrix. + + Returns + ------- + gamma: (ns, nt) ndarray + Optimal transportation matrix for the given parameters + log: dict + If input log is True, a dictionary containing the cost + + + Examples + -------- + + Simple example with obvious solution. The function emd_1d accepts lists and + perform automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [0., 2.] + >>> x_b = [0., 3.] + >>> ot.emd_1d(a, b, x_a, x_b) + array([[ 0.5, 0. ], + [ 0. , 0.5]]) + + References + ---------- + + .. 
[1] TODO + + See Also + -------- + ot.lp.emd : EMD for multidimensional distributions + ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the + transportation matrix) + """ a = np.asarray(a, dtype=np.float64) b = np.asarray(b, dtype=np.float64) @@ -353,7 +426,7 @@ def emd_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): return G -def emd2_1d(a, b, x_a, x_b, metric='sqeuclidean', dense=True, log=False): +def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss diff --git a/test/test_ot.py b/test/test_ot.py index 2a2e0a5..6d6ea26 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -59,10 +59,10 @@ def test_emd_1d_emd2_1d(): G, log = ot.emd([], [], M, log=True) wass = log["cost"] - G_1d, log = ot.emd_1d([], [], u, v, metric='sqeuclidean', log=True) + G_1d, log = ot.emd_1d(u, v, [], [], metric='sqeuclidean', log=True) wass1d = log["cost"] - wass1d_emd2 = ot.emd2_1d([], [], u, v, metric='sqeuclidean', log=False) - wass1d_euc = ot.emd2_1d([], [], u, v, metric='euclidean', log=False) + wass1d_emd2 = ot.emd2_1d(u, v, [], [], metric='sqeuclidean', log=False) + wass1d_euc = ot.emd2_1d(u, v, [], [], metric='euclidean', log=False) # check loss is similar np.testing.assert_allclose(wass, wass1d) @@ -82,7 +82,7 @@ def test_emd_1d_emd2_1d(): # check AssertionError is raised if called on non 1d arrays u = np.random.randn(n, 2) v = np.random.randn(m, 2) - np.testing.assert_raises(AssertionError, ot.emd_1d, [], [], u, v) + np.testing.assert_raises(AssertionError, ot.emd_1d, u, v, [], []) def test_emd_empty(): -- cgit v1.2.3 From 71f9b5adfb8d8f4481948391f22e49f45494d071 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Mon, 24 Jun 2019 09:17:54 +0200 Subject: Added docstrings --- ot/lp/__init__.py | 114 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 22 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index e9635a1..a350d60 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -321,8 +321,8 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): .. math:: \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) - s.t. \gamma 1 = a - \gamma^T 1= b + s.t. \gamma 1 = a, + \gamma^T 1= b, \gamma\geq 0 where : @@ -330,28 +330,27 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): - x_a and x_b are the samples - a and b are the sample weights - Uses the algorithm proposed in [1]_ + Uses the algorithm detailed in [1]_ Parameters ---------- x_a : (ns,) or (ns, 1) ndarray, float64 - Source histogram (uniform weight if empty list) + Source dirac locations (on the real line) x_b : (nt,) or (ns, 1) ndarray, float64 - Target histogram (uniform weight if empty list) + Target dirac locations (on the real line) a : (ns,) ndarray, float64 Source histogram (uniform weight if empty list) b : (nt,) ndarray, float64 Target histogram (uniform weight if empty list) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in ... are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` - format. - Due to implementation details, this function runs faster when + format. 
Due to implementation details, this function runs faster when dense is set to False. - metric: str, optional (default='sqeuclidean') - Metric to be used. Has to be a string. - Due to implementation details, this function runs faster when - `'sqeuclidean'` or `'euclidean'` metrics are used. log: boolean, optional (default=False) If True, returns a dictionary containing the cost. Otherwise returns only the optimal transportation matrix. @@ -368,28 +367,28 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): -------- Simple example with obvious solution. The function emd_1d accepts lists and - perform automatic conversion to numpy arrays + performs automatic conversion to numpy arrays >>> import ot >>> a=[.5, .5] >>> b=[.5, .5] - >>> x_a = [0., 2.] + >>> x_a = [2., 0.] >>> x_b = [0., 3.] - >>> ot.emd_1d(a, b, x_a, x_b) - array([[ 0.5, 0. ], - [ 0. , 0.5]]) + >>> ot.emd_1d(x_a, x_b, a, b) + array([[0. , 0.5], + [0.5, 0. ]]) References ---------- - .. [1] TODO + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. See Also -------- ot.lp.emd : EMD for multidimensional distributions ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the transportation matrix) - """ a = np.asarray(a, dtype=np.float64) b = np.asarray(b, dtype=np.float64) @@ -418,10 +417,9 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), shape=(a.shape[0], b.shape[0])) if dense: - G = G.todense() + G = G.toarray() if log: - log = {} - log['cost'] = cost + log = {'cost': cost} return G, log return G @@ -430,10 +428,82 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss + + .. math:: + \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j]) + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - d is the metric + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) ndarray, float64 + Target histogram (uniform weight if empty list) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in ... are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Only used if log is set to True. Due to implementation details, + this function runs faster when dense is set to False. + log: boolean, optional (default=False) + If True, returns a dictionary containing the transportation matrix. + Otherwise returns only the loss. + + Returns + ------- + loss: float + Cost associated to the optimal transportation + log: dict + If input log is True, a dictionary containing the Optimal transportation + matrix for the given parameters + + + Examples + -------- + + Simple example with obvious solution. 
The function emd2_1d accepts lists and + performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.emd2_1d(x_a, x_b, a, b) + 0.5 + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd2 : EMD for multidimensional distributions + ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix + instead of the cost) """ # If we do not return G (log==False), then we should not to cast it to dense # (useless overhead) - G, log_emd = emd_1d(a=a, b=b, x_a=x_a, x_b=x_b, metric=metric, + G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, dense=dense and log, log=True) cost = log_emd['cost'] if log: -- cgit v1.2.3 From 77452dd92f607c3f18a6420cb8cd09fa5cd905a6 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Mon, 24 Jun 2019 13:09:36 +0200 Subject: Added more docstrings (Cython) + fixed link to ot.dist doc --- ot/lp/__init__.py | 4 ++-- ot/lp/emd_wrap.pyx | 28 +++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index a350d60..645ed8b 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -343,7 +343,7 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): b : (nt,) ndarray, float64 Target histogram (uniform weight if empty list) metric: str, optional (default='sqeuclidean') - Metric to be used. Only strings listed in ... are accepted. + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. dense: boolean, optional (default=True) @@ -454,7 +454,7 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): b : (nt,) ndarray, float64 Target histogram (uniform weight if empty list) metric: str, optional (default='sqeuclidean') - Metric to be used. Only strings listed in ... are accepted. + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. dense: boolean, optional (default=True) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 5e055fb..2825ba2 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -105,7 +105,33 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v, str metric='sqeuclidean'): r""" - Roro's stuff + Solves the Earth Movers distance problem between sorted 1d measures and + returns the OT matrix and the associated cost + + Parameters + ---------- + u_weights : (ns,) ndarray, float64 + Source histogram + v_weights : (nt,) ndarray, float64 + Target histogram + u : (ns,) ndarray, float64 + Source dirac locations (on the real line) + v : (nt,) ndarray, float64 + Target dirac locations (on the real line) + metric: str, optional (default='sqeuclidean') + Metric to be used. Only strings listed in :func:`ot.dist` are accepted. + Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. 
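Since a 1-d optimal plan has at most ns + nt - 1 non-zero entries (the length allocated for `G` in the Cython code above), the sparse output can be much lighter than the dense one for large problems. A usage sketch, assuming the `emd_1d` signature introduced in this changeset, with uniform weights when `a` and `b` are omitted:

```python
import numpy as np
import ot

rng = np.random.RandomState(0)
x_a = rng.randn(500)
x_b = rng.randn(600)

G_dense = ot.emd_1d(x_a, x_b)                  # (500, 600) dense plan
G_sparse = ot.emd_1d(x_a, x_b, dense=False)    # scipy.sparse coo_matrix

# both encode the same plan; the sparse one stores at most 500 + 600 - 1 entries
np.testing.assert_allclose(np.asarray(G_dense), G_sparse.toarray())
print(G_sparse.nnz)
```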
+ + Returns + ------- + gamma: (n, ) ndarray, float64 + Values in the Optimal transportation matrix + indices: (n, 2) ndarray, int64 + Indices of the values stored in gamma for the Optimal transportation + matrix + cost + cost associated to the optimal transportation """ cdef double cost = 0. cdef int n = u_weights.shape[0] -- cgit v1.2.3 From 0a039eb07a3ca9ae3c5635cca1719428f62bf67d Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Mon, 24 Jun 2019 13:15:38 +0200 Subject: Made weight vectors optional to match scipy's wass1d API --- ot/lp/__init__.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 645ed8b..bf218d3 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -313,7 +313,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): +def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix @@ -338,10 +338,10 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): Source dirac locations (on the real line) x_b : (nt,) or (ns, 1) ndarray, float64 Target dirac locations (on the real line) - a : (ns,) ndarray, float64 - Source histogram (uniform weight if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weight if empty list) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when @@ -375,6 +375,9 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): >>> x_a = [2., 0.] >>> x_b = [0., 3.] >>> ot.emd_1d(x_a, x_b, a, b) + array([[0. , 0.5], + [0.5, 0. ]]) + >>> ot.emd_1d(x_a, x_b) array([[0. , 0.5], [0.5, 0. 
]]) @@ -401,9 +404,9 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): "emd_1d should only be used with monodimensional data" # if empty array given then use uniform distributions - if len(a) == 0: + if a.ndim == 0 or len(a) == 0: a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0] - if len(b) == 0: + if b.ndim == 0 or len(b) == 0: b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] x_a_1d = x_a.reshape((-1, )) @@ -424,7 +427,7 @@ def emd_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): return G -def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): +def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss @@ -449,10 +452,10 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): Source dirac locations (on the real line) x_b : (nt,) or (ns, 1) ndarray, float64 Target dirac locations (on the real line) - a : (ns,) ndarray, float64 - Source histogram (uniform weight if empty list) - b : (nt,) ndarray, float64 - Target histogram (uniform weight if empty list) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when @@ -488,6 +491,8 @@ def emd2_1d(x_a, x_b, a, b, metric='sqeuclidean', dense=True, log=False): >>> x_b = [0., 3.] >>> ot.emd2_1d(x_a, x_b, a, b) 0.5 + >>> ot.emd2_1d(x_a, x_b) + 0.5 References ---------- -- cgit v1.2.3 From 1140141938c678d267f688dbb9106d3422d633c5 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 10:04:35 +0200 Subject: Added minkowski variants and wasserstein_1d functions --- ot/lp/__init__.py | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++--- ot/lp/emd_wrap.pyx | 10 ++- 2 files changed, 203 insertions(+), 10 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index bf218d3..719032b 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -21,7 +21,7 @@ from .cvx import barycenter from ..utils import dist __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', - 'emd_1d', 'emd2_1d'] + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d'] def emd(a, b, M, numItermax=100000, log=False): @@ -313,7 +313,8 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None return X -def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): +def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, + log=False): """Solves the Earth Movers distance problem between 1d measures and returns the OT matrix @@ -330,6 +331,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False - x_a and x_b are the samples - a and b are the sample weights + When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`. + Uses the algorithm detailed in [1]_ Parameters @@ -346,11 +349,14 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. 
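With the `'minkowski'` metric introduced here the ground cost is |x - y|**p, so p=2. reproduces `'sqeuclidean'` and p=1. reproduces `'cityblock'`. A quick consistency check, assuming the `emd2_1d` signature from this changeset (uniform weights when the histograms are omitted):

```python
import numpy as np
import ot

rng = np.random.RandomState(0)
x_a = rng.randn(20)
x_b = rng.randn(30)

# p=2 gives the squared Euclidean ground cost
np.testing.assert_allclose(
    ot.emd2_1d(x_a, x_b, metric='minkowski', p=2.),
    ot.emd2_1d(x_a, x_b, metric='sqeuclidean'))

# p=1 gives the absolute-difference (cityblock) ground cost
np.testing.assert_allclose(
    ot.emd2_1d(x_a, x_b, metric='minkowski', p=1.),
    ot.emd2_1d(x_a, x_b, metric='cityblock'))
```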
+ p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` format. Due to implementation details, this function runs faster when - dense is set to False. + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. log: boolean, optional (default=False) If True, returns a dictionary containing the cost. Otherwise returns only the optimal transportation matrix. @@ -416,7 +422,7 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False G_sorted, indices, cost = emd_1d_sorted(a, b, x_a_1d[perm_a], x_b_1d[perm_b], - metric=metric) + metric=metric, p=p) G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), shape=(a.shape[0], b.shape[0])) if dense: @@ -427,7 +433,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False return G -def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False): +def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, + log=False): """Solves the Earth Movers distance problem between 1d measures and returns the loss @@ -444,6 +451,8 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals - x_a and x_b are the samples - a and b are the sample weights + When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`. + Uses the algorithm detailed in [1]_ Parameters @@ -459,7 +468,10 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when - `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` @@ -508,10 +520,185 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals """ # If we do not return G (log==False), then we should not to cast it to dense # (useless overhead) - G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, + G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p, dense=dense and log, log=True) cost = log_emd['cost'] if log: log_emd = {'G': G} return cost, log_emd - return cost \ No newline at end of file + return cost + + +def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): + """Solves the Wasserstein distance problem between 1d measures and returns + the OT matrix + + + .. math:: + \gamma = arg\min_\gamma \left(\sum_i \sum_j \gamma_{ij} + |x_a[i] - x_b[j]|^p \right)^{1/p} + + s.t. 
\gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) + p: float, optional (default=1.0) + The order of the p-Wasserstein distance to be computed + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Due to implementation details, this function runs faster when + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + log: boolean, optional (default=False) + If True, returns a dictionary containing the cost. + Otherwise returns only the optimal transportation matrix. + + Returns + ------- + gamma: (ns, nt) ndarray + Optimal transportation matrix for the given parameters + log: dict + If input log is True, a dictionary containing the cost + + + Examples + -------- + + Simple example with obvious solution. The function wasserstein_1d accepts + lists and performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.wasserstein_1d(x_a, x_b, a, b) + array([[0. , 0.5], + [0.5, 0. ]]) + >>> ot.wasserstein_1d(x_a, x_b) + array([[0. , 0.5], + [0.5, 0. ]]) + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd_1d : EMD for 1d distributions + ot.lp.wasserstein2_1d : Wasserstein for 1d distributions (returns the cost + instead of the transportation matrix) + """ + if log: + G, log = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log) + log['cost'] = np.power(log['cost'], 1. / p) + return G, log + return emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log) + + +def wasserstein2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., + dense=True, log=False): + """Solves the Wasserstein distance problem between 1d measures and returns + the loss + + + .. math:: + \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} + |x_a[i] - x_b[j]|^p \right)^{1/p} + + s.t. \gamma 1 = a, + \gamma^T 1= b, + \gamma\geq 0 + where : + + - x_a and x_b are the samples + - a and b are the sample weights + + Uses the algorithm detailed in [1]_ + + Parameters + ---------- + x_a : (ns,) or (ns, 1) ndarray, float64 + Source dirac locations (on the real line) + x_b : (nt,) or (ns, 1) ndarray, float64 + Target dirac locations (on the real line) + a : (ns,) ndarray, float64, optional + Source histogram (default is uniform weight) + b : (nt,) ndarray, float64, optional + Target histogram (default is uniform weight) + p: float, optional (default=1.0) + The order of the p-Wasserstein distance to be computed + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Only used if log is set to True. Due to implementation details, + this function runs faster when dense is set to False. 
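The interaction between `log` and `dense` described above can be seen on `emd2_1d`, which shares it: the function returns only the cost, and the plan is exposed through the log dictionary, in the format selected by `dense`. A short sketch reusing the two-Dirac example from the docstrings:

```python
import ot

x_a = [2., 0.]
x_b = [0., 3.]

cost, log = ot.emd2_1d(x_a, x_b, log=True)
print(cost)                  # 0.5
print(log['G'])              # dense plan, since dense=True by default

cost, log = ot.emd2_1d(x_a, x_b, dense=False, log=True)
print(log['G'].toarray())    # same plan, logged as a scipy.sparse coo_matrix
```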
+ log: boolean, optional (default=False) + If True, returns a dictionary containing the transportation matrix. + Otherwise returns only the loss. + + Returns + ------- + loss: float + Cost associated to the optimal transportation + log: dict + If input log is True, a dictionary containing the Optimal transportation + matrix for the given parameters + + + Examples + -------- + + Simple example with obvious solution. The function wasserstein2_1d accepts + lists and performs automatic conversion to numpy arrays + + >>> import ot + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> x_a = [2., 0.] + >>> x_b = [0., 3.] + >>> ot.wasserstein2_1d(x_a, x_b, a, b) + 0.5 + >>> ot.wasserstein2_1d(x_a, x_b) + 0.5 + + References + ---------- + + .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal + Transport", 2018. + + See Also + -------- + ot.lp.emd2_1d : EMD for 1d distributions + ot.lp.wasserstein_1d : Wasserstein for 1d distributions (returns the + transportation matrix instead of the cost) + """ + if log: + cost, log = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log) + cost = np.power(cost, 1. / p) + return cost, log + return np.power(emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=dense, log=log), 1. / p) \ No newline at end of file diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 2825ba2..7134136 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -103,7 +103,8 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, np.ndarray[double, ndim=1, mode="c"] v_weights, np.ndarray[double, ndim=1, mode="c"] u, np.ndarray[double, ndim=1, mode="c"] v, - str metric='sqeuclidean'): + str metric='sqeuclidean', + double p=1.): r""" Solves the Earth Movers distance problem between sorted 1d measures and returns the OT matrix and the associated cost @@ -121,7 +122,10 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, metric: str, optional (default='sqeuclidean') Metric to be used. Only strings listed in :func:`ot.dist` are accepted. Due to implementation details, this function runs faster when - `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used. + `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics + are used. + p: float, optional (default=1.0) + The p-norm to apply for if metric='minkowski' Returns ------- @@ -154,6 +158,8 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, m_ij = (u[i] - v[j]) ** 2 elif metric == 'cityblock' or metric == 'euclidean': m_ij = abs(u[i] - v[j]) + elif metric == 'minkowski': + m_ij = abs(u[i] - v[j]) ** p else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] -- cgit v1.2.3 From 0d333e004636f5d25edea6bb195e8e4d9a95ba98 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 10:23:32 +0200 Subject: Improved tests and docs for wasserstein_1d --- ot/__init__.py | 5 +++-- ot/lp/__init__.py | 13 ++++++------- ot/lp/emd_wrap.pyx | 3 ++- test/test_ot.py | 23 +++++++++++++++++++++++ 4 files changed, 34 insertions(+), 10 deletions(-) (limited to 'ot/lp') diff --git a/ot/__init__.py b/ot/__init__.py index f0e526c..5bd9bb3 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -22,7 +22,7 @@ from . import smooth from . 
import stochastic # OT functions -from .lp import emd, emd2, emd_1d, emd2_1d +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d, wasserstein2_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .da import sinkhorn_lpl1_mm @@ -32,5 +32,6 @@ from .utils import dist, unif, tic, toc, toq __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', - 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', 'emd_1d', 'emd2_1d', + 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim'] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 719032b..76c9ec0 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -530,13 +530,13 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): - """Solves the Wasserstein distance problem between 1d measures and returns + """Solves the p-Wasserstein distance problem between 1d measures and returns the OT matrix .. math:: - \gamma = arg\min_\gamma \left(\sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \right)^{1/p} + \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} + |x_a[i] - x_b[j]|^p \\right)^{1/p} s.t. \gamma 1 = a, \gamma^T 1= b, @@ -617,15 +617,14 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): dense=dense, log=log) -def wasserstein2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., - dense=True, log=False): - """Solves the Wasserstein distance problem between 1d measures and returns +def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): + """Solves the p-Wasserstein distance problem between 1d measures and returns the loss .. math:: \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \right)^{1/p} + |x_a[i] - x_b[j]|^p \\right)^{1/p} s.t. 
\gamma 1 = a, \gamma^T 1= b, diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 7134136..42b848f 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -13,6 +13,7 @@ cimport numpy as np from ..utils import dist cimport cython +cimport libc.math as math import warnings @@ -159,7 +160,7 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, elif metric == 'cityblock' or metric == 'euclidean': m_ij = abs(u[i] - v[j]) elif metric == 'minkowski': - m_ij = abs(u[i] - v[j]) ** p + m_ij = math.pow(abs(u[i] - v[j]), p) else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] diff --git a/test/test_ot.py b/test/test_ot.py index 6d6ea26..48423e7 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -85,6 +85,29 @@ def test_emd_1d_emd2_1d(): np.testing.assert_raises(AssertionError, ot.emd_1d, u, v, [], []) +def test_wass_1d(): + # test emd1d gives similar results as emd + n = 20 + m = 30 + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) + + M = ot.dist(u, v, metric='sqeuclidean') + + G, log = ot.emd([], [], M, log=True) + wass = log["cost"] + + G_1d, log = ot.wasserstein_1d(u, v, [], [], p=2., log=True) + wass1d = log["cost"] + + # check loss is similar + np.testing.assert_allclose(np.sqrt(wass), wass1d) + + # check G is similar + np.testing.assert_allclose(G, G_1d) + + def test_emd_empty(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From c92e595009ad5e2ae6d4b2c040556cffb6316847 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 11:08:15 +0200 Subject: Wasserstein defined as the cost itself (do not return transportation matrix) --- ot/__init__.py | 4 +- ot/lp/__init__.py | 125 +++++------------------------------------------------- test/test_ot.py | 6 +-- 3 files changed, 13 insertions(+), 122 deletions(-) (limited to 'ot/lp') diff --git a/ot/__init__.py b/ot/__init__.py index 730aa4f..1b3c2fb 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -23,7 +23,7 @@ from . import stochastic from . import unbalanced # OT functions -from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d, wasserstein2_1d +from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d from .bregman import sinkhorn, sinkhorn2, barycenter from .unbalanced import sinkhorn_unbalanced, barycenter_unbalanced from .da import sinkhorn_lpl1_mm @@ -35,6 +35,6 @@ __version__ = "0.5.1" __all__ = ["emd", "emd2", 'emd_1d', "sinkhorn", "sinkhorn2", "utils", 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', - 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d', + 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'dist', 'unif', 'barycenter', 'sinkhorn_lpl1_mm', 'da', 'optim', 'sinkhorn_unbalanced', "barycenter_unbalanced"] diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 76c9ec0..a3f5b8d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -21,7 +21,7 @@ from .cvx import barycenter from ..utils import dist __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', - 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d'] + 'emd_1d', 'emd2_1d', 'wasserstein_1d'] def emd(a, b, M, numItermax=100000, log=False): @@ -529,9 +529,9 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, return cost -def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): +def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.): """Solves the p-Wasserstein distance problem between 1d measures and returns - the OT matrix + the distance .. 
math:: @@ -560,22 +560,11 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): Target histogram (default is uniform weight) p: float, optional (default=1.0) The order of the p-Wasserstein distance to be computed - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. Due to implementation details, this function runs faster when - `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics - are used. - log: boolean, optional (default=False) - If True, returns a dictionary containing the cost. - Otherwise returns only the optimal transportation matrix. Returns ------- - gamma: (ns, nt) ndarray - Optimal transportation matrix for the given parameters - log: dict - If input log is True, a dictionary containing the cost + dist: float + p-Wasserstein distance Examples @@ -590,96 +579,8 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): >>> x_a = [2., 0.] >>> x_b = [0., 3.] >>> ot.wasserstein_1d(x_a, x_b, a, b) - array([[0. , 0.5], - [0.5, 0. ]]) - >>> ot.wasserstein_1d(x_a, x_b) - array([[0. , 0.5], - [0.5, 0. ]]) - - References - ---------- - - .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal - Transport", 2018. - - See Also - -------- - ot.lp.emd_1d : EMD for 1d distributions - ot.lp.wasserstein2_1d : Wasserstein for 1d distributions (returns the cost - instead of the transportation matrix) - """ - if log: - G, log = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - log['cost'] = np.power(log['cost'], 1. / p) - return G, log - return emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - - -def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): - """Solves the p-Wasserstein distance problem between 1d measures and returns - the loss - - - .. math:: - \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \\right)^{1/p} - - s.t. \gamma 1 = a, - \gamma^T 1= b, - \gamma\geq 0 - where : - - - x_a and x_b are the samples - - a and b are the sample weights - - Uses the algorithm detailed in [1]_ - - Parameters - ---------- - x_a : (ns,) or (ns, 1) ndarray, float64 - Source dirac locations (on the real line) - x_b : (nt,) or (ns, 1) ndarray, float64 - Target dirac locations (on the real line) - a : (ns,) ndarray, float64, optional - Source histogram (default is uniform weight) - b : (nt,) ndarray, float64, optional - Target histogram (default is uniform weight) - p: float, optional (default=1.0) - The order of the p-Wasserstein distance to be computed - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. Only used if log is set to True. Due to implementation details, - this function runs faster when dense is set to False. - log: boolean, optional (default=False) - If True, returns a dictionary containing the transportation matrix. - Otherwise returns only the loss. - - Returns - ------- - loss: float - Cost associated to the optimal transportation - log: dict - If input log is True, a dictionary containing the Optimal transportation - matrix for the given parameters - - - Examples - -------- - - Simple example with obvious solution. 
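With the semantics adopted in this commit (`wasserstein_1d` returning the distance itself rather than the plan), the p=1 case with uniform weights should agree with `scipy.stats.wasserstein_distance`, mirroring the check already used in the test suite for the Euclidean metric. A small sketch:

```python
import numpy as np
from scipy.stats import wasserstein_distance
import ot

rng = np.random.RandomState(0)
u = rng.randn(20)
v = rng.randn(30)

w_pot = ot.wasserstein_1d(u, v, p=1.)   # uniform weights by default
w_scipy = wasserstein_distance(u, v)
np.testing.assert_allclose(w_pot, w_scipy)
```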
The function wasserstein2_1d accepts - lists and performs automatic conversion to numpy arrays - - >>> import ot - >>> a=[.5, .5] - >>> b=[.5, .5] - >>> x_a = [2., 0.] - >>> x_b = [0., 3.] - >>> ot.wasserstein2_1d(x_a, x_b, a, b) 0.5 - >>> ot.wasserstein2_1d(x_a, x_b) + >>> ot.wasserstein_1d(x_a, x_b) 0.5 References @@ -690,14 +591,8 @@ def wasserstein2_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False): See Also -------- - ot.lp.emd2_1d : EMD for 1d distributions - ot.lp.wasserstein_1d : Wasserstein for 1d distributions (returns the - transportation matrix instead of the cost) + ot.lp.emd_1d : EMD for 1d distributions """ - if log: - cost, log = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log) - cost = np.power(cost, 1. / p) - return cost, log - return np.power(emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, - dense=dense, log=log), 1. / p) \ No newline at end of file + cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p, + dense=False, log=False) + return np.power(cost_emd, 1. / p) diff --git a/test/test_ot.py b/test/test_ot.py index 48423e7..3c4ac11 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -98,15 +98,11 @@ def test_wass_1d(): G, log = ot.emd([], [], M, log=True) wass = log["cost"] - G_1d, log = ot.wasserstein_1d(u, v, [], [], p=2., log=True) - wass1d = log["cost"] + wass1d = ot.wasserstein_1d(u, v, [], [], p=2.) # check loss is similar np.testing.assert_allclose(np.sqrt(wass), wass1d) - # check G is similar - np.testing.assert_allclose(G, G_1d) - def test_emd_empty(): # test emd and emd2 for simple identity -- cgit v1.2.3 From 362a7f8fa20cf7ae6f2e36d7e47c7ca9f81d3c51 Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Thu, 27 Jun 2019 13:29:19 +0200 Subject: Added RT as a contributor + "optimized" Cython math operations --- README.md | 1 + ot/lp/emd_wrap.pyx | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'ot/lp') diff --git a/README.md b/README.md index d24d8b9..84148f8 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,7 @@ The contributors to this library are: * [Alain Rakotomamonjy](https://sites.google.com/site/alainrakotomamonjy/home) * [Vayer Titouan](https://tvayer.github.io/) * [Hicham Janati](https://hichamjanati.github.io/) (Unbalanced OT) +* [Romain Tavenard](https://rtavenar.github.io/) (1d Wasserstein) This toolbox benefit a lot from open source research and we would like to thank the following persons for providing some code (in various languages): diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 42b848f..8a4aec9 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -156,11 +156,11 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights, cdef int cur_idx = 0 while i < n and j < m: if metric == 'sqeuclidean': - m_ij = (u[i] - v[j]) ** 2 + m_ij = (u[i] - v[j]) * (u[i] - v[j]) elif metric == 'cityblock' or metric == 'euclidean': - m_ij = abs(u[i] - v[j]) + m_ij = math.fabs(u[i] - v[j]) elif metric == 'minkowski': - m_ij = math.pow(abs(u[i] - v[j]), p) + m_ij = math.pow(math.fabs(u[i] - v[j]), p) else: m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)), metric=metric)[0, 0] -- cgit v1.2.3 From 3a84263ffe4bbf2ac055dfd5a84e1b65c14f9cda Mon Sep 17 00:00:00 2001 From: Romain Tavenard Date: Mon, 1 Jul 2019 14:23:14 +0200 Subject: Set numpy array formatting version to post-1.13 --- .travis.yml | 2 ++ ot/bregman.py | 86 +++++++++++++++++++++++++++---------------------------- ot/lp/__init__.py | 22 +++++++------- ot/unbalanced.py | 10 
+++---- 4 files changed, 61 insertions(+), 59 deletions(-) (limited to 'ot/lp') diff --git a/.travis.yml b/.travis.yml index da68c96..f004a32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,6 +26,8 @@ before_script: # configure a headless display to test plot generation # command to install dependencies install: - pip install -r requirements.txt + - pip install numpy>=1.14 # for numpy array formatting in doctests + - pip install scipy<1.3 # otherwise, pymanopt fails, cf - pip install flake8 pytest "pytest-cov<2.6" - pip install . # command to run tests + check syntax style diff --git a/ot/bregman.py b/ot/bregman.py index 8225967..caf4024 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -16,7 +16,7 @@ from .utils import unif, dist def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The function solves the following optimization problem: @@ -73,12 +73,12 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -131,7 +131,7 @@ def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the entropic regularization optimal transport problem and return the loss The function solves the following optimization problem: @@ -188,11 +188,11 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn2(a,b,M,1) - array([ 0.26894142]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn2(a, b, M, 1) + array([0.26894142]) @@ -248,7 +248,7 @@ def sinkhorn2(a, b, M, reg, method='sinkhorn', numItermax=1000, def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - """ + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The function solves the following optimization problem: @@ -302,12 +302,12 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.sinkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.sinkhorn(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -422,7 +422,7 @@ def sinkhorn_knopp(a, b, M, reg, numItermax=1000, def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log=False): - """ + r""" Solve the entropic regularization optimal transport problem and return the OT matrix The algorithm used is based on the paper @@ -481,12 +481,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.bregman.greenkhorn(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.bregman.greenkhorn(a, b, 
M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -576,7 +576,7 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, log= def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=20, log=False, **kwargs): - """ + r""" Solve the entropic regularization OT problem with log stabilization The function solves the following optimization problem: @@ -639,8 +639,8 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, >>> b=[.5,.5] >>> M=[[0.,1.],[1.,0.]] >>> ot.bregman.sinkhorn_stabilized(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -796,7 +796,7 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInnerItermax=100, tau=1e3, stopThr=1e-9, warmstart=None, verbose=False, print_period=10, log=False, **kwargs): - """ + r""" Solve the entropic regularization optimal transport problem with log stabilization and epsilon scaling. @@ -862,12 +862,12 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, numInne -------- >>> import ot - >>> a=[.5,.5] - >>> b=[.5,.5] - >>> M=[[0.,1.],[1.,0.]] - >>> ot.bregman.sinkhorn_epsilon_scaling(a,b,M,1) - array([[ 0.36552929, 0.13447071], - [ 0.13447071, 0.36552929]]) + >>> a=[.5, .5] + >>> b=[.5, .5] + >>> M=[[0., 1.], [1., 0.]] + >>> ot.bregman.sinkhorn_epsilon_scaling(a, b, M, 1) + array([[0.36552929, 0.13447071], + [0.13447071, 0.36552929]]) References @@ -989,7 +989,7 @@ def projC(gamma, q): def barycenter(A, M, reg, weights=None, numItermax=1000, stopThr=1e-4, verbose=False, log=False): - """Compute the entropic regularized wasserstein barycenter of distributions A + r"""Compute the entropic regularized wasserstein barycenter of distributions A The function solves the following optimization problem: @@ -1084,7 +1084,7 @@ def barycenter(A, M, reg, weights=None, numItermax=1000, def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1e-9, stabThr=1e-30, verbose=False, log=False): - """Compute the entropic regularized wasserstein barycenter of distributions A + r"""Compute the entropic regularized wasserstein barycenter of distributions A where A is a collection of 2D images. 
The function solves the following optimization problem: @@ -1195,7 +1195,7 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, stopThr=1 def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, stopThr=1e-3, verbose=False, log=False): - """ + r""" Compute the unmixing of an observation with a given dictionary using Wasserstein distance The function solve the following optimization problem: @@ -1302,7 +1302,7 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): - ''' + r''' Solve the entropic regularization optimal transport problem and return the OT matrix from empirical data @@ -1391,7 +1391,7 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numI def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): - ''' + r''' Solve the entropic regularization optimal transport problem from empirical data and return the OT loss @@ -1480,7 +1480,7 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): - ''' + r''' Compute the sinkhorn divergence loss from empirical data The function solves the following optimization problems and return the diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index a3f5b8d..8ec286b 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -25,7 +25,7 @@ __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', def emd(a, b, M, numItermax=100000, log=False): - """Solves the Earth Movers distance problem and returns the OT matrix + r"""Solves the Earth Movers distance problem and returns the OT matrix .. math:: @@ -76,8 +76,8 @@ def emd(a, b, M, numItermax=100000, log=False): >>> b=[.5,.5] >>> M=[[0.,1.],[1.,0.]] >>> ot.emd(a,b,M) - array([[ 0.5, 0. ], - [ 0. , 0.5]]) + array([[0.5, 0. ], + [0. , 0.5]]) References ---------- @@ -117,7 +117,7 @@ def emd(a, b, M, numItermax=100000, log=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), numItermax=100000, log=False, return_matrix=False): - """Solves the Earth Movers distance problem and returns the loss + r"""Solves the Earth Movers distance problem and returns the loss .. math:: \gamma = arg\min_\gamma <\gamma,M>_F @@ -315,7 +315,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, log=False): - """Solves the Earth Movers distance problem between 1d measures and returns + r"""Solves the Earth Movers distance problem between 1d measures and returns the OT matrix @@ -381,11 +381,11 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, >>> x_a = [2., 0.] >>> x_b = [0., 3.] >>> ot.emd_1d(x_a, x_b, a, b) - array([[0. , 0.5], - [0.5, 0. ]]) + array([[0. , 0.5], + [0.5, 0. ]]) >>> ot.emd_1d(x_a, x_b) - array([[0. , 0.5], - [0.5, 0. ]]) + array([[0. , 0.5], + [0.5, 0. 
]]) References ---------- @@ -435,7 +435,7 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, log=False): - """Solves the Earth Movers distance problem between 1d measures and returns + r"""Solves the Earth Movers distance problem between 1d measures and returns the loss @@ -530,7 +530,7 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.): - """Solves the p-Wasserstein distance problem between 1d measures and returns + r"""Solves the p-Wasserstein distance problem between 1d measures and returns the distance diff --git a/ot/unbalanced.py b/ot/unbalanced.py index 484ce95..b2b7b10 100644 --- a/ot/unbalanced.py +++ b/ot/unbalanced.py @@ -13,7 +13,7 @@ import numpy as np def sinkhorn_unbalanced(a, b, M, reg, alpha, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the unbalanced entropic regularization optimal transport problem and return the loss The function solves the following optimization problem: @@ -75,7 +75,7 @@ def sinkhorn_unbalanced(a, b, M, reg, alpha, method='sinkhorn', numItermax=1000, >>> M=[[0., 1.], [1., 0.]] >>> ot.sinkhorn_unbalanced(a, b, M, 1, 1) array([[0.51122823, 0.18807035], - [0.18807035, 0.51122823]]) + [0.18807035, 0.51122823]]) References @@ -122,7 +122,7 @@ def sinkhorn_unbalanced(a, b, M, reg, alpha, method='sinkhorn', numItermax=1000, def sinkhorn_unbalanced2(a, b, M, reg, alpha, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - u""" + r""" Solve the entropic regularization unbalanced optimal transport problem and return the loss The function solves the following optimization problem: @@ -233,7 +233,7 @@ def sinkhorn_unbalanced2(a, b, M, reg, alpha, method='sinkhorn', def sinkhorn_knopp_unbalanced(a, b, M, reg, alpha, numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): - """ + r""" Solve the entropic regularization unbalanced optimal transport problem and return the loss The function solves the following optimization problem: @@ -401,7 +401,7 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, alpha, numItermax=1000, def barycenter_unbalanced(A, M, reg, alpha, weights=None, numItermax=1000, stopThr=1e-4, verbose=False, log=False): - """Compute the entropic regularized unbalanced wasserstein barycenter of distributions A + r"""Compute the entropic regularized unbalanced wasserstein barycenter of distributions A The function solves the following optimization problem: -- cgit v1.2.3 From 7402d344240ce94e33c53daff419d4356278d48f Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Wed, 3 Jul 2019 14:11:16 +0200 Subject: doc in modules --- ot/__init__.py | 29 ++++++++++++++++++++++++++++- ot/dr.py | 6 ++++++ ot/gpu/__init__.py | 4 ++-- ot/lp/__init__.py | 5 ++--- ot/plot.py | 6 ++++++ ot/stochastic.py | 6 ++++++ ot/utils.py | 2 +- 7 files changed, 51 insertions(+), 7 deletions(-) (limited to 'ot/lp') diff --git a/ot/__init__.py b/ot/__init__.py index 35d2ddd..35ae6fc 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -1,7 +1,33 @@ """ This is the main module of the POT toolbox. It provides easy access to -a number of functions described below. +a number of sub-modules and functions described below. + +.. note:: + + + Here is a list of the submodules and short description of what they contain. + + - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. 
+ - :any:`ot.bregman` contains OT solvers for the entropic OT problems using + Bregman projections. + - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. + - :any:`ot.smooth` contains OT solvers for the regularized (l2 and kl) smooth OT + problems. + - :any:`ot.gromov` contains solvers for Gromov-Wasserstein and Fused Gromov + Wasserstein problems. + - :any:`ot.optim` contains generic solvers OT based optimization problems + - :any:`ot.da` contains classes and function related to Monge mapping + estimation and Domain Adaptation (DA). + - :any:`ot.gpu` contains GPU (cupy) implementation of some OT solvers + - :any:`ot.dr` contains Dimension Reduction (DR) methods such as Wasserstein + Discriminant Analysis. + - :any:`ot.utils` contains utility functions such as distance computation and + timing. + - :any:`ot.datasets` contains toy dataset generation functions. + - :any:`ot.plot` contains visualization functions + - :any:`ot.stochastic` contains stochastic solvers for regularized OT. + - :any:`ot.unbalanced` contains solvers for regularized unbalanced OT. .. warning:: The list of automatically imported sub-modules is as follows: @@ -14,6 +40,7 @@ a number of functions described below. - :any:`ot.dr` : depends on :code:`pymanopt` and :code:`autograd`. - :any:`ot.gpu` : depends on :code:`cupy` and a CUDA GPU. + - :any:`ot.plot` : depends on :code:`matplotlib` """ diff --git a/ot/dr.py b/ot/dr.py index d30ab30..d2bf6e2 100644 --- a/ot/dr.py +++ b/ot/dr.py @@ -1,6 +1,12 @@ # -*- coding: utf-8 -*- """ Dimension reduction with optimal transport + + +.. warning:: + Note that by default the module is not import in :mod:`ot`. In order to + use it you need to explicitely import :mod:`ot.dr` + """ # Author: Remi Flamary diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py index 6a2afcf..1ab95bb 100644 --- a/ot/gpu/__init__.py +++ b/ot/gpu/__init__.py @@ -6,8 +6,8 @@ functions. The GPU backend in handled by `cupy `_. .. warning:: - Note that by default the module is not import in :mod:`ot`. in order to - use it you need to import :mod:`ot.gpu` . + Note that by default the module is not import in :mod:`ot`. In order to + use it you need to explicitely import :mod:`ot.gpu` . By default, the functions in this module accept and return numpy arrays in order to proide drop-in replacement for the other POT function but diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index aed29f8..17f1731 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -544,14 +544,13 @@ def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.): r"""Solves the p-Wasserstein distance problem between 1d measures and returns the distance - .. math:: - \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij} - |x_a[i] - x_b[j]|^p \\right)^{1/p} + \min_\gamma \left( \sum_i \sum_j \gamma_{ij} \|x_a[i] - x_b[j]\|^p \right)^{1/p} s.t. \gamma 1 = a, \gamma^T 1= b, \gamma\geq 0 + where : - x_a and x_b are the samples diff --git a/ot/plot.py b/ot/plot.py index 784a372..a409d4a 100644 --- a/ot/plot.py +++ b/ot/plot.py @@ -1,5 +1,11 @@ """ Functions for plotting OT matrices + +.. warning:: + Note that by default the module is not import in :mod:`ot`. In order to + use it you need to explicitely import :mod:`ot.plot` + + """ # Author: Remi Flamary diff --git a/ot/stochastic.py b/ot/stochastic.py index bf3e7a7..5754968 100644 --- a/ot/stochastic.py +++ b/ot/stochastic.py @@ -1,3 +1,9 @@ +""" +Stochastic solvers for regularized OT. 
+ + +""" + # Author: Kilian Fatras # # License: MIT License diff --git a/ot/utils.py b/ot/utils.py index f21ceb9..e8249ef 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Various function that can be usefull +Various useful functions """ # Author: Remi Flamary -- cgit v1.2.3 From 7ac1b462d23ae0a396742bba4773e146e60e7502 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Fri, 5 Jul 2019 13:47:43 +0200 Subject: cleanup parmap on windows --- .travis.yml | 51 ++++++++++++++++++++++++++------------------------- ot/lp/__init__.py | 14 ++++++++++++-- ot/utils.py | 31 ++++++++++++++++++------------- requirements.txt | 1 + 4 files changed, 57 insertions(+), 40 deletions(-) (limited to 'ot/lp') diff --git a/.travis.yml b/.travis.yml index 67f0c43..cddf0e0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,31 +1,32 @@ dist: xenial # required for Python >= 3.7 language: python matrix: -# allow_failures: -# - os: osx - include: -# - os: osx -# language: generic - - os: linux - sudo: required - python: 3.4 - - os: linux - sudo: required - python: 3.5 - - os: linux - sudo: required - python: 3.6 - - os: linux - sudo: required - python: 3.7 - - os: linux - sudo: required - python: 2.7 - - name: "Python 3.7.3 on Windows" - os: windows # Windows 10.0.17134 N/A Build 17134 - language: shell # 'language: python' is an error on Travis CI Windows - before_install: choco install python - env: PATH=/c/Python37:/c/Python37/Scripts:$PATH + allow_failures: + - os: osx + - os: windows + include: + - os: osx + language: generic + - os: linux + sudo: required + python: 3.4 + - os: linux + sudo: required + python: 3.5 + - os: linux + sudo: required + python: 3.6 + - os: linux + sudo: required + python: 3.7 + - os: linux + sudo: required + python: 2.7 + - name: "Python 3.7.3 on Windows" + os: windows # Windows 10.0.17134 N/A Build 17134 + language: shell # 'language: python' is an error on Travis CI Windows + before_install: choco install python + env: PATH=/c/Python37:/c/Python37/Scripts:$PATH before_install: - ./.travis/before_install.sh # command to install dependencies diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 17f1731..0c92810 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -11,7 +11,7 @@ Solvers for the original linear program OT problem # License: MIT License import multiprocessing - +import sys import numpy as np from scipy.sparse import coo_matrix @@ -151,6 +151,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), Target histogram (uniform weight if empty list) M : (ns,nt) numpy.ndarray, float64 Loss matrix (c-order array with type float64) + processes : int, optional (default=nb cpu) + Nb of processes used for multiple emd computation (not used on windows) numItermax : int, optional (default=100000) The maximum number of iterations before stopping the optimization algorithm if it has not converged. 
@@ -200,6 +202,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) + # problem with pikling Forks + if sys.platform.endswith('win32'): + processes=1 + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] @@ -228,7 +234,11 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return f(b) nb = b.shape[1] - res = parmap(f, [b[:, i] for i in range(nb)], processes) + if processes>1: + res = parmap(f, [b[:, i] for i in range(nb)], processes) + else: + res = list(map(f, [b[:, i].copy() for i in range(nb)])) + return res diff --git a/ot/utils.py b/ot/utils.py index e8249ef..5707d9b 100644 --- a/ot/utils.py +++ b/ot/utils.py @@ -214,23 +214,28 @@ def fun(f, q_in, q_out): def parmap(f, X, nprocs=multiprocessing.cpu_count()): - """ paralell map for multiprocessing """ - q_in = multiprocessing.Queue(1) - q_out = multiprocessing.Queue() + """ paralell map for multiprocessing (only map on windows)""" - proc = [multiprocessing.Process(target=fun, args=(f, q_in, q_out)) - for _ in range(nprocs)] - for p in proc: - p.daemon = True - p.start() + if not sys.platform.endswith('win32'): - sent = [q_in.put((i, x)) for i, x in enumerate(X)] - [q_in.put((None, None)) for _ in range(nprocs)] - res = [q_out.get() for _ in range(len(sent))] + q_in = multiprocessing.Queue(1) + q_out = multiprocessing.Queue() - [p.join() for p in proc] + proc = [multiprocessing.Process(target=fun, args=(f, q_in, q_out)) + for _ in range(nprocs)] + for p in proc: + p.daemon = True + p.start() - return [x for i, x in sorted(res)] + sent = [q_in.put((i, x)) for i, x in enumerate(X)] + [q_in.put((None, None)) for _ in range(nprocs)] + res = [q_out.get() for _ in range(len(sent))] + + [p.join() for p in proc] + + return [x for i, x in sorted(res)] + else: + return list(map(f, X)) def check_params(**kwargs): diff --git a/requirements.txt b/requirements.txt index 97d165b..5a3432b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ matplotlib sphinx-gallery autograd pymanopt +cvxopt pytest -- cgit v1.2.3 From e92ae6d155a6bed91c474a3e842581f09deceba3 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Fri, 29 Nov 2019 09:38:29 +0100 Subject: cleanup cpp code and annd emd with sparse Ot matrix --- ot/lp/EMD_wrapper.cpp | 95 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 20 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index fc7ca63..91110b4 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -17,18 +17,24 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) { -// beware M and C anre strored in row major C style!!! + // beware M and C anre strored in row major C style!!! 
int n, m, i, cur; typedef FullBipartiteDigraph Digraph; - DIGRAPH_TYPEDEFS(FullBipartiteDigraph); + DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - // Get the number of non zero coordinates for r and c + std::vector indI(n), indJ(m); + std::vector weights1(n), weights2(m); + Digraph di(n, m); + NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); + + // Get the number of non zero coordinates for r and c and vectors n=0; for (int i=0; i0) { - n++; + weights1[ n ] = val; + indI[n++]=i; }else if(val<0){ return INFEASIBLE; } @@ -37,42 +43,85 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, for (int i=0; i0) { - m++; + weights2[ m ] = -val; + indJ[m++]=i; }else if(val<0){ return INFEASIBLE; } } // Define the graph + net.supplyMap(&weights1[0], n, &weights2[0], m); + + // Set the cost of each edge + for (int i=0; i indI(n), indJ(m); std::vector weights1(n), weights2(m); Digraph di(n, m); NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); - // Set supply and demand, don't account for 0 values (faster) - - cur=0; + // Get the number of non zero coordinates for r and c and vectors + n=0; for (int i=0; i0) { - weights1[ cur ] = val; - indI[cur++]=i; - } + weights1[ n ] = val; + indI[n++]=i; + }else if(val<0){ + return INFEASIBLE; + } } - - // Demand is actually negative supply... - - cur=0; + m=0; for (int i=0; i0) { - weights2[ cur ] = -val; - indJ[cur++]=i; - } + weights2[ m ] = -val; + indJ[m++]=i; + }else if(val<0){ + return INFEASIBLE; + } } - + // Define the graph net.supplyMap(&weights1[0], n, &weights2[0], m); // Set the cost of each edge @@ -90,14 +139,19 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) { *cost = 0; Arc a; di.first(a); + cur=0 for (; a != INVALID; di.next(a)) { int i = di.source(a); int j = di.target(a); double flow = net.flow(a); *cost += flow * (*(D+indI[i]*n2+indJ[j-n])); - *(G+indI[i]*n2+indJ[j-n]) = flow; + + *(G+cur) = flow; + *(iG+cur) = i; + *(jG+cur) = j; *(alpha + indI[i]) = -net.potential(i); *(beta + indJ[j-n]) = net.potential(j); + cur++; } } @@ -105,3 +159,4 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, return ret; } + -- cgit v1.2.3 From df0d259ebab268517716d666ae45494b6ba998ea Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Fri, 29 Nov 2019 09:41:45 +0100 Subject: cant beleive i forgot a semicolumn --- ot/lp/EMD_wrapper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 91110b4..29e2303 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -139,7 +139,7 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) { *cost = 0; Arc a; di.first(a); - cur=0 + cur=0; for (; a != INVALID; di.next(a)) { int i = di.source(a); int j = di.target(a); -- cgit v1.2.3 From 3a858dfa1f2795b22d1e2db3cfd94d1eb7831f8d Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Fri, 29 Nov 2019 09:46:35 +0100 Subject: correct bad speedup --- ot/lp/EMD_wrapper.cpp | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 29e2303..65fa80f 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -18,23 +18,17 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, double* alpha, double* beta, double *cost, 
int maxIter) { // beware M and C anre strored in row major C style!!! - int n, m, i, cur; + int n, m, i, cur; typedef FullBipartiteDigraph Digraph; DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - std::vector indI(n), indJ(m); - std::vector weights1(n), weights2(m); - Digraph di(n, m); - NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); - - // Get the number of non zero coordinates for r and c and vectors + // Get the number of non zero coordinates for r and c n=0; for (int i=0; i0) { - weights1[ n ] = val; - indI[n++]=i; + n++; }else if(val<0){ return INFEASIBLE; } @@ -43,14 +37,42 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, for (int i=0; i0) { - weights2[ m ] = -val; - indJ[m++]=i; + m++; }else if(val<0){ return INFEASIBLE; } } // Define the graph + + std::vector indI(n), indJ(m); + std::vector weights1(n), weights2(m); + Digraph di(n, m); + NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); + + // Set supply and demand, don't account for 0 values (faster) + + cur=0; + for (int i=0; i0) { + weights1[ cur ] = val; + indI[cur++]=i; + } + } + + // Demand is actually negative supply... + + cur=0; + for (int i=0; i0) { + weights2[ cur ] = -val; + indJ[cur++]=i; + } + } + + net.supplyMap(&weights1[0], n, &weights2[0], m); // Set the cost of each edge @@ -147,8 +169,8 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, *cost += flow * (*(D+indI[i]*n2+indJ[j-n])); *(G+cur) = flow; - *(iG+cur) = i; - *(jG+cur) = j; + *(iG+cur) = indI[i]; + *(jG+cur) = indJ[j]; *(alpha + indI[i]) = -net.potential(i); *(beta + indJ[j-n]) = net.potential(j); cur++; -- cgit v1.2.3 From 4a6883e0ce2fd9f3edd374d54c4c219d876ceb76 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 09:37:54 +0100 Subject: nothing explodes and it compiles --- ot/lp/EMD.h | 4 ++++ ot/lp/EMD_wrapper.cpp | 2 +- ot/lp/__init__.py | 29 ++++++++++++++++++++++------- ot/lp/emd_wrap.pyx | 38 +++++++++++++++++++++++++++++++++----- 4 files changed, 60 insertions(+), 13 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index f42e222..bc513d2 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -32,4 +32,8 @@ enum ProblemType { int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter); +int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, + long *iG, long *jG, double *G, + double* alpha, double* beta, double *cost, int maxIter); + #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 65fa80f..3ca7319 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -108,7 +108,7 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - int *iG, int *jG, double *G, + long *iG, long *jG, double *G, double* alpha, double* beta, double *cost, int maxIter) { // beware M and C anre strored in row major C style!!! 
int n, m, i, cur; diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 0c92810..4fec7d9 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -27,7 +27,7 @@ __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False): +def emd(a, b, M, numItermax=100000, log=False, sparse=False): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -109,7 +109,12 @@ def emd(a, b, M, numItermax=100000, log=False): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + if sparse: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + else: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + result_code_string = check_result(result_code) if log: log = {} @@ -123,7 +128,7 @@ def emd(a, b, M, numItermax=100000, log=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, return_matrix=False): + numItermax=100000, log=False, sparse=False, return_matrix=False): r"""Solves the Earth Movers distance problem and returns the loss .. math:: @@ -214,19 +219,29 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): - G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) - result_code_string = check_result(resultCode) + + if sparse: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + else: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + + result_code_string = check_result(result_code) log = {} if return_matrix: log['G'] = G log['u'] = u log['v'] = v log['warning'] = result_code_string - log['result_code'] = resultCode + log['result_code'] = result_code return [cost, log] else: def f(b): - G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + if sparse: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + else: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) check_result(result_code) return cost diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 2b6c495..345cb66 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -20,6 +20,9 @@ import warnings cdef extern from "EMD.h": int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) + int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, + long *iG, long *jG, double *G, + double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED @@ -39,7 +42,7 @@ def check_result(result_code): @cython.boundscheck(False) @cython.wraparound(False) -def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter): +def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint sparse): """ Solves the Earth Movers distance problem and returns the optimal transport matrix @@ -82,12 +85,18 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod """ cdef int n1= M.shape[0] cdef int n2= M.shape[1] + cdef int nmax=n1+n2-1 + cdef int result_code = 
0 cdef double cost=0 - cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([n1, n2]) cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1) cdef np.ndarray[double, ndim=1, mode="c"] beta=np.zeros(n2) + cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([0, 0]) + + cdef np.ndarray[double, ndim=1, mode="c"] Gv=np.zeros(0) + cdef np.ndarray[long, ndim=1, mode="c"] iG=np.zeros(0,dtype=np.int) + cdef np.ndarray[long, ndim=1, mode="c"] jG=np.zeros(0,dtype=np.int) if not len(a): a=np.ones((n1,))/n1 @@ -95,10 +104,29 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 - # calling the function - cdef int result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) + if sparse: + + Gv=np.zeros(nmax) + iG=np.zeros(nmax,dtype=np.int) + jG=np.zeros(nmax,dtype=np.int) + + + result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, G.data, alpha.data, beta.data, &cost, max_iter) + + + return Gv, iG, jG, cost, alpha, beta, result_code + + + else: + + + G=np.zeros([n1, n2]) + + + # calling the function + result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) - return G, cost, alpha, beta, result_code + return G, cost, alpha, beta, result_code @cython.boundscheck(False) -- cgit v1.2.3 From 57321bd0172c97b77dfc8b14972c18d063b6dda8 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 11:13:07 +0100 Subject: add awesome sparse solver --- ot/lp/EMD_wrapper.cpp | 65 ++++++++++++++++++++++++++++++++++++--------------- ot/lp/emd_wrap.pyx | 2 +- test/test_ot.py | 20 ++++++++++++++++ 3 files changed, 67 insertions(+), 20 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 3ca7319..2aa44c1 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -111,23 +111,19 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, long *iG, long *jG, double *G, double* alpha, double* beta, double *cost, int maxIter) { // beware M and C anre strored in row major C style!!! - int n, m, i, cur; + + // Get the number of non zero coordinates for r and c and vectors + int n, m, i, cur; typedef FullBipartiteDigraph Digraph; DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - std::vector indI(n), indJ(m); - std::vector weights1(n), weights2(m); - Digraph di(n, m); - NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); - - // Get the number of non zero coordinates for r and c and vectors + // Get the number of non zero coordinates for r and c n=0; for (int i=0; i0) { - weights1[ n ] = val; - indI[n++]=i; + n++; }else if(val<0){ return INFEASIBLE; } @@ -136,13 +132,41 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, for (int i=0; i0) { - weights2[ m ] = -val; - indJ[m++]=i; + m++; }else if(val<0){ return INFEASIBLE; } } + // Define the graph + + std::vector indI(n), indJ(m); + std::vector weights1(n), weights2(m); + Digraph di(n, m); + NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); + + // Set supply and demand, don't account for 0 values (faster) + + cur=0; + for (int i=0; i0) { + weights1[ cur ] = val; + indI[cur++]=i; + } + } + + // Demand is actually negative supply... 
+ + cur=0; + for (int i=0; i0) { + weights2[ cur ] = -val; + indJ[cur++]=i; + } + } + // Define the graph net.supplyMap(&weights1[0], n, &weights2[0], m); @@ -166,14 +190,17 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, int i = di.source(a); int j = di.target(a); double flow = net.flow(a); - *cost += flow * (*(D+indI[i]*n2+indJ[j-n])); - - *(G+cur) = flow; - *(iG+cur) = indI[i]; - *(jG+cur) = indJ[j]; - *(alpha + indI[i]) = -net.potential(i); - *(beta + indJ[j-n]) = net.potential(j); - cur++; + if (flow>0) + { + *cost += flow * (*(D+indI[i]*n2+indJ[j-n])); + + *(G+cur) = flow; + *(iG+cur) = indI[i]; + *(jG+cur) = indJ[j-n]; + *(alpha + indI[i]) = -net.potential(i); + *(beta + indJ[j-n]) = net.potential(j); + cur++; + } } } diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 345cb66..f183995 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -111,7 +111,7 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod jG=np.zeros(nmax,dtype=np.int) - result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, G.data, alpha.data, beta.data, &cost, max_iter) + result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, alpha.data, beta.data, &cost, max_iter) return Gv, iG, jG, cost, alpha, beta, result_code diff --git a/test/test_ot.py b/test/test_ot.py index dacae0a..4d59e12 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -118,6 +118,26 @@ def test_emd_empty(): np.testing.assert_allclose(w, 0) +def test_emd_sparse(): + + n = 100 + rng = np.random.RandomState(0) + + x = rng.randn(n, 2) + x2 = rng.randn(n, 2) + u = ot.utils.unif(n) + + M = ot.dist(x, x2) + + G = ot.emd([], [], M) + + Gs = ot.emd([], [], M, sparse=True) + + # check G is the same + np.testing.assert_allclose(G, Gs.todense()) + # check constraints + + def test_emd2_multi(): n = 500 # nb bins -- cgit v1.2.3 From a6a654de5e78dd388a793fbd26f60045b05d519c Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 11:31:32 +0100 Subject: proper documentation and parameter --- ot/lp/EMD.h | 2 +- ot/lp/EMD_wrapper.cpp | 3 ++- ot/lp/__init__.py | 16 ++++++++++++++-- ot/lp/emd_wrap.pyx | 10 ++++++---- test/test_ot.py | 2 +- 5 files changed, 24 insertions(+), 9 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index bc513d2..9896091 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -33,7 +33,7 @@ enum ProblemType { int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter); int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter); #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 2aa44c1..9be2cdc 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -108,7 +108,7 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) { // beware M and C anre strored in row major C style!!! 
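On the Python side, the (Gv, iG, jG) triplet filled by this wrapper is truncated to its first nG entries (see the emd_wrap.pyx change below) and assembled into a scipy coo_matrix, as in the ot/lp/__init__.py code above. A small sketch with made-up values:

import numpy as np
from scipy.sparse import coo_matrix

# hypothetical triplet for a 3x3 problem (at most n1+n2-1 non-zero flows)
Gv = np.array([0.2, 0.3, 0.5])    # flow values
iG = np.array([0, 1, 2])          # source indices
jG = np.array([1, 0, 2])          # target indices

G = coo_matrix((Gv, (iG, jG)), shape=(3, 3))
print(G.toarray())
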
@@ -202,6 +202,7 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, cur++; } } + *nG=cur; // nb of value +1 for numpy indexing } diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 4fec7d9..d476071 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -27,7 +27,7 @@ __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False, sparse=False): +def emd(a, b, M, numItermax=100000, log=False, dense=True): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -62,6 +62,10 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Returns ------- @@ -103,6 +107,8 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) + sparse= not dense + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] @@ -128,7 +134,7 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, sparse=False, return_matrix=False): + numItermax=100000, log=False, dense=True, return_matrix=False): r"""Solves the Earth Movers distance problem and returns the loss .. math:: @@ -166,6 +172,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), variables. Otherwise returns only the optimal transportation cost. return_matrix: boolean, optional (default=False) If True, returns the optimal transportation matrix in the log. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Returns ------- @@ -207,6 +217,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) + sparse=not dense + # problem with pikling Forks if sys.platform.endswith('win32'): processes=1 diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index f183995..4b6cdce 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -21,7 +21,7 @@ import warnings cdef extern from "EMD.h": int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED @@ -75,7 +75,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod max_iter : int The maximum number of iterations before stopping the optimization algorithm if it has not converged. 
- + sparse : bool + Returning a sparse transport matrix if set to True Returns ------- @@ -87,6 +88,7 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod cdef int n2= M.shape[1] cdef int nmax=n1+n2-1 cdef int result_code = 0 + cdef int nG=0 cdef double cost=0 cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1) @@ -111,10 +113,10 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod jG=np.zeros(nmax,dtype=np.int) - result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, alpha.data, beta.data, &cost, max_iter) + result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, &nG, alpha.data, beta.data, &cost, max_iter) - return Gv, iG, jG, cost, alpha, beta, result_code + return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code else: diff --git a/test/test_ot.py b/test/test_ot.py index 4d59e12..7b44fd1 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -131,7 +131,7 @@ def test_emd_sparse(): G = ot.emd([], [], M) - Gs = ot.emd([], [], M, sparse=True) + Gs = ot.emd([], [], M, dense=False) # check G is the same np.testing.assert_allclose(G, Gs.todense()) -- cgit v1.2.3 From a4afee871d8e9d5db68228d1ed5bf4853eedc294 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Tue, 3 Dec 2019 15:20:16 +0100 Subject: first implemntation sparse loss --- ot/lp/EMD.h | 5 +++ ot/lp/EMD_wrapper.cpp | 78 ++++++++++++++++++++++++++++++++++++++++++ ot/lp/emd_wrap.pyx | 4 +++ ot/lp/network_simplex_simple.h | 2 +- 4 files changed, 88 insertions(+), 1 deletion(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index 9896091..fc94211 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -36,4 +36,9 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter); +int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, + long *iD, long *jD, double *D, long nD, + long *iG, long *jG, double *G, long * nG, + double* alpha, double* beta, double *cost, int maxIter); + #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 9be2cdc..28e4af2 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -210,3 +210,81 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, return ret; } +int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, + long *iD, long *jD, double *D, long nD, + long *iG, long *jG, double *G, long * nG, + double* alpha, double* beta, double *cost, int maxIter) { + // beware M and C anre strored in row major C style!!! + + // Get the number of non zero coordinates for r and c and vectors + int n, m, cur; + + typedef FullBipartiteDigraph Digraph; + DIGRAPH_TYPEDEFS(FullBipartiteDigraph); + + n=n1; + m=n2; + + + // Define the graph + + + std::vector weights2(m); + Digraph di(n, m); + NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); + + // Set supply and demand, don't account for 0 values (faster) + + + // Demand is actually negative supply... 
+ + cur=0; + for (int i=0; i0) { + weights2[ cur ] = -val; + } + } + + // Define the graph + net.supplyMap(X, n, &weights2[0], m); + + // Set the cost of each edge + for (int k=0; k0) + { + + *(G+cur) = flow; + *(iG+cur) = i; + *(jG+cur) = j-n; + *(alpha + i) = -net.potential(i); + *(beta + j-n) = net.potential(j); + cur++; + } + } + *nG=cur; // nb of value +1 for numpy indexing + + } + + + return ret; +} + diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 4b6cdce..4e3586d 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -23,6 +23,10 @@ cdef extern from "EMD.h": int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) + int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, + long *iD, long *jD, double *D, long nD, + long *iG, long *jG, double *G, long * nG, + double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h index 7c6a4ce..498e921 100644 --- a/ot/lp/network_simplex_simple.h +++ b/ot/lp/network_simplex_simple.h @@ -686,7 +686,7 @@ namespace lemon { /// \see resetParams(), reset() ProblemType run() { #if DEBUG_LVL>0 - std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED\n"; + std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED << "\n" ; #endif if (!init()) return INFEASIBLE; -- cgit v1.2.3 From 92233f79e098f1930248d815e66c0a929508af59 Mon Sep 17 00:00:00 2001 From: Kilian Date: Mon, 9 Dec 2019 15:56:48 +0100 Subject: add assert for emd dimension mismatch --- ot/lp/__init__.py | 6 ++++++ test/test_ot.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 0c92810..f77c3d7 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -109,6 +109,9 @@ def emd(a, b, M, numItermax=100000, log=False): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) result_code_string = check_result(result_code) if log: @@ -212,6 +215,9 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + if log or return_matrix: def f(b): G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) diff --git a/test/test_ot.py b/test/test_ot.py index dacae0a..1343604 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -14,6 +14,22 @@ from ot.datasets import make_1D_gauss as gauss import pytest +def test_emd_dimension_mismatch(): + # test emd and emd2 for simple identity + n_samples = 100 + n_features = 2 + rng = np.random.RandomState(0) + + x = rng.randn(n_samples, n_features) + a = ot.utils.unif(n_samples + 1) + + M = ot.dist(x, x) + + np.testing.assert_raises(AssertionError, emd, a, a, M) + + np.testing.assert_raises(AssertionError, emd2, a, a, M) + + def test_emd_emd2(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From 
a9bbc2cfdffd22ceee3256102e470df6c25338f3 Mon Sep 17 00:00:00 2001 From: Kilian Date: Tue, 10 Dec 2019 11:23:50 +0100 Subject: change or in assert by and --- ot/lp/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index f77c3d7..4cce41c 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -109,7 +109,7 @@ def emd(a, b, M, numItermax=100000, log=False): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" G, cost, u, v, result_code = emd_c(a, b, M, numItermax) @@ -215,7 +215,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" if log or return_matrix: -- cgit v1.2.3 From 3cb03158c42dde141d6f33973ea6e3394b9dc3d4 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Wed, 18 Dec 2019 10:15:30 +0100 Subject: cleanup variable name dense --- ot/lp/__init__.py | 30 ++++++++++++++---------------- ot/lp/emd_wrap.pyx | 26 +++++++++++++------------- 2 files changed, 27 insertions(+), 29 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index d476071..bb9829a 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -107,7 +107,6 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - sparse= not dense # if empty array given then use uniform distributions if len(a) == 0: @@ -115,11 +114,11 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - if sparse: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) else: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) if log: @@ -217,8 +216,6 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - sparse=not dense - # problem with pikling Forks if sys.platform.endswith('win32'): processes=1 @@ -231,12 +228,11 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): - - if sparse: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) else: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) log = {} @@ -249,11 +245,13 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return [cost, log] else: def f(b): - if sparse: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, 
M, numItermax,sparse) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) else: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + result_code_string = check_result(result_code) check_result(result_code) return cost diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 4e3586d..636a9e3 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -46,7 +46,7 @@ def check_result(result_code): @cython.boundscheck(False) @cython.wraparound(False) -def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint sparse): +def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense): """ Solves the Earth Movers distance problem and returns the optimal transport matrix @@ -110,8 +110,19 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 - if sparse: + if dense: + # init OT matrix + G=np.zeros([n1, n2]) + + # calling the function + result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) + + return G, cost, alpha, beta, result_code + + else: + + # init sparse OT matrix Gv=np.zeros(nmax) iG=np.zeros(nmax,dtype=np.int) jG=np.zeros(nmax,dtype=np.int) @@ -123,17 +134,6 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code - else: - - - G=np.zeros([n1, n2]) - - - # calling the function - result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) - - return G, cost, alpha, beta, result_code - @cython.boundscheck(False) @cython.wraparound(False) -- cgit v1.2.3 From fc6afbd19e75e10b539062e012583c283750d9f6 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Wed, 18 Dec 2019 10:48:58 +0100 Subject: correct documentation in pyx file --- ot/lp/emd_wrap.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 636a9e3..3220d12 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -79,8 +79,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod max_iter : int The maximum number of iterations before stopping the optimization algorithm if it has not converged. 
- sparse : bool - Returning a sparse transport matrix if set to True + dense : bool + Return a sparse transport matrix if set to False Returns ------- -- cgit v1.2.3 From e9954bbd959be41c59136cf921fbf094b127eb4e Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Wed, 18 Dec 2019 12:56:55 +0100 Subject: cleanup emd.h and pyx file --- ot/lp/EMD.h | 4 ---- ot/lp/emd_wrap.pyx | 4 ---- 2 files changed, 8 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index fc94211..2adaace 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -36,9 +36,5 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter); -int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, - long *iD, long *jD, double *D, long nD, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter); #endif diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 3220d12..c0d7128 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -23,10 +23,6 @@ cdef extern from "EMD.h": int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) - int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, - long *iD, long *jD, double *D, long nD, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED -- cgit v1.2.3 From e5196fa7a8c493b831fd5dac52a89bbf29e7b0e6 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 27 Jan 2020 10:40:05 +0100 Subject: correct bug in emd emd2 still todo --- ot/lp/__init__.py | 194 +++++++++++++++++++++++++++++++++++++++++++++++------ ot/lp/emd_wrap.pyx | 2 + 2 files changed, 174 insertions(+), 22 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index eabdd3a..a771ce4 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -23,11 +23,150 @@ from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', - 'emd_1d', 'emd2_1d', 'wasserstein_1d'] +__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', + 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False, dense=True): +def center_ot_dual(alpha0, beta0, a=None, b=None): + r"""Center dual OT potentials wrt theirs weights + + The main idea of this function is to find unique dual potentials + that ensure some kind of centering/fairness. It will help have + stability when multiple calling of the OT solver with small changes. + + Basically we add another constraint to the potential that will not + change the objective value but will ensure unicity. The constraint + is the following: + + .. math:: + \alpha^T a= \beta^T b + + in addition to the OT problem constraints. + + since :math:`\sum_i a_i=\sum_j b_j` this can be solved by adding/removing + a constant from both :math:`\alpha_0` and :math:`\beta_0`. + + .. 
math:: + c=\frac{\beta0^T b-\alpha_0^T a}{1^Tb+1^Ta} + + \alpha=\alpha_0+c + + \beta=\beta0+c + + Parameters + ---------- + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + + Returns + ------- + alpha : (ns,) numpy.ndarray, float64 + Source centered dual potential + beta : (nt,) numpy.ndarray, float64 + Target centered dual potential + + """ + # if no weights are provided, use uniform + if a is None: + a = np.ones(alpha0.shape[0]) / alpha0.shape[0] + if b is None: + b = np.ones(beta0.shape[0]) / beta0.shape[0] + + # compute constant that balances the weighted sums of the duals + c = (b.dot(beta0) - a.dot(alpha0)) / (a.sum() + b.sum()) + + # update duals + alpha = alpha0 + c + beta = beta0 - c + + return alpha, beta + + +def estimate_dual_null_weights(alpha0, beta0, a, b, M): + r"""Estimate feasible values for 0-weighted dual potentials + + The feasible values are computed efficiently bjt rather coarsely. + First we compute the constraints violations: + + .. math:: + V=\alpha+\beta^T-M + + Next we compute the max amount of violation per row (alpha) and + columns (beta) + + .. math:: + v^a_i=\max_j V_{i,j} + + v^b_j=\max_i V_{i,j} + + Finally we update the dual potential with 0 weights if a + constraint is violated + + .. math:: + \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0 + + \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 + + In the end the dual potential are centred using function + :ref:`center_ot_dual`. + + Note that all those updates do not change the objective value of the + solution but provide dual potential that do not violate the constraints. + + Parameters + ---------- + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) + + Returns + ------- + alpha : (ns,) numpy.ndarray, float64 + Source corrected dual potential + beta : (nt,) numpy.ndarray, float64 + Target corrected dual potential + + """ + + # binary indexing of non-zeros weights + asel = a != 0 + bsel = b != 0 + + # compute dual constraints violation + Viol = alpha0[:, None] + beta0[None, :] - M + + # Compute worst violation per line and columns + aviol = np.max(Viol, 1) + bviol = np.max(Viol, 0) + + # update corrects violation of + alpha_up = -1 * ~asel * np.maximum(aviol, 0) + beta_up = -1 * ~bsel * np.maximum(bviol, 0) + + alpha = alpha0 + alpha_up + beta = beta0 + beta_up + + return center_ot_dual(alpha, beta, a, b) + + +def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -43,7 +182,7 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): - a and b are the sample weights .. warning:: - Note that the M matrix needs to be a C-order numpy.array in float64 + Note that the M matrix needs to be a C-order numpy.array in float64 format. 
Uses the algorithm proposed in [1]_ @@ -66,6 +205,9 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` format. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. Returns ------- @@ -107,7 +249,6 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] @@ -117,11 +258,21 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" + asel = a != 0 + bsel = b != 0 + if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) result_code_string = check_result(result_code) if log: @@ -151,7 +302,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), - a and b are the sample weights .. warning:: - Note that the M matrix needs to be a C-order numpy.array in float64 + Note that the M matrix needs to be a C-order numpy.array in float64 format. Uses the algorithm proposed in [1]_ @@ -177,7 +328,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` - format. + format. 
Returns ------- @@ -221,7 +372,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), # problem with pikling Forks if sys.platform.endswith('win32'): - processes=1 + processes = 1 # if empty array given then use uniform distributions if len(a) == 0: @@ -235,10 +386,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) log = {} @@ -252,10 +403,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), else: def f(b): if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) check_result(result_code) @@ -265,7 +416,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return f(b) nb = b.shape[1] - if processes>1: + if processes > 1: res = parmap(f, [b[:, i] for i in range(nb)], processes) else: res = list(map(f, [b[:, i].copy() for i in range(nb)])) @@ -273,7 +424,6 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return res - def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None): """ Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance) @@ -326,7 +476,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None k = X_init.shape[0] d = X_init.shape[1] if b is None: - b = np.ones((k,))/k + b = np.ones((k,)) / k if weights is None: weights = np.ones((N,)) / N @@ -337,7 +487,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None displacement_square_norm = stopThr + 1. - while ( displacement_square_norm > stopThr and iter_count < numItermax ): + while (displacement_square_norm > stopThr and iter_count < numItermax): T_sum = np.zeros((k, d)) @@ -347,7 +497,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None T_i = emd(b, measure_weights_i, M_i) T_sum = T_sum + weight_i * np.reshape(1. 
/ b, (-1, 1)) * np.matmul(T_i, measure_locations_i) - displacement_square_norm = np.sum(np.square(T_sum-X)) + displacement_square_norm = np.sum(np.square(T_sum - X)) if log: displacement_square_norms.append(displacement_square_norm) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index c0d7128..a4987f4 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -40,6 +40,8 @@ def check_result(result_code): return message + + @cython.boundscheck(False) @cython.wraparound(False) def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense): -- cgit v1.2.3 From 9a9b3547837eac56349ce8df92bb5b0565daa2d6 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 27 Jan 2020 10:59:58 +0100 Subject: correct emd2 and add centering for dual potentials --- ot/lp/__init__.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index a771ce4..aa3166f 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -264,6 +264,9 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): if dense: G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + if np.any(~asel) or np.any(~bsel): u, v = estimate_dual_null_weights(u, v, a, b, M) @@ -271,6 +274,9 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + if np.any(~asel) or np.any(~bsel): u, v = estimate_dual_null_weights(u, v, a, b, M) @@ -287,7 +293,8 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, dense=True, return_matrix=False): + numItermax=100000, log=False, dense=True, return_matrix=False, + center_dual=True): r"""Solves the Earth Movers distance problem and returns the loss .. math:: @@ -329,6 +336,9 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` format. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. 
Returns ------- @@ -383,14 +393,23 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" + asel = a != 0 + if log or return_matrix: def f(b): + bsel = b != 0 if dense: G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + result_code_string = check_result(result_code) log = {} if return_matrix: @@ -402,12 +421,19 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return [cost, log] else: def f(b): + bsel = b != 0 if dense: G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + result_code_string = check_result(result_code) check_result(result_code) return cost -- cgit v1.2.3 From f65073faa73b36280a19ff8b9c383e66f8bdbd2b Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Thu, 30 Jan 2020 08:04:36 +0100 Subject: comlete documentation --- ot/lp/__init__.py | 30 +++++++++++++++++++----------- ot/lp/emd_wrap.pyx | 6 ++++++ test/test_ot.py | 4 ++-- 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'ot/lp') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index aa3166f..cdd505d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -28,10 +28,10 @@ __all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', def center_ot_dual(alpha0, beta0, a=None, b=None): - r"""Center dual OT potentials wrt theirs weights + r"""Center dual OT potentials w.r.t. theirs weights The main idea of this function is to find unique dual potentials - that ensure some kind of centering/fairness. It will help have + that ensure some kind of centering/fairness. The main idea is to find dual potentials that lead to the same final objective value for both source and targets (see below for more details). It will help having stability when multiple calling of the OT solver with small changes. Basically we add another constraint to the potential that will not @@ -91,7 +91,15 @@ def center_ot_dual(alpha0, beta0, a=None, b=None): def estimate_dual_null_weights(alpha0, beta0, a, b, M): r"""Estimate feasible values for 0-weighted dual potentials - The feasible values are computed efficiently bjt rather coarsely. + The feasible values are computed efficiently but rather coarsely. + + .. warning:: + This function is necessary because the C++ solver in emd_c + discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + First we compute the constraints violations: .. math:: @@ -113,11 +121,11 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 - In the end the dual potential are centred using function + In the end the dual potentials are centered using function :ref:`center_ot_dual`. 
Note that all those updates do not change the objective value of the - solution but provide dual potential that do not violate the constraints. + solution but provide dual potentials that do not violate the constraints. Parameters ---------- @@ -130,9 +138,9 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): beta0 : (nt,) numpy.ndarray, float64 Target dual potential a : (ns,) numpy.ndarray, float64 - Source histogram (uniform weight if empty list) + Source distribution (uniform weights if empty list) b : (nt,) numpy.ndarray, float64 - Target histogram (uniform weight if empty list) + Target distribution (uniform weights if empty list) M : (ns,nt) numpy.ndarray, float64 Loss matrix (c-order array with type float64) @@ -150,11 +158,11 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): bsel = b != 0 # compute dual constraints violation - Viol = alpha0[:, None] + beta0[None, :] - M + constraint_violation = alpha0[:, None] + beta0[None, :] - M - # Compute worst violation per line and columns - aviol = np.max(Viol, 1) - bviol = np.max(Viol, 0) + # Compute largest violation per line and columns + aviol = np.max(constraint_violation, 1) + bviol = np.max(constraint_violation, 0) # update corrects violation of alpha_up = -1 * ~asel * np.maximum(aviol, 0) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index a4987f4..d345fd4 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -66,6 +66,12 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod .. warning:: Note that the M matrix needs to be a C-order :py.cls:`numpy.array` + .. warning:: + The C++ solver discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + Parameters ---------- a : (ns,) numpy.ndarray, float64 diff --git a/test/test_ot.py b/test/test_ot.py index 245a107..47df946 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -338,9 +338,9 @@ def test_dual_variables(): np.testing.assert_almost_equal(cost1, log['cost']) check_duality_gap(a, b, M, G, log['u'], log['v'], log['cost']) - viol = log['u'][:, None] + log['v'][None, :] - M + constraint_violation = log['u'][:, None] + log['v'][None, :] - M - assert viol.max() < 1e-8 + assert constraint_violation.max() < 1e-8 def check_duality_gap(a, b, M, G, u, v, cost): -- cgit v1.2.3
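
The last few commits above introduce a post-processing step for the dual potentials returned by emd_c: a constant shift that centers them (center_ot_dual) and a coarse feasibility correction for samples with zero weight (estimate_dual_null_weights), which the C++ solver silently drops. For readers following the patches, below is a small self-contained NumPy sketch of that logic, transcribed from the formulas in the new docstrings. The helper names center_duals and fix_null_weight_duals and the toy data in the __main__ block are illustrative only, not part of the POT API.

import numpy as np


def center_duals(alpha0, beta0, a, b):
    # Shift by a constant c so that alpha^T a == beta^T b; the OT objective
    # value is unchanged because sum(a) == sum(b).
    c = (b.dot(beta0) - a.dot(alpha0)) / (a.sum() + b.sum())
    return alpha0 + c, beta0 - c


def fix_null_weight_duals(alpha0, beta0, a, b, M):
    # Coarse feasibility fix for the samples the C++ solver discarded
    # (zero weights): lower each offending potential by its worst
    # constraint violation, then re-center.
    asel = a != 0
    bsel = b != 0
    violation = alpha0[:, None] + beta0[None, :] - M   # > 0 means infeasible
    aviol = np.max(violation, axis=1)                  # worst violation per row
    bviol = np.max(violation, axis=0)                  # worst violation per column
    alpha = alpha0 - (~asel) * np.maximum(aviol, 0.0)  # only touch zero-weight rows
    beta = beta0 - (~bsel) * np.maximum(bviol, 0.0)    # only touch zero-weight columns
    return center_duals(alpha, beta, a, b)


if __name__ == "__main__":
    rng = np.random.RandomState(0)
    a = np.array([0.5, 0.5, 0.0])        # third source sample has zero weight
    b = np.array([0.5, 0.5])
    M = rng.rand(3, 2)
    u0, v0 = rng.rand(3), rng.rand(2)    # stand-ins for the solver's potentials
    u, v = fix_null_weight_duals(u0, v0, a, b, M)
    assert abs(u.dot(a) - v.dot(b)) < 1e-9        # duals are centered
    assert (u[2] + v - M[2]).max() < 1e-9         # zero-weight row is now feasible

Subtracting the per-row/per-column worst violation leaves the primal solution and its cost untouched while restoring feasibility on the zero-weight samples, which is the guarantee stated in the docstrings added by these patches.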