From 4a6883e0ce2fd9f3edd374d54c4c219d876ceb76 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 09:37:54 +0100 Subject: nothing explodes and it compiles --- ot/lp/__init__.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 0c92810..4fec7d9 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -27,7 +27,7 @@ __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False): +def emd(a, b, M, numItermax=100000, log=False, sparse=False): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -109,7 +109,12 @@ def emd(a, b, M, numItermax=100000, log=False): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + if sparse: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + else: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + result_code_string = check_result(result_code) if log: log = {} @@ -123,7 +128,7 @@ def emd(a, b, M, numItermax=100000, log=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, return_matrix=False): + numItermax=100000, log=False, sparse=False, return_matrix=False): r"""Solves the Earth Movers distance problem and returns the loss .. math:: @@ -214,19 +219,29 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): - G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) - result_code_string = check_result(resultCode) + + if sparse: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + else: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + + result_code_string = check_result(result_code) log = {} if return_matrix: log['G'] = G log['u'] = u log['v'] = v log['warning'] = result_code_string - log['result_code'] = resultCode + log['result_code'] = result_code return [cost, log] else: def f(b): - G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + if sparse: + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + else: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) check_result(result_code) return cost -- cgit v1.2.3 From a6a654de5e78dd388a793fbd26f60045b05d519c Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 11:31:32 +0100 Subject: proper documentation and parameter --- ot/lp/EMD.h | 2 +- ot/lp/EMD_wrapper.cpp | 3 ++- ot/lp/__init__.py | 16 ++++++++++++++-- ot/lp/emd_wrap.pyx | 10 ++++++---- test/test_ot.py | 2 +- 5 files changed, 24 insertions(+), 9 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index bc513d2..9896091 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -33,7 +33,7 @@ enum ProblemType { int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter); int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter); #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 2aa44c1..9be2cdc 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -108,7 +108,7 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) { // beware M and C anre strored in row major C style!!! @@ -202,6 +202,7 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, cur++; } } + *nG=cur; // nb of value +1 for numpy indexing } diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 4fec7d9..d476071 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -27,7 +27,7 @@ __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False, sparse=False): +def emd(a, b, M, numItermax=100000, log=False, dense=True): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -62,6 +62,10 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Returns ------- @@ -103,6 +107,8 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) + sparse= not dense + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] @@ -128,7 +134,7 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, sparse=False, return_matrix=False): + numItermax=100000, log=False, dense=True, return_matrix=False): r"""Solves the Earth Movers distance problem and returns the loss .. math:: @@ -166,6 +172,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), variables. Otherwise returns only the optimal transportation cost. return_matrix: boolean, optional (default=False) If True, returns the optimal transportation matrix in the log. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Returns ------- @@ -207,6 +217,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) + sparse=not dense + # problem with pikling Forks if sys.platform.endswith('win32'): processes=1 diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index f183995..4b6cdce 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -21,7 +21,7 @@ import warnings cdef extern from "EMD.h": int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED @@ -75,7 +75,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod max_iter : int The maximum number of iterations before stopping the optimization algorithm if it has not converged. - + sparse : bool + Returning a sparse transport matrix if set to True Returns ------- @@ -87,6 +88,7 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod cdef int n2= M.shape[1] cdef int nmax=n1+n2-1 cdef int result_code = 0 + cdef int nG=0 cdef double cost=0 cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1) @@ -111,10 +113,10 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod jG=np.zeros(nmax,dtype=np.int) - result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, alpha.data, beta.data, &cost, max_iter) + result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, &nG, alpha.data, beta.data, &cost, max_iter) - return Gv, iG, jG, cost, alpha, beta, result_code + return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code else: diff --git a/test/test_ot.py b/test/test_ot.py index 4d59e12..7b44fd1 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -131,7 +131,7 @@ def test_emd_sparse(): G = ot.emd([], [], M) - Gs = ot.emd([], [], M, sparse=True) + Gs = ot.emd([], [], M, dense=False) # check G is the same np.testing.assert_allclose(G, Gs.todense()) -- cgit v1.2.3 From 92233f79e098f1930248d815e66c0a929508af59 Mon Sep 17 00:00:00 2001 From: Kilian Date: Mon, 9 Dec 2019 15:56:48 +0100 Subject: add assert for emd dimension mismatch --- ot/lp/__init__.py | 6 ++++++ test/test_ot.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 0c92810..f77c3d7 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -109,6 +109,9 @@ def emd(a, b, M, numItermax=100000, log=False): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) result_code_string = check_result(result_code) if log: @@ -212,6 +215,9 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + if log or return_matrix: def f(b): G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) diff --git a/test/test_ot.py b/test/test_ot.py index dacae0a..1343604 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -14,6 +14,22 @@ from ot.datasets import make_1D_gauss as gauss import pytest +def test_emd_dimension_mismatch(): + # test emd and emd2 for simple identity + n_samples = 100 + n_features = 2 + rng = np.random.RandomState(0) + + x = rng.randn(n_samples, n_features) + a = ot.utils.unif(n_samples + 1) + + M = ot.dist(x, x) + + np.testing.assert_raises(AssertionError, emd, a, a, M) + + np.testing.assert_raises(AssertionError, emd2, a, a, M) + + def test_emd_emd2(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From a9bbc2cfdffd22ceee3256102e470df6c25338f3 Mon Sep 17 00:00:00 2001 From: Kilian Date: Tue, 10 Dec 2019 11:23:50 +0100 Subject: change or in assert by and --- ot/lp/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index f77c3d7..4cce41c 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -109,7 +109,7 @@ def emd(a, b, M, numItermax=100000, log=False): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" G, cost, u, v, result_code = emd_c(a, b, M, numItermax) @@ -215,7 +215,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" if log or return_matrix: -- cgit v1.2.3 From 3cb03158c42dde141d6f33973ea6e3394b9dc3d4 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Wed, 18 Dec 2019 10:15:30 +0100 Subject: cleanup variable name dense --- ot/lp/__init__.py | 30 ++++++++++++++---------------- ot/lp/emd_wrap.pyx | 26 +++++++++++++------------- 2 files changed, 27 insertions(+), 29 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index d476071..bb9829a 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -107,7 +107,6 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - sparse= not dense # if empty array given then use uniform distributions if len(a) == 0: @@ -115,11 +114,11 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] - if sparse: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) else: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) if log: @@ -217,8 +216,6 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - sparse=not dense - # problem with pikling Forks if sys.platform.endswith('win32'): processes=1 @@ -231,12 +228,11 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): - - if sparse: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) else: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) log = {} @@ -249,11 +245,13 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return [cost, log] else: def f(b): - if sparse: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if dense: + G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) else: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,sparse) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + result_code_string = check_result(result_code) check_result(result_code) return cost diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index 4e3586d..636a9e3 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -46,7 +46,7 @@ def check_result(result_code): @cython.boundscheck(False) @cython.wraparound(False) -def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint sparse): +def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense): """ Solves the Earth Movers distance problem and returns the optimal transport matrix @@ -110,8 +110,19 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 - if sparse: + if dense: + # init OT matrix + G=np.zeros([n1, n2]) + + # calling the function + result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) + + return G, cost, alpha, beta, result_code + + else: + + # init sparse OT matrix Gv=np.zeros(nmax) iG=np.zeros(nmax,dtype=np.int) jG=np.zeros(nmax,dtype=np.int) @@ -123,17 +134,6 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code - else: - - - G=np.zeros([n1, n2]) - - - # calling the function - result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) - - return G, cost, alpha, beta, result_code - @cython.boundscheck(False) @cython.wraparound(False) -- cgit v1.2.3 From e5196fa7a8c493b831fd5dac52a89bbf29e7b0e6 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 27 Jan 2020 10:40:05 +0100 Subject: correct bug in emd emd2 still todo --- ot/lp/__init__.py | 194 +++++++++++++++++++++++++++++++++++++++++++++++------ ot/lp/emd_wrap.pyx | 2 + 2 files changed, 174 insertions(+), 22 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index eabdd3a..a771ce4 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -23,11 +23,150 @@ from ..utils import parmap from .cvx import barycenter from ..utils import dist -__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', - 'emd_1d', 'emd2_1d', 'wasserstein_1d'] +__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', + 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False, dense=True): +def center_ot_dual(alpha0, beta0, a=None, b=None): + r"""Center dual OT potentials wrt theirs weights + + The main idea of this function is to find unique dual potentials + that ensure some kind of centering/fairness. It will help have + stability when multiple calling of the OT solver with small changes. + + Basically we add another constraint to the potential that will not + change the objective value but will ensure unicity. The constraint + is the following: + + .. math:: + \alpha^T a= \beta^T b + + in addition to the OT problem constraints. + + since :math:`\sum_i a_i=\sum_j b_j` this can be solved by adding/removing + a constant from both :math:`\alpha_0` and :math:`\beta_0`. + + .. math:: + c=\frac{\beta0^T b-\alpha_0^T a}{1^Tb+1^Ta} + + \alpha=\alpha_0+c + + \beta=\beta0+c + + Parameters + ---------- + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + + Returns + ------- + alpha : (ns,) numpy.ndarray, float64 + Source centered dual potential + beta : (nt,) numpy.ndarray, float64 + Target centered dual potential + + """ + # if no weights are provided, use uniform + if a is None: + a = np.ones(alpha0.shape[0]) / alpha0.shape[0] + if b is None: + b = np.ones(beta0.shape[0]) / beta0.shape[0] + + # compute constant that balances the weighted sums of the duals + c = (b.dot(beta0) - a.dot(alpha0)) / (a.sum() + b.sum()) + + # update duals + alpha = alpha0 + c + beta = beta0 - c + + return alpha, beta + + +def estimate_dual_null_weights(alpha0, beta0, a, b, M): + r"""Estimate feasible values for 0-weighted dual potentials + + The feasible values are computed efficiently bjt rather coarsely. + First we compute the constraints violations: + + .. math:: + V=\alpha+\beta^T-M + + Next we compute the max amount of violation per row (alpha) and + columns (beta) + + .. math:: + v^a_i=\max_j V_{i,j} + + v^b_j=\max_i V_{i,j} + + Finally we update the dual potential with 0 weights if a + constraint is violated + + .. math:: + \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0 + + \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 + + In the end the dual potential are centred using function + :ref:`center_ot_dual`. + + Note that all those updates do not change the objective value of the + solution but provide dual potential that do not violate the constraints. + + Parameters + ---------- + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + alpha0 : (ns,) numpy.ndarray, float64 + Source dual potential + beta0 : (nt,) numpy.ndarray, float64 + Target dual potential + a : (ns,) numpy.ndarray, float64 + Source histogram (uniform weight if empty list) + b : (nt,) numpy.ndarray, float64 + Target histogram (uniform weight if empty list) + M : (ns,nt) numpy.ndarray, float64 + Loss matrix (c-order array with type float64) + + Returns + ------- + alpha : (ns,) numpy.ndarray, float64 + Source corrected dual potential + beta : (nt,) numpy.ndarray, float64 + Target corrected dual potential + + """ + + # binary indexing of non-zeros weights + asel = a != 0 + bsel = b != 0 + + # compute dual constraints violation + Viol = alpha0[:, None] + beta0[None, :] - M + + # Compute worst violation per line and columns + aviol = np.max(Viol, 1) + bviol = np.max(Viol, 0) + + # update corrects violation of + alpha_up = -1 * ~asel * np.maximum(aviol, 0) + beta_up = -1 * ~bsel * np.maximum(bviol, 0) + + alpha = alpha0 + alpha_up + beta = beta0 + beta_up + + return center_ot_dual(alpha, beta, a, b) + + +def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -43,7 +182,7 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): - a and b are the sample weights .. warning:: - Note that the M matrix needs to be a C-order numpy.array in float64 + Note that the M matrix needs to be a C-order numpy.array in float64 format. Uses the algorithm proposed in [1]_ @@ -66,6 +205,9 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` format. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. Returns ------- @@ -107,7 +249,6 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) - # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] @@ -117,11 +258,21 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True): assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" + asel = a != 0 + bsel = b != 0 + if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) result_code_string = check_result(result_code) if log: @@ -151,7 +302,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), - a and b are the sample weights .. warning:: - Note that the M matrix needs to be a C-order numpy.array in float64 + Note that the M matrix needs to be a C-order numpy.array in float64 format. Uses the algorithm proposed in [1]_ @@ -177,7 +328,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), dense: boolean, optional (default=True) If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` - format. + format. Returns ------- @@ -221,7 +372,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), # problem with pikling Forks if sys.platform.endswith('win32'): - processes=1 + processes = 1 # if empty array given then use uniform distributions if len(a) == 0: @@ -235,10 +386,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) log = {} @@ -252,10 +403,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), else: def f(b): if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) result_code_string = check_result(result_code) check_result(result_code) @@ -265,7 +416,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return f(b) nb = b.shape[1] - if processes>1: + if processes > 1: res = parmap(f, [b[:, i] for i in range(nb)], processes) else: res = list(map(f, [b[:, i].copy() for i in range(nb)])) @@ -273,7 +424,6 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return res - def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None): """ Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance) @@ -326,7 +476,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None k = X_init.shape[0] d = X_init.shape[1] if b is None: - b = np.ones((k,))/k + b = np.ones((k,)) / k if weights is None: weights = np.ones((N,)) / N @@ -337,7 +487,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None displacement_square_norm = stopThr + 1. - while ( displacement_square_norm > stopThr and iter_count < numItermax ): + while (displacement_square_norm > stopThr and iter_count < numItermax): T_sum = np.zeros((k, d)) @@ -347,7 +497,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None T_i = emd(b, measure_weights_i, M_i) T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i) - displacement_square_norm = np.sum(np.square(T_sum-X)) + displacement_square_norm = np.sum(np.square(T_sum - X)) if log: displacement_square_norms.append(displacement_square_norm) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index c0d7128..a4987f4 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -40,6 +40,8 @@ def check_result(result_code): return message + + @cython.boundscheck(False) @cython.wraparound(False) def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense): -- cgit v1.2.3 From 9a9b3547837eac56349ce8df92bb5b0565daa2d6 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 27 Jan 2020 10:59:58 +0100 Subject: correct emd2 and add centering for dual potentials --- ot/lp/__init__.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index a771ce4..aa3166f 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -264,6 +264,9 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): if dense: G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + if np.any(~asel) or np.any(~bsel): u, v = estimate_dual_null_weights(u, v, a, b, M) @@ -271,6 +274,9 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + if np.any(~asel) or np.any(~bsel): u, v = estimate_dual_null_weights(u, v, a, b, M) @@ -287,7 +293,8 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, dense=True, return_matrix=False): + numItermax=100000, log=False, dense=True, return_matrix=False, + center_dual=True): r"""Solves the Earth Movers distance problem and returns the loss .. math:: @@ -329,6 +336,9 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). Otherwise returns a sparse representation using scipy's `coo_matrix` format. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. Returns ------- @@ -383,14 +393,23 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ "Dimension mismatch, check dimensions of M with a and b" + asel = a != 0 + if log or return_matrix: def f(b): + bsel = b != 0 if dense: G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + result_code_string = check_result(result_code) log = {} if return_matrix: @@ -402,12 +421,19 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return [cost, log] else: def f(b): + bsel = b != 0 if dense: G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) else: Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + result_code_string = check_result(result_code) check_result(result_code) return cost -- cgit v1.2.3 From f65073faa73b36280a19ff8b9c383e66f8bdbd2b Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Thu, 30 Jan 2020 08:04:36 +0100 Subject: comlete documentation --- ot/lp/__init__.py | 30 +++++++++++++++++++----------- ot/lp/emd_wrap.pyx | 6 ++++++ test/test_ot.py | 4 ++-- 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index aa3166f..cdd505d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -28,10 +28,10 @@ __all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', def center_ot_dual(alpha0, beta0, a=None, b=None): - r"""Center dual OT potentials wrt theirs weights + r"""Center dual OT potentials w.r.t. theirs weights The main idea of this function is to find unique dual potentials - that ensure some kind of centering/fairness. It will help have + that ensure some kind of centering/fairness. The main idea is to find dual potentials that lead to the same final objective value for both source and targets (see below for more details). It will help having stability when multiple calling of the OT solver with small changes. Basically we add another constraint to the potential that will not @@ -91,7 +91,15 @@ def center_ot_dual(alpha0, beta0, a=None, b=None): def estimate_dual_null_weights(alpha0, beta0, a, b, M): r"""Estimate feasible values for 0-weighted dual potentials - The feasible values are computed efficiently bjt rather coarsely. + The feasible values are computed efficiently but rather coarsely. + + .. warning:: + This function is necessary because the C++ solver in emd_c + discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + First we compute the constraints violations: .. math:: @@ -113,11 +121,11 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 - In the end the dual potential are centred using function + In the end the dual potentials are centered using function :ref:`center_ot_dual`. Note that all those updates do not change the objective value of the - solution but provide dual potential that do not violate the constraints. + solution but provide dual potentials that do not violate the constraints. Parameters ---------- @@ -130,9 +138,9 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): beta0 : (nt,) numpy.ndarray, float64 Target dual potential a : (ns,) numpy.ndarray, float64 - Source histogram (uniform weight if empty list) + Source distribution (uniform weights if empty list) b : (nt,) numpy.ndarray, float64 - Target histogram (uniform weight if empty list) + Target distribution (uniform weights if empty list) M : (ns,nt) numpy.ndarray, float64 Loss matrix (c-order array with type float64) @@ -150,11 +158,11 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): bsel = b != 0 # compute dual constraints violation - Viol = alpha0[:, None] + beta0[None, :] - M + constraint_violation = alpha0[:, None] + beta0[None, :] - M - # Compute worst violation per line and columns - aviol = np.max(Viol, 1) - bviol = np.max(Viol, 0) + # Compute largest violation per line and columns + aviol = np.max(constraint_violation, 1) + bviol = np.max(constraint_violation, 0) # update corrects violation of alpha_up = -1 * ~asel * np.maximum(aviol, 0) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index a4987f4..d345fd4 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -66,6 +66,12 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod .. warning:: Note that the M matrix needs to be a C-order :py.cls:`numpy.array` + .. warning:: + The C++ solver discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + Parameters ---------- a : (ns,) numpy.ndarray, float64 diff --git a/test/test_ot.py b/test/test_ot.py index 245a107..47df946 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -338,9 +338,9 @@ def test_dual_variables(): np.testing.assert_almost_equal(cost1, log['cost']) check_duality_gap(a, b, M, G, log['u'], log['v'], log['cost']) - viol = log['u'][:, None] + log['v'][None, :] - M + constraint_violation = log['u'][:, None] + log['v'][None, :] - M - assert viol.max() < 1e-8 + assert constraint_violation.max() < 1e-8 def check_duality_gap(a, b, M, G, u, v, cost): -- cgit v1.2.3 From 592f933085d5b521a440eb91eccc283c43732170 Mon Sep 17 00:00:00 2001 From: AdrienCorenflos Date: Wed, 1 Apr 2020 12:14:42 +0100 Subject: Fix ordering --- ot/lp/__init__.py | 2 +- test/test_ot.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index cdd505d..4c968ca 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -656,7 +656,7 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, perm_a = np.argsort(x_a_1d) perm_b = np.argsort(x_b_1d) - G_sorted, indices, cost = emd_1d_sorted(a, b, + G_sorted, indices, cost = emd_1d_sorted(a[perm_a.flatten()], b[perm_b.flatten()], x_a_1d[perm_a], x_b_1d[perm_b], metric=metric, p=p) G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), diff --git a/test/test_ot.py b/test/test_ot.py index 47df946..7afdae3 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -91,6 +91,44 @@ def test_emd_1d_emd2_1d(): with pytest.raises(AssertionError): ot.emd_1d(u, v, [], []) +def test_emd_1d_emd2_1d_with_weights(): + + # test emd1d gives similar results as emd + n = 20 + m = 30 + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) + + w_u = rng.uniform(0., 1., n) + w_u = w_u / w_u.sum() + + w_v = rng.uniform(0., 1., m) + w_v = w_v / w_v.sum() + + M = ot.dist(u, v, metric='sqeuclidean') + + G, log = ot.emd(w_u, w_v, M, log=True) + wass = log["cost"] + G_1d, log = ot.emd_1d(u, v, w_u, w_v, metric='sqeuclidean', log=True) + wass1d = log["cost"] + wass1d_emd2 = ot.emd2_1d(u, v, w_u, w_v, metric='sqeuclidean', log=False) + wass1d_euc = ot.emd2_1d(u, v, w_u, w_v, metric='euclidean', log=False) + + # check loss is similar + np.testing.assert_allclose(wass, wass1d) + np.testing.assert_allclose(wass, wass1d_emd2) + + # check loss is similar to scipy's implementation for Euclidean metric + wass_sp = wasserstein_distance(u.reshape((-1,)), v.reshape((-1,))) + np.testing.assert_allclose(wass_sp, wass1d_euc) + + # check constraints + np.testing.assert_allclose(w_u, G.sum(1)) + np.testing.assert_allclose(w_v, G.sum(0)) + + + def test_wass_1d(): # test emd1d gives similar results as emd -- cgit v1.2.3 From 60943d00bab1682d6fac22b1e1ba5e64569b4e78 Mon Sep 17 00:00:00 2001 From: AdrienCorenflos Date: Thu, 2 Apr 2020 10:41:24 +0100 Subject: Auto PEP8 --- ot/lp/__init__.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 4c968ca..1922785 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -12,16 +12,16 @@ Solvers for the original linear program OT problem import multiprocessing import sys + import numpy as np from scipy.sparse import coo_matrix -from .import cvx - +from . import cvx +from .cvx import barycenter # import compiled emd from .emd_wrap import emd_c, check_result, emd_1d_sorted -from ..utils import parmap -from .cvx import barycenter from ..utils import dist +from ..utils import parmap __all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d', 'emd2_1d', 'wasserstein_1d'] @@ -458,7 +458,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return res -def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None): +def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, + stopThr=1e-7, verbose=False, log=None): """ Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance) @@ -525,8 +526,8 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None T_sum = np.zeros((k, d)) - for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights, weights.tolist()): - + for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights, + weights.tolist()): M_i = dist(X, measure_locations_i) T_i = emd(b, measure_weights_i, M_i) T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i) @@ -651,8 +652,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, if b.ndim == 0 or len(b) == 0: b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0] - x_a_1d = x_a.reshape((-1, )) - x_b_1d = x_b.reshape((-1, )) + x_a_1d = x_a.reshape((-1,)) + x_b_1d = x_b.reshape((-1,)) perm_a = np.argsort(x_a_1d) perm_b = np.argsort(x_b_1d) -- cgit v1.2.3 From a9e69509412338920142c0615a50bc00739144d0 Mon Sep 17 00:00:00 2001 From: AdrienCorenflos Date: Thu, 2 Apr 2020 11:11:16 +0100 Subject: Remove flatten, it's not useful. --- ot/lp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 1922785..f4f6861 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -657,7 +657,7 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, perm_a = np.argsort(x_a_1d) perm_b = np.argsort(x_b_1d) - G_sorted, indices, cost = emd_1d_sorted(a[perm_a.flatten()], b[perm_b.flatten()], + G_sorted, indices, cost = emd_1d_sorted(a[perm_a], b[perm_b], x_a_1d[perm_a], x_b_1d[perm_b], metric=metric, p=p) G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), -- cgit v1.2.3 From 9200af5d795517b0772c10bb3d16022dd1a12791 Mon Sep 17 00:00:00 2001 From: ievred Date: Thu, 2 Apr 2020 15:29:12 +0200 Subject: laplace v1 --- ot/bregman.py | 72 +++++++++++++++++++++++++++++++++---------------------- ot/datasets.py | 4 ++-- ot/lp/__init__.py | 4 +--- ot/plot.py | 3 ++- 4 files changed, 49 insertions(+), 34 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/bregman.py b/ot/bregman.py index fb959e9..951d3ce 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -19,6 +19,7 @@ import warnings from .utils import unif, dist from scipy.optimize import fmin_l_bfgs_b + def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, stopThr=1e-9, verbose=False, log=False, **kwargs): r""" @@ -539,12 +540,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, old_v = v[i_2] v[i_2] = b[i_2] / (K[:, i_2].T.dot(u)) G[:, i_2] = u * K[:, i_2] * v[i_2] - #aviol = (G@one_m - a) - #aviol_2 = (G.T@one_n - b) + # aviol = (G@one_m - a) + # aviol_2 = (G.T@one_n - b) viol += (-old_v + v[i_2]) * K[:, i_2] * u viol_2[i_2] = v[i_2] * K[:, i_2].dot(u) - b[i_2] - #print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2))) + # print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2))) if stopThr_val <= stopThr: break @@ -715,7 +716,7 @@ def sinkhorn_stabilized(a, b, M, reg, numItermax=1000, tau=1e3, stopThr=1e-9, if np.abs(u).max() > tau or np.abs(v).max() > tau: if n_hists: alpha, beta = alpha + reg * \ - np.max(np.log(u), 1), beta + reg * np.max(np.log(v)) + np.max(np.log(u), 1), beta + reg * np.max(np.log(v)) else: alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v) if n_hists: @@ -940,7 +941,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, # the 10th iterations transp = G err = np.linalg.norm( - (np.sum(transp, axis=0) - b))**2 + np.linalg.norm((np.sum(transp, axis=1) - a))**2 + (np.sum(transp, axis=0) - b)) ** 2 + np.linalg.norm((np.sum(transp, axis=1) - a)) ** 2 if log: log['err'].append(err) @@ -966,7 +967,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, def geometricBar(weights, alldistribT): """return the weighted geometric mean of distributions""" - assert(len(weights) == alldistribT.shape[1]) + assert (len(weights) == alldistribT.shape[1]) return np.exp(np.dot(np.log(alldistribT), weights.T)) @@ -1108,7 +1109,7 @@ def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000, if weights is None: weights = np.ones(A.shape[1]) / A.shape[1] else: - assert(len(weights) == A.shape[1]) + assert (len(weights) == A.shape[1]) if log: log = {'err': []} @@ -1206,7 +1207,7 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000, if weights is None: weights = np.ones(n_hists) / n_hists else: - assert(len(weights) == A.shape[1]) + assert (len(weights) == A.shape[1]) if log: log = {'err': []} @@ -1334,7 +1335,7 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, if weights is None: weights = np.ones(A.shape[0]) / A.shape[0] else: - assert(len(weights) == A.shape[0]) + assert (len(weights) == A.shape[0]) if log: log = {'err': []} @@ -1350,11 +1351,11 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, # this is equivalent to blurring on horizontal then vertical directions t = np.linspace(0, 1, A.shape[1]) [Y, X] = np.meshgrid(t, t) - xi1 = np.exp(-(X - Y)**2 / reg) + xi1 = np.exp(-(X - Y) ** 2 / reg) t = np.linspace(0, 1, A.shape[2]) [Y, X] = np.meshgrid(t, t) - xi2 = np.exp(-(X - Y)**2 / reg) + xi2 = np.exp(-(X - Y) ** 2 / reg) def K(x): return np.dot(np.dot(xi1, x), xi2) @@ -1501,6 +1502,7 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, else: return np.sum(K0, axis=1) + def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, stopThr=1e-6, verbose=False, log=False, **kwargs): r'''Joint OT and proportion estimation for multi-source target shift as proposed in [27] @@ -1658,6 +1660,7 @@ def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, else: return couplings, bary + def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): @@ -1749,7 +1752,8 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', return pi -def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): +def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, + verbose=False, log=False, **kwargs): r''' Solve the entropic regularization optimal transport problem from empirical data and return the OT loss @@ -1831,14 +1835,17 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num M = dist(X_s, X_t, metric=metric) if log: - sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, + **kwargs) return sinkhorn_loss, log else: - sinkhorn_loss = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + sinkhorn_loss = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, + **kwargs) return sinkhorn_loss -def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): +def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, + verbose=False, log=False, **kwargs): r''' Compute the sinkhorn divergence loss from empirical data @@ -1924,11 +1931,14 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli .. [23] Aude Genevay, Gabriel Peyré, Marco Cuturi, Learning Generative Models with Sinkhorn Divergences, Proceedings of the Twenty-First International Conference on Artficial Intelligence and Statistics, (AISTATS) 21, 2018 ''' if log: - sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) - sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) - sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) @@ -1943,11 +1953,14 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli return max(0, sinkhorn_div), log else: - sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) - sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) - sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) return max(0, sinkhorn_div) @@ -2039,7 +2052,8 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res try: import bottleneck except ImportError: - warnings.warn("Bottleneck module is not installed. Install it from https://pypi.org/project/Bottleneck/ for better performance.") + warnings.warn( + "Bottleneck module is not installed. Install it from https://pypi.org/project/Bottleneck/ for better performance.") bottleneck = np a = np.asarray(a, dtype=np.float64) @@ -2173,10 +2187,11 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res # box constraints in L-BFGS-B (see Proposition 1 in [26]) bounds_u = [(max(a_I_min / ((nt - nt_budget) * epsilon + nt_budget * (b_J_max / ( - ns * epsilon * kappa * K_min))), epsilon / kappa), a_I_max / (nt * epsilon * K_min))] * ns_budget + ns * epsilon * kappa * K_min))), epsilon / kappa), a_I_max / (nt * epsilon * K_min))] * ns_budget - bounds_v = [(max(b_J_min / ((ns - ns_budget) * epsilon + ns_budget * (kappa * a_I_max / (nt * epsilon * K_min))), - epsilon * kappa), b_J_max / (ns * epsilon * K_min))] * nt_budget + bounds_v = [( + max(b_J_min / ((ns - ns_budget) * epsilon + ns_budget * (kappa * a_I_max / (nt * epsilon * K_min))), + epsilon * kappa), b_J_max / (ns * epsilon * K_min))] * nt_budget # pre-calculated constants for the objective vec_eps_IJc = epsilon * kappa * (K_IJc * np.ones(nt - nt_budget).reshape((1, -1))).sum(axis=1) @@ -2225,7 +2240,8 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res return usc, vsc def screened_obj(usc, vsc): - part_IJ = np.dot(np.dot(usc, K_IJ), vsc) - kappa * np.dot(a_I, np.log(usc)) - (1. / kappa) * np.dot(b_J, np.log(vsc)) + part_IJ = np.dot(np.dot(usc, K_IJ), vsc) - kappa * np.dot(a_I, np.log(usc)) - (1. / kappa) * np.dot(b_J, + np.log(vsc)) part_IJc = np.dot(usc, vec_eps_IJc) part_IcJ = np.dot(vec_eps_IcJ, vsc) psi_epsilon = part_IJ + part_IJc + part_IcJ @@ -2247,9 +2263,9 @@ def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, res g = np.hstack([g_u, g_v]) return f, g - #----------------------------------------------------------------------------------------------------------------# + # ----------------------------------------------------------------------------------------------------------------# # Step 2: L-BFGS-B solver # - #----------------------------------------------------------------------------------------------------------------# + # ----------------------------------------------------------------------------------------------------------------# u0, v0 = restricted_sinkhorn(u0, v0) theta0 = np.hstack([u0, v0]) diff --git a/ot/datasets.py b/ot/datasets.py index eea9f37..a1ca7b6 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -30,7 +30,7 @@ def make_1D_gauss(n, m, s): 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) - h = np.exp(-(x - m)**2 / (2 * s**2)) + h = np.exp(-(x - m) ** 2 / (2 * s ** 2)) return h / h.sum() @@ -80,7 +80,7 @@ def get_2D_samples_gauss(n, m, sigma, random_state=None): return make_2D_samples_gauss(n, m, sigma, random_state=None) -def make_data_classif(dataset, n, nz=.5, theta=0, p = .5, random_state=None, **kwargs): +def make_data_classif(dataset, n, nz=.5, theta=0, p=.5, random_state=None, **kwargs): """Dataset generation for classification problems Parameters diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index cdd505d..7eaa44a 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -2,8 +2,6 @@ """ Solvers for the original linear program OT problem - - """ # Author: Remi Flamary @@ -18,7 +16,7 @@ from scipy.sparse import coo_matrix from .import cvx # import compiled emd -from .emd_wrap import emd_c, check_result, emd_1d_sorted +#from .emd_wrap import emd_c, check_result, emd_1d_sorted from ..utils import parmap from .cvx import barycenter from ..utils import dist diff --git a/ot/plot.py b/ot/plot.py index f403e98..ad436b4 100644 --- a/ot/plot.py +++ b/ot/plot.py @@ -78,9 +78,10 @@ def plot2D_samples_mat(xs, xt, G, thr=1e-8, **kwargs): thr : float, optional threshold above which the line is drawn **kwargs : dict - paameters given to the plot functions (default color is black if + parameters given to the plot functions (default color is black if nothing given) """ + if ('color' not in kwargs) and ('c' not in kwargs): kwargs['color'] = 'k' mx = G.max() -- cgit v1.2.3 From b32c81542c99cc48944fbeb13e4648f9947ac19d Mon Sep 17 00:00:00 2001 From: ievred Date: Fri, 3 Apr 2020 17:32:07 +0200 Subject: remove commented line --- ot/lp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 7eaa44a..c4b5834 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -16,7 +16,7 @@ from scipy.sparse import coo_matrix from .import cvx # import compiled emd -#from .emd_wrap import emd_c, check_result, emd_1d_sorted +from .emd_wrap import emd_c, check_result, emd_1d_sorted from ..utils import parmap from .cvx import barycenter from ..utils import dist -- cgit v1.2.3 From ef12867f1425ee86b3cfddef4287b52d46114e83 Mon Sep 17 00:00:00 2001 From: Nicolas Courty Date: Thu, 23 Apr 2020 13:03:28 +0200 Subject: [WIP] Issue with sparse emd and adding tests on macos (#158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * First commit-warning removal * remove dense feature * pep8 * pep8 * EMD.h * pep8 again * tic toc tolerance Co-authored-by: Rémi Flamary --- .github/workflows/pythonpackage.yml | 48 +++++----- ot/lp/EMD.h | 3 - ot/lp/EMD_wrapper.cpp | 182 ------------------------------------ ot/lp/__init__.py | 45 +++------ ot/lp/emd_wrap.pyx | 38 ++------ ot/lp/network_simplex_simple.h | 5 +- test/test_ot.py | 26 ------ test/test_utils.py | 4 +- 8 files changed, 46 insertions(+), 305 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 394f453..9c35afa 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -47,31 +47,31 @@ jobs: run: | codecov - # macos: - # runs-on: macOS-latest - # strategy: - # max-parallel: 4 - # matrix: - # python-version: [3.7] + macos: + runs-on: macOS-latest + strategy: + max-parallel: 4 + matrix: + python-version: [3.7] - # steps: - # - uses: actions/checkout@v1 - # - name: Set up Python ${{ matrix.python-version }} - # uses: actions/setup-python@v1 - # with: - # python-version: ${{ matrix.python-version }} - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install -r requirements.txt - # pip install pytest "pytest-cov<2.6" - # pip install -U "sklearn" - # - name: Install POT - # run: | - # pip install -e . - # - name: Run tests - # run: | - # python -m pytest -v test/ ot/ --doctest-modules --ignore ot/gpu/ --cov=ot + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest "pytest-cov<2.6" + pip install -U "sklearn" + - name: Install POT + run: | + pip install -e . + - name: Run tests + run: | + python -m pytest -v test/ ot/ --doctest-modules --ignore ot/gpu/ --cov=ot windows: diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index 2adaace..c0fe7a3 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -32,9 +32,6 @@ enum ProblemType { int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter); -int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter); #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 28e4af2..bc873ed 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -106,185 +106,3 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, return ret; } - -int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter) { - // beware M and C anre strored in row major C style!!! - - // Get the number of non zero coordinates for r and c and vectors - int n, m, i, cur; - - typedef FullBipartiteDigraph Digraph; - DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - - // Get the number of non zero coordinates for r and c - n=0; - for (int i=0; i0) { - n++; - }else if(val<0){ - return INFEASIBLE; - } - } - m=0; - for (int i=0; i0) { - m++; - }else if(val<0){ - return INFEASIBLE; - } - } - - // Define the graph - - std::vector indI(n), indJ(m); - std::vector weights1(n), weights2(m); - Digraph di(n, m); - NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); - - // Set supply and demand, don't account for 0 values (faster) - - cur=0; - for (int i=0; i0) { - weights1[ cur ] = val; - indI[cur++]=i; - } - } - - // Demand is actually negative supply... - - cur=0; - for (int i=0; i0) { - weights2[ cur ] = -val; - indJ[cur++]=i; - } - } - - // Define the graph - net.supplyMap(&weights1[0], n, &weights2[0], m); - - // Set the cost of each edge - for (int i=0; i0) - { - *cost += flow * (*(D+indI[i]*n2+indJ[j-n])); - - *(G+cur) = flow; - *(iG+cur) = indI[i]; - *(jG+cur) = indJ[j-n]; - *(alpha + indI[i]) = -net.potential(i); - *(beta + indJ[j-n]) = net.potential(j); - cur++; - } - } - *nG=cur; // nb of value +1 for numpy indexing - - } - - - return ret; -} - -int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, - long *iD, long *jD, double *D, long nD, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter) { - // beware M and C anre strored in row major C style!!! - - // Get the number of non zero coordinates for r and c and vectors - int n, m, cur; - - typedef FullBipartiteDigraph Digraph; - DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - - n=n1; - m=n2; - - - // Define the graph - - - std::vector weights2(m); - Digraph di(n, m); - NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); - - // Set supply and demand, don't account for 0 values (faster) - - - // Demand is actually negative supply... - - cur=0; - for (int i=0; i0) { - weights2[ cur ] = -val; - } - } - - // Define the graph - net.supplyMap(X, n, &weights2[0], m); - - // Set the cost of each edge - for (int k=0; k0) - { - - *(G+cur) = flow; - *(iG+cur) = i; - *(jG+cur) = j-n; - *(alpha + i) = -net.potential(i); - *(beta + j-n) = net.potential(j); - cur++; - } - } - *nG=cur; // nb of value +1 for numpy indexing - - } - - - return ret; -} - diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 8d1baa0..ad390c5 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -172,7 +172,7 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): return center_ot_dual(alpha, beta, a, b) -def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): +def emd(a, b, M, numItermax=100000, log=False, center_dual=True): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -207,10 +207,6 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. center_dual: boolean, optional (default=True) If True, centers the dual potential using function :ref:`center_ot_dual`. @@ -267,25 +263,14 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): asel = a != 0 bsel = b != 0 - if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - - if center_dual: - u, v = center_ot_dual(u, v, a, b) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) - if np.any(~asel) or np.any(~bsel): - u, v = estimate_dual_null_weights(u, v, a, b, M) - - else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) - - if center_dual: - u, v = center_ot_dual(u, v, a, b) - - if np.any(~asel) or np.any(~bsel): - u, v = estimate_dual_null_weights(u, v, a, b, M) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + result_code_string = check_result(result_code) if log: log = {} @@ -299,7 +284,7 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, dense=True, return_matrix=False, + numItermax=100000, log=False, return_matrix=False, center_dual=True): r"""Solves the Earth Movers distance problem and returns the loss @@ -404,11 +389,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): bsel = b != 0 - if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) if center_dual: u, v = center_ot_dual(u, v, a, b) @@ -428,11 +410,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), else: def f(b): bsel = b != 0 - if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) if center_dual: u, v = center_ot_dual(u, v, a, b) @@ -440,7 +418,6 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if np.any(~asel) or np.any(~bsel): u, v = estimate_dual_null_weights(u, v, a, b, M) - result_code_string = check_result(result_code) check_result(result_code) return cost diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index d345fd4..b6bda47 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -20,9 +20,6 @@ import warnings cdef extern from "EMD.h": int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) - int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED @@ -38,13 +35,10 @@ def check_result(result_code): message = "numItermax reached before optimality. Try to increase numItermax." warnings.warn(message) return message - - - - + @cython.boundscheck(False) @cython.wraparound(False) -def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense): +def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter): """ Solves the Earth Movers distance problem and returns the optimal transport matrix @@ -83,8 +77,6 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod max_iter : int The maximum number of iterations before stopping the optimization algorithm if it has not converged. - dense : bool - Return a sparse transport matrix if set to False Returns ------- @@ -114,29 +106,13 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 - if dense: - # init OT matrix - G=np.zeros([n1, n2]) - - # calling the function - result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) - - return G, cost, alpha, beta, result_code - - - else: - - # init sparse OT matrix - Gv=np.zeros(nmax) - iG=np.zeros(nmax,dtype=np.int) - jG=np.zeros(nmax,dtype=np.int) - - - result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, &nG, alpha.data, beta.data, &cost, max_iter) - + # init OT matrix + G=np.zeros([n1, n2]) - return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code + # calling the function + result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) + return G, cost, alpha, beta, result_code @cython.boundscheck(False) diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h index 498e921..5d93040 100644 --- a/ot/lp/network_simplex_simple.h +++ b/ot/lp/network_simplex_simple.h @@ -875,7 +875,7 @@ namespace lemon { c += Number(it->second) * Number(_cost[it->first]); return c;*/ - for (int i=0; i<_flow.size(); i++) + for (unsigned long i=0; i<_flow.size(); i++) c += _flow[i] * Number(_cost[i]); return c; @@ -1257,7 +1257,7 @@ namespace lemon { u = w; } _pred[u_in] = in_arc; - _forward[u_in] = (u_in == _source[in_arc]); + _forward[u_in] = ((unsigned int)u_in == _source[in_arc]); _succ_num[u_in] = old_succ_num; // Set limits for updating _last_succ form v_in and v_out @@ -1418,7 +1418,6 @@ namespace lemon { template ProblemType start() { PivotRuleImpl pivot(*this); - double prevCost=-1; ProblemType retVal = OPTIMAL; // Perform heuristic initial pivots diff --git a/test/test_ot.py b/test/test_ot.py index 0f1357f..b7306f6 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -170,27 +170,6 @@ def test_emd_empty(): np.testing.assert_allclose(w, 0) -def test_emd_sparse(): - n = 100 - rng = np.random.RandomState(0) - - x = rng.randn(n, 2) - x2 = rng.randn(n, 2) - - M = ot.dist(x, x2) - - G = ot.emd([], [], M, dense=True) - - Gs = ot.emd([], [], M, dense=False) - - ws = ot.emd2([], [], M, dense=False) - - # check G is the same - np.testing.assert_allclose(G, Gs.todense()) - # check value - np.testing.assert_allclose(Gs.multiply(M).sum(), ws, rtol=1e-6) - - def test_emd2_multi(): n = 500 # nb bins @@ -222,12 +201,7 @@ def test_emd2_multi(): emdn = ot.emd2(a, b, M) ot.toc('multi proc : {} s') - ot.tic() - emdn2 = ot.emd2(a, b, M, dense=False) - ot.toc('multi proc : {} s') - np.testing.assert_allclose(emd1, emdn) - np.testing.assert_allclose(emd1, emdn2, rtol=1e-6) # emd loss multipro proc with log ot.tic() diff --git a/test/test_utils.py b/test/test_utils.py index 640598d..db9cda6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -36,10 +36,10 @@ def test_tic_toc(): t2 = ot.toq() # test timing - np.testing.assert_allclose(0.5, t, rtol=1e-2, atol=1e-2) + np.testing.assert_allclose(0.5, t, rtol=1e-1, atol=1e-1) # test toc vs toq - np.testing.assert_allclose(t, t2, rtol=1e-2, atol=1e-2) + np.testing.assert_allclose(t, t2, rtol=1e-1, atol=1e-1) def test_kernel(): -- cgit v1.2.3 From 8599e720d5f438e2aaf5c635883e64deb026f3ce Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Fri, 24 Apr 2020 11:20:02 +0200 Subject: correct doc for emd --- docs/source/conf.py | 2 +- ot/lp/__init__.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/docs/source/conf.py b/docs/source/conf.py index 880c71d..f3c61e7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,7 +67,7 @@ extensions = [ 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', - 'sphinx_gallery.gen_gallery', + #'sphinx_gallery.gen_gallery', ] napoleon_numpy_docstring = True diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index ad390c5..50003ed 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -180,7 +180,9 @@ def emd(a, b, M, numItermax=100000, log=False, center_dual=True): \gamma = arg\min_\gamma <\gamma,M>_F s.t. \gamma 1 = a + \gamma^T 1= b + \gamma\geq 0 where : @@ -289,10 +291,12 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), r"""Solves the Earth Movers distance problem and returns the loss .. math:: - \gamma = arg\min_\gamma <\gamma,M>_F + \min_\gamma <\gamma,M>_F s.t. \gamma 1 = a + \gamma^T 1= b + \gamma\geq 0 where : -- cgit v1.2.3 From 8406caafaef8b3683d6a1d44917c404ba780f82c Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Tue, 5 May 2020 07:53:45 +0200 Subject: remove dense from ducumentation --- ot/lp/__init__.py | 4 ---- 1 file changed, 4 deletions(-) (limited to 'ot/lp/__init__.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 50003ed..514a607 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -327,10 +327,6 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), variables. Otherwise returns only the optimal transportation cost. return_matrix: boolean, optional (default=False) If True, returns the optimal transportation matrix in the log. - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. center_dual: boolean, optional (default=True) If True, centers the dual potential using function :ref:`center_ot_dual`. -- cgit v1.2.3