From 57321bd0172c97b77dfc8b14972c18d063b6dda8 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 11:13:07 +0100 Subject: add awesome sparse solver --- test/test_ot.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index dacae0a..4d59e12 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -118,6 +118,26 @@ def test_emd_empty(): np.testing.assert_allclose(w, 0) +def test_emd_sparse(): + + n = 100 + rng = np.random.RandomState(0) + + x = rng.randn(n, 2) + x2 = rng.randn(n, 2) + u = ot.utils.unif(n) + + M = ot.dist(x, x2) + + G = ot.emd([], [], M) + + Gs = ot.emd([], [], M, sparse=True) + + # check G is the same + np.testing.assert_allclose(G, Gs.todense()) + # check constraints + + def test_emd2_multi(): n = 500 # nb bins -- cgit v1.2.3 From a6a654de5e78dd388a793fbd26f60045b05d519c Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 11:31:32 +0100 Subject: proper documentation and parameter --- ot/lp/EMD.h | 2 +- ot/lp/EMD_wrapper.cpp | 3 ++- ot/lp/__init__.py | 16 ++++++++++++++-- ot/lp/emd_wrap.pyx | 10 ++++++---- test/test_ot.py | 2 +- 5 files changed, 24 insertions(+), 9 deletions(-) (limited to 'test/test_ot.py') diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index bc513d2..9896091 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -33,7 +33,7 @@ enum ProblemType { int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter); int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter); #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 2aa44c1..9be2cdc 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -108,7 +108,7 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) { // beware M and C anre strored in row major C style!!! @@ -202,6 +202,7 @@ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, cur++; } } + *nG=cur; // nb of value +1 for numpy indexing } diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 4fec7d9..d476071 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -27,7 +27,7 @@ __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d', 'emd2_1d', 'wasserstein_1d'] -def emd(a, b, M, numItermax=100000, log=False, sparse=False): +def emd(a, b, M, numItermax=100000, log=False, dense=True): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -62,6 +62,10 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Returns ------- @@ -103,6 +107,8 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) + sparse= not dense + # if empty array given then use uniform distributions if len(a) == 0: a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0] @@ -128,7 +134,7 @@ def emd(a, b, M, numItermax=100000, log=False, sparse=False): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, sparse=False, return_matrix=False): + numItermax=100000, log=False, dense=True, return_matrix=False): r"""Solves the Earth Movers distance problem and returns the loss .. math:: @@ -166,6 +172,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), variables. Otherwise returns only the optimal transportation cost. return_matrix: boolean, optional (default=False) If True, returns the optimal transportation matrix in the log. + dense: boolean, optional (default=True) + If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). + Otherwise returns a sparse representation using scipy's `coo_matrix` + format. Returns ------- @@ -207,6 +217,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), b = np.asarray(b, dtype=np.float64) M = np.asarray(M, dtype=np.float64) + sparse=not dense + # problem with pikling Forks if sys.platform.endswith('win32'): processes=1 diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index f183995..4b6cdce 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -21,7 +21,7 @@ import warnings cdef extern from "EMD.h": int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, + long *iG, long *jG, double *G, long * nG, double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED @@ -75,7 +75,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod max_iter : int The maximum number of iterations before stopping the optimization algorithm if it has not converged. - + sparse : bool + Returning a sparse transport matrix if set to True Returns ------- @@ -87,6 +88,7 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod cdef int n2= M.shape[1] cdef int nmax=n1+n2-1 cdef int result_code = 0 + cdef int nG=0 cdef double cost=0 cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1) @@ -111,10 +113,10 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod jG=np.zeros(nmax,dtype=np.int) - result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, alpha.data, beta.data, &cost, max_iter) + result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, &nG, alpha.data, beta.data, &cost, max_iter) - return Gv, iG, jG, cost, alpha, beta, result_code + return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code else: diff --git a/test/test_ot.py b/test/test_ot.py index 4d59e12..7b44fd1 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -131,7 +131,7 @@ def test_emd_sparse(): G = ot.emd([], [], M) - Gs = ot.emd([], [], M, sparse=True) + Gs = ot.emd([], [], M, dense=False) # check G is the same np.testing.assert_allclose(G, Gs.todense()) -- cgit v1.2.3 From 127adbaf4eef7a6dffbdcd4f930fc6301587f861 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 11:41:13 +0100 Subject: remove useless variable --- test/test_ot.py | 1 - 1 file changed, 1 deletion(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 7b44fd1..8602022 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -125,7 +125,6 @@ def test_emd_sparse(): x = rng.randn(n, 2) x2 = rng.randn(n, 2) - u = ot.utils.unif(n) M = ot.dist(x, x2) -- cgit v1.2.3 From 84384dd9e5dc78ed5cc867a53bd1de31c05d77fc Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 13:34:05 +0100 Subject: add test emd2 --- test/test_ot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 8602022..507d188 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -132,9 +132,12 @@ def test_emd_sparse(): Gs = ot.emd([], [], M, dense=False) + ws = ot.emd2([], [], M, dense=False) + # check G is the same np.testing.assert_allclose(G, Gs.todense()) - # check constraints + # check value + np.testing.assert_allclose(Gs.multiply(M).sum(), ws, rtol=1e-6) def test_emd2_multi(): -- cgit v1.2.3 From 7371b2f4f931db8f67ec2967253be8d95ff9fe80 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 13:34:55 +0100 Subject: add test emd2 --- test/test_ot.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 507d188..48ea87f 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -171,7 +171,12 @@ def test_emd2_multi(): emdn = ot.emd2(a, b, M) ot.toc('multi proc : {} s') + ot.tic() + emdn2 = ot.emd2(a, b, M, dense = False) + ot.toc('multi proc : {} s') + np.testing.assert_allclose(emd1, emdn) + np.testing.assert_allclose(emd1, emdn2) # emd loss multipro proc with log ot.tic() -- cgit v1.2.3 From dfaba55affcca606e8e041bdbd0fc5a7735c2b07 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 13:36:08 +0100 Subject: add test emd2 multi --- test/test_ot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 48ea87f..470fd0f 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -176,7 +176,7 @@ def test_emd2_multi(): ot.toc('multi proc : {} s') np.testing.assert_allclose(emd1, emdn) - np.testing.assert_allclose(emd1, emdn2) + np.testing.assert_allclose(emd1, emdn2, rtol=1e-6) # emd loss multipro proc with log ot.tic() -- cgit v1.2.3 From c439e3efb920086154c741b41f65d99165e875d8 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 2 Dec 2019 13:57:13 +0100 Subject: pep8 --- test/test_ot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 470fd0f..fbacd8b 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -172,8 +172,8 @@ def test_emd2_multi(): ot.toc('multi proc : {} s') ot.tic() - emdn2 = ot.emd2(a, b, M, dense = False) - ot.toc('multi proc : {} s') + emdn2 = ot.emd2(a, b, M, dense=False) + ot.toc('multi proc : {} s') np.testing.assert_allclose(emd1, emdn) np.testing.assert_allclose(emd1, emdn2, rtol=1e-6) -- cgit v1.2.3 From 92233f79e098f1930248d815e66c0a929508af59 Mon Sep 17 00:00:00 2001 From: Kilian Date: Mon, 9 Dec 2019 15:56:48 +0100 Subject: add assert for emd dimension mismatch --- ot/lp/__init__.py | 6 ++++++ test/test_ot.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'test/test_ot.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 0c92810..f77c3d7 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -109,6 +109,9 @@ def emd(a, b, M, numItermax=100000, log=False): if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) result_code_string = check_result(result_code) if log: @@ -212,6 +215,9 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] or b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + if log or return_matrix: def f(b): G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) diff --git a/test/test_ot.py b/test/test_ot.py index dacae0a..1343604 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -14,6 +14,22 @@ from ot.datasets import make_1D_gauss as gauss import pytest +def test_emd_dimension_mismatch(): + # test emd and emd2 for simple identity + n_samples = 100 + n_features = 2 + rng = np.random.RandomState(0) + + x = rng.randn(n_samples, n_features) + a = ot.utils.unif(n_samples + 1) + + M = ot.dist(x, x) + + np.testing.assert_raises(AssertionError, emd, a, a, M) + + np.testing.assert_raises(AssertionError, emd2, a, a, M) + + def test_emd_emd2(): # test emd and emd2 for simple identity n = 100 -- cgit v1.2.3 From 428b44e15591071cfcd69af365d878cfd876f9d3 Mon Sep 17 00:00:00 2001 From: Kilian Date: Mon, 9 Dec 2019 16:35:49 +0100 Subject: calling ot.emd test --- test/test_ot.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 1343604..25cdfd4 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -14,8 +14,9 @@ from ot.datasets import make_1D_gauss as gauss import pytest + def test_emd_dimension_mismatch(): - # test emd and emd2 for simple identity + # test emd and emd2 for dimension mismatch n_samples = 100 n_features = 2 rng = np.random.RandomState(0) @@ -25,9 +26,9 @@ def test_emd_dimension_mismatch(): M = ot.dist(x, x) - np.testing.assert_raises(AssertionError, emd, a, a, M) + np.testing.assert_raises(AssertionError, ot.emd, a, a, M) - np.testing.assert_raises(AssertionError, emd2, a, a, M) + np.testing.assert_raises(AssertionError, ot.emd2, a, a, M) def test_emd_emd2(): -- cgit v1.2.3 From 92dbe259032d340a259209e477e9aac74897689e Mon Sep 17 00:00:00 2001 From: Kilian Date: Mon, 9 Dec 2019 16:43:54 +0100 Subject: pep8 --- test/test_ot.py | 1 - 1 file changed, 1 deletion(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 25cdfd4..42a3d0a 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -14,7 +14,6 @@ from ot.datasets import make_1D_gauss as gauss import pytest - def test_emd_dimension_mismatch(): # test emd and emd2 for dimension mismatch n_samples = 100 -- cgit v1.2.3 From d97f81dd731c4b1132939500076fd48c89f19d1f Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Wed, 18 Dec 2019 10:17:31 +0100 Subject: update test --- test/test_ot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index fbacd8b..3dd544c 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -128,7 +128,7 @@ def test_emd_sparse(): M = ot.dist(x, x2) - G = ot.emd([], [], M) + G = ot.emd([], [], M, dense=True) Gs = ot.emd([], [], M, dense=False) -- cgit v1.2.3 From 3844639d3dd4e0dd360ebef34dd657d26664039e Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 27 Jan 2020 09:35:19 +0100 Subject: add test for constraint viuolation of duals --- test/test_ot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 18b6294..c756e51 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -337,7 +337,10 @@ def test_dual_variables(): # Check that both cost computations are equivalent np.testing.assert_almost_equal(cost1, log['cost']) check_duality_gap(a, b, M, G, log['u'], log['v'], log['cost']) - + + viol=log['u'][:,None]+log['v'][None,:]-M + + assert viol.max()<1e-8 def check_duality_gap(a, b, M, G, u, v, cost): cost_dual = np.vdot(a, u) + np.vdot(b, v) -- cgit v1.2.3 From 30fc233f7f62d571a562971a945d68c3782f0780 Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Mon, 27 Jan 2020 09:37:42 +0100 Subject: correct pep8 --- test/test_ot.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index c756e51..245a107 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -337,10 +337,11 @@ def test_dual_variables(): # Check that both cost computations are equivalent np.testing.assert_almost_equal(cost1, log['cost']) check_duality_gap(a, b, M, G, log['u'], log['v'], log['cost']) - - viol=log['u'][:,None]+log['v'][None,:]-M - - assert viol.max()<1e-8 + + viol = log['u'][:, None] + log['v'][None, :] - M + + assert viol.max() < 1e-8 + def check_duality_gap(a, b, M, G, u, v, cost): cost_dual = np.vdot(a, u) + np.vdot(b, v) -- cgit v1.2.3 From f65073faa73b36280a19ff8b9c383e66f8bdbd2b Mon Sep 17 00:00:00 2001 From: Rémi Flamary Date: Thu, 30 Jan 2020 08:04:36 +0100 Subject: comlete documentation --- ot/lp/__init__.py | 30 +++++++++++++++++++----------- ot/lp/emd_wrap.pyx | 6 ++++++ test/test_ot.py | 4 ++-- 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'test/test_ot.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index aa3166f..cdd505d 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -28,10 +28,10 @@ __all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', def center_ot_dual(alpha0, beta0, a=None, b=None): - r"""Center dual OT potentials wrt theirs weights + r"""Center dual OT potentials w.r.t. theirs weights The main idea of this function is to find unique dual potentials - that ensure some kind of centering/fairness. It will help have + that ensure some kind of centering/fairness. The main idea is to find dual potentials that lead to the same final objective value for both source and targets (see below for more details). It will help having stability when multiple calling of the OT solver with small changes. Basically we add another constraint to the potential that will not @@ -91,7 +91,15 @@ def center_ot_dual(alpha0, beta0, a=None, b=None): def estimate_dual_null_weights(alpha0, beta0, a, b, M): r"""Estimate feasible values for 0-weighted dual potentials - The feasible values are computed efficiently bjt rather coarsely. + The feasible values are computed efficiently but rather coarsely. + + .. warning:: + This function is necessary because the C++ solver in emd_c + discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + First we compute the constraints violations: .. math:: @@ -113,11 +121,11 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0 - In the end the dual potential are centred using function + In the end the dual potentials are centered using function :ref:`center_ot_dual`. Note that all those updates do not change the objective value of the - solution but provide dual potential that do not violate the constraints. + solution but provide dual potentials that do not violate the constraints. Parameters ---------- @@ -130,9 +138,9 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): beta0 : (nt,) numpy.ndarray, float64 Target dual potential a : (ns,) numpy.ndarray, float64 - Source histogram (uniform weight if empty list) + Source distribution (uniform weights if empty list) b : (nt,) numpy.ndarray, float64 - Target histogram (uniform weight if empty list) + Target distribution (uniform weights if empty list) M : (ns,nt) numpy.ndarray, float64 Loss matrix (c-order array with type float64) @@ -150,11 +158,11 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): bsel = b != 0 # compute dual constraints violation - Viol = alpha0[:, None] + beta0[None, :] - M + constraint_violation = alpha0[:, None] + beta0[None, :] - M - # Compute worst violation per line and columns - aviol = np.max(Viol, 1) - bviol = np.max(Viol, 0) + # Compute largest violation per line and columns + aviol = np.max(constraint_violation, 1) + bviol = np.max(constraint_violation, 0) # update corrects violation of alpha_up = -1 * ~asel * np.maximum(aviol, 0) diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index a4987f4..d345fd4 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -66,6 +66,12 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod .. warning:: Note that the M matrix needs to be a C-order :py.cls:`numpy.array` + .. warning:: + The C++ solver discards all samples in the distributions with + zeros weights. This means that while the primal variable (transport + matrix) is exact, the solver only returns feasible dual potentials + on the samples with weights different from zero. + Parameters ---------- a : (ns,) numpy.ndarray, float64 diff --git a/test/test_ot.py b/test/test_ot.py index 245a107..47df946 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -338,9 +338,9 @@ def test_dual_variables(): np.testing.assert_almost_equal(cost1, log['cost']) check_duality_gap(a, b, M, G, log['u'], log['v'], log['cost']) - viol = log['u'][:, None] + log['v'][None, :] - M + constraint_violation = log['u'][:, None] + log['v'][None, :] - M - assert viol.max() < 1e-8 + assert constraint_violation.max() < 1e-8 def check_duality_gap(a, b, M, G, u, v, cost): -- cgit v1.2.3 From 592f933085d5b521a440eb91eccc283c43732170 Mon Sep 17 00:00:00 2001 From: AdrienCorenflos Date: Wed, 1 Apr 2020 12:14:42 +0100 Subject: Fix ordering --- ot/lp/__init__.py | 2 +- test/test_ot.py | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'test/test_ot.py') diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index cdd505d..4c968ca 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -656,7 +656,7 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True, perm_a = np.argsort(x_a_1d) perm_b = np.argsort(x_b_1d) - G_sorted, indices, cost = emd_1d_sorted(a, b, + G_sorted, indices, cost = emd_1d_sorted(a[perm_a.flatten()], b[perm_b.flatten()], x_a_1d[perm_a], x_b_1d[perm_b], metric=metric, p=p) G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])), diff --git a/test/test_ot.py b/test/test_ot.py index 47df946..7afdae3 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -91,6 +91,44 @@ def test_emd_1d_emd2_1d(): with pytest.raises(AssertionError): ot.emd_1d(u, v, [], []) +def test_emd_1d_emd2_1d_with_weights(): + + # test emd1d gives similar results as emd + n = 20 + m = 30 + rng = np.random.RandomState(0) + u = rng.randn(n, 1) + v = rng.randn(m, 1) + + w_u = rng.uniform(0., 1., n) + w_u = w_u / w_u.sum() + + w_v = rng.uniform(0., 1., m) + w_v = w_v / w_v.sum() + + M = ot.dist(u, v, metric='sqeuclidean') + + G, log = ot.emd(w_u, w_v, M, log=True) + wass = log["cost"] + G_1d, log = ot.emd_1d(u, v, w_u, w_v, metric='sqeuclidean', log=True) + wass1d = log["cost"] + wass1d_emd2 = ot.emd2_1d(u, v, w_u, w_v, metric='sqeuclidean', log=False) + wass1d_euc = ot.emd2_1d(u, v, w_u, w_v, metric='euclidean', log=False) + + # check loss is similar + np.testing.assert_allclose(wass, wass1d) + np.testing.assert_allclose(wass, wass1d_emd2) + + # check loss is similar to scipy's implementation for Euclidean metric + wass_sp = wasserstein_distance(u.reshape((-1,)), v.reshape((-1,))) + np.testing.assert_allclose(wass_sp, wass1d_euc) + + # check constraints + np.testing.assert_allclose(w_u, G.sum(1)) + np.testing.assert_allclose(w_v, G.sum(0)) + + + def test_wass_1d(): # test emd1d gives similar results as emd -- cgit v1.2.3 From 1e2e118e3a30224932ed2f012bb8f9f0f374ef2c Mon Sep 17 00:00:00 2001 From: AdrienCorenflos Date: Thu, 2 Apr 2020 10:39:55 +0100 Subject: Fix test --- test/test_ot.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'test/test_ot.py') diff --git a/test/test_ot.py b/test/test_ot.py index 7afdae3..0f1357f 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -7,11 +7,11 @@ import warnings import numpy as np +import pytest from scipy.stats import wasserstein_distance import ot from ot.datasets import make_1D_gauss as gauss -import pytest def test_emd_dimension_mismatch(): @@ -75,12 +75,12 @@ def test_emd_1d_emd2_1d(): np.testing.assert_allclose(wass, wass1d_emd2) # check loss is similar to scipy's implementation for Euclidean metric - wass_sp = wasserstein_distance(u.reshape((-1, )), v.reshape((-1, ))) + wass_sp = wasserstein_distance(u.reshape((-1,)), v.reshape((-1,))) np.testing.assert_allclose(wass_sp, wass1d_euc) # check constraints - np.testing.assert_allclose(np.ones((n, )) / n, G.sum(1)) - np.testing.assert_allclose(np.ones((m, )) / m, G.sum(0)) + np.testing.assert_allclose(np.ones((n,)) / n, G.sum(1)) + np.testing.assert_allclose(np.ones((m,)) / m, G.sum(0)) # check G is similar np.testing.assert_allclose(G, G_1d) @@ -91,8 +91,8 @@ def test_emd_1d_emd2_1d(): with pytest.raises(AssertionError): ot.emd_1d(u, v, [], []) -def test_emd_1d_emd2_1d_with_weights(): +def test_emd_1d_emd2_1d_with_weights(): # test emd1d gives similar results as emd n = 20 m = 30 @@ -120,7 +120,7 @@ def test_emd_1d_emd2_1d_with_weights(): np.testing.assert_allclose(wass, wass1d_emd2) # check loss is similar to scipy's implementation for Euclidean metric - wass_sp = wasserstein_distance(u.reshape((-1,)), v.reshape((-1,))) + wass_sp = wasserstein_distance(u.reshape((-1,)), v.reshape((-1,)), w_u, w_v) np.testing.assert_allclose(wass_sp, wass1d_euc) # check constraints @@ -128,8 +128,6 @@ def test_emd_1d_emd2_1d_with_weights(): np.testing.assert_allclose(w_v, G.sum(0)) - - def test_wass_1d(): # test emd1d gives similar results as emd n = 20 @@ -173,7 +171,6 @@ def test_emd_empty(): def test_emd_sparse(): - n = 100 rng = np.random.RandomState(0) @@ -249,7 +246,6 @@ def test_emd2_multi(): def test_lp_barycenter(): - a1 = np.array([1.0, 0, 0])[:, None] a2 = np.array([0, 0, 1.0])[:, None] @@ -266,7 +262,6 @@ def test_lp_barycenter(): def test_free_support_barycenter(): - measures_locations = [np.array([-1.]).reshape((1, 1)), np.array([1.]).reshape((1, 1))] measures_weights = [np.array([1.]), np.array([1.])] @@ -282,7 +277,6 @@ def test_free_support_barycenter(): @pytest.mark.skipif(not ot.lp.cvx.cvxopt, reason="No cvxopt available") def test_lp_barycenter_cvxopt(): - a1 = np.array([1.0, 0, 0])[:, None] a2 = np.array([0, 0, 1.0])[:, None] -- cgit v1.2.3 From ef12867f1425ee86b3cfddef4287b52d46114e83 Mon Sep 17 00:00:00 2001 From: Nicolas Courty Date: Thu, 23 Apr 2020 13:03:28 +0200 Subject: [WIP] Issue with sparse emd and adding tests on macos (#158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * First commit-warning removal * remove dense feature * pep8 * pep8 * EMD.h * pep8 again * tic toc tolerance Co-authored-by: Rémi Flamary --- .github/workflows/pythonpackage.yml | 48 +++++----- ot/lp/EMD.h | 3 - ot/lp/EMD_wrapper.cpp | 182 ------------------------------------ ot/lp/__init__.py | 45 +++------ ot/lp/emd_wrap.pyx | 38 ++------ ot/lp/network_simplex_simple.h | 5 +- test/test_ot.py | 26 ------ test/test_utils.py | 4 +- 8 files changed, 46 insertions(+), 305 deletions(-) (limited to 'test/test_ot.py') diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 394f453..9c35afa 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -47,31 +47,31 @@ jobs: run: | codecov - # macos: - # runs-on: macOS-latest - # strategy: - # max-parallel: 4 - # matrix: - # python-version: [3.7] + macos: + runs-on: macOS-latest + strategy: + max-parallel: 4 + matrix: + python-version: [3.7] - # steps: - # - uses: actions/checkout@v1 - # - name: Set up Python ${{ matrix.python-version }} - # uses: actions/setup-python@v1 - # with: - # python-version: ${{ matrix.python-version }} - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install -r requirements.txt - # pip install pytest "pytest-cov<2.6" - # pip install -U "sklearn" - # - name: Install POT - # run: | - # pip install -e . - # - name: Run tests - # run: | - # python -m pytest -v test/ ot/ --doctest-modules --ignore ot/gpu/ --cov=ot + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest "pytest-cov<2.6" + pip install -U "sklearn" + - name: Install POT + run: | + pip install -e . + - name: Run tests + run: | + python -m pytest -v test/ ot/ --doctest-modules --ignore ot/gpu/ --cov=ot windows: diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h index 2adaace..c0fe7a3 100644 --- a/ot/lp/EMD.h +++ b/ot/lp/EMD.h @@ -32,9 +32,6 @@ enum ProblemType { int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter); -int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter); #endif diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp index 28e4af2..bc873ed 100644 --- a/ot/lp/EMD_wrapper.cpp +++ b/ot/lp/EMD_wrapper.cpp @@ -106,185 +106,3 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G, return ret; } - -int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter) { - // beware M and C anre strored in row major C style!!! - - // Get the number of non zero coordinates for r and c and vectors - int n, m, i, cur; - - typedef FullBipartiteDigraph Digraph; - DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - - // Get the number of non zero coordinates for r and c - n=0; - for (int i=0; i0) { - n++; - }else if(val<0){ - return INFEASIBLE; - } - } - m=0; - for (int i=0; i0) { - m++; - }else if(val<0){ - return INFEASIBLE; - } - } - - // Define the graph - - std::vector indI(n), indJ(m); - std::vector weights1(n), weights2(m); - Digraph di(n, m); - NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); - - // Set supply and demand, don't account for 0 values (faster) - - cur=0; - for (int i=0; i0) { - weights1[ cur ] = val; - indI[cur++]=i; - } - } - - // Demand is actually negative supply... - - cur=0; - for (int i=0; i0) { - weights2[ cur ] = -val; - indJ[cur++]=i; - } - } - - // Define the graph - net.supplyMap(&weights1[0], n, &weights2[0], m); - - // Set the cost of each edge - for (int i=0; i0) - { - *cost += flow * (*(D+indI[i]*n2+indJ[j-n])); - - *(G+cur) = flow; - *(iG+cur) = indI[i]; - *(jG+cur) = indJ[j-n]; - *(alpha + indI[i]) = -net.potential(i); - *(beta + indJ[j-n]) = net.potential(j); - cur++; - } - } - *nG=cur; // nb of value +1 for numpy indexing - - } - - - return ret; -} - -int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y, - long *iD, long *jD, double *D, long nD, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter) { - // beware M and C anre strored in row major C style!!! - - // Get the number of non zero coordinates for r and c and vectors - int n, m, cur; - - typedef FullBipartiteDigraph Digraph; - DIGRAPH_TYPEDEFS(FullBipartiteDigraph); - - n=n1; - m=n2; - - - // Define the graph - - - std::vector weights2(m); - Digraph di(n, m); - NetworkSimplexSimple net(di, true, n+m, n*m, maxIter); - - // Set supply and demand, don't account for 0 values (faster) - - - // Demand is actually negative supply... - - cur=0; - for (int i=0; i0) { - weights2[ cur ] = -val; - } - } - - // Define the graph - net.supplyMap(X, n, &weights2[0], m); - - // Set the cost of each edge - for (int k=0; k0) - { - - *(G+cur) = flow; - *(iG+cur) = i; - *(jG+cur) = j-n; - *(alpha + i) = -net.potential(i); - *(beta + j-n) = net.potential(j); - cur++; - } - } - *nG=cur; // nb of value +1 for numpy indexing - - } - - - return ret; -} - diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py index 8d1baa0..ad390c5 100644 --- a/ot/lp/__init__.py +++ b/ot/lp/__init__.py @@ -172,7 +172,7 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M): return center_ot_dual(alpha, beta, a, b) -def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): +def emd(a, b, M, numItermax=100000, log=False, center_dual=True): r"""Solves the Earth Movers distance problem and returns the OT matrix @@ -207,10 +207,6 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): log: bool, optional (default=False) If True, returns a dictionary containing the cost and dual variables. Otherwise returns only the optimal transportation matrix. - dense: boolean, optional (default=True) - If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt). - Otherwise returns a sparse representation using scipy's `coo_matrix` - format. center_dual: boolean, optional (default=True) If True, centers the dual potential using function :ref:`center_ot_dual`. @@ -267,25 +263,14 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): asel = a != 0 bsel = b != 0 - if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - - if center_dual: - u, v = center_ot_dual(u, v, a, b) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) - if np.any(~asel) or np.any(~bsel): - u, v = estimate_dual_null_weights(u, v, a, b, M) - - else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) - - if center_dual: - u, v = center_ot_dual(u, v, a, b) - - if np.any(~asel) or np.any(~bsel): - u, v = estimate_dual_null_weights(u, v, a, b, M) + if center_dual: + u, v = center_ot_dual(u, v, a, b) + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + result_code_string = check_result(result_code) if log: log = {} @@ -299,7 +284,7 @@ def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True): def emd2(a, b, M, processes=multiprocessing.cpu_count(), - numItermax=100000, log=False, dense=True, return_matrix=False, + numItermax=100000, log=False, return_matrix=False, center_dual=True): r"""Solves the Earth Movers distance problem and returns the loss @@ -404,11 +389,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if log or return_matrix: def f(b): bsel = b != 0 - if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) if center_dual: u, v = center_ot_dual(u, v, a, b) @@ -428,11 +410,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), else: def f(b): bsel = b != 0 - if dense: - G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - else: - Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense) - G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0])) + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) if center_dual: u, v = center_ot_dual(u, v, a, b) @@ -440,7 +418,6 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if np.any(~asel) or np.any(~bsel): u, v = estimate_dual_null_weights(u, v, a, b, M) - result_code_string = check_result(result_code) check_result(result_code) return cost diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx index d345fd4..b6bda47 100644 --- a/ot/lp/emd_wrap.pyx +++ b/ot/lp/emd_wrap.pyx @@ -20,9 +20,6 @@ import warnings cdef extern from "EMD.h": int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) - int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D, - long *iG, long *jG, double *G, long * nG, - double* alpha, double* beta, double *cost, int maxIter) cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED @@ -38,13 +35,10 @@ def check_result(result_code): message = "numItermax reached before optimality. Try to increase numItermax." warnings.warn(message) return message - - - - + @cython.boundscheck(False) @cython.wraparound(False) -def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense): +def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter): """ Solves the Earth Movers distance problem and returns the optimal transport matrix @@ -83,8 +77,6 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod max_iter : int The maximum number of iterations before stopping the optimization algorithm if it has not converged. - dense : bool - Return a sparse transport matrix if set to False Returns ------- @@ -114,29 +106,13 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 - if dense: - # init OT matrix - G=np.zeros([n1, n2]) - - # calling the function - result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) - - return G, cost, alpha, beta, result_code - - - else: - - # init sparse OT matrix - Gv=np.zeros(nmax) - iG=np.zeros(nmax,dtype=np.int) - jG=np.zeros(nmax,dtype=np.int) - - - result_code = EMD_wrap_return_sparse(n1, n2, a.data, b.data, M.data, iG.data, jG.data, Gv.data, &nG, alpha.data, beta.data, &cost, max_iter) - + # init OT matrix + G=np.zeros([n1, n2]) - return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code + # calling the function + result_code = EMD_wrap(n1, n2, a.data, b.data, M.data, G.data, alpha.data, beta.data, &cost, max_iter) + return G, cost, alpha, beta, result_code @cython.boundscheck(False) diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h index 498e921..5d93040 100644 --- a/ot/lp/network_simplex_simple.h +++ b/ot/lp/network_simplex_simple.h @@ -875,7 +875,7 @@ namespace lemon { c += Number(it->second) * Number(_cost[it->first]); return c;*/ - for (int i=0; i<_flow.size(); i++) + for (unsigned long i=0; i<_flow.size(); i++) c += _flow[i] * Number(_cost[i]); return c; @@ -1257,7 +1257,7 @@ namespace lemon { u = w; } _pred[u_in] = in_arc; - _forward[u_in] = (u_in == _source[in_arc]); + _forward[u_in] = ((unsigned int)u_in == _source[in_arc]); _succ_num[u_in] = old_succ_num; // Set limits for updating _last_succ form v_in and v_out @@ -1418,7 +1418,6 @@ namespace lemon { template ProblemType start() { PivotRuleImpl pivot(*this); - double prevCost=-1; ProblemType retVal = OPTIMAL; // Perform heuristic initial pivots diff --git a/test/test_ot.py b/test/test_ot.py index 0f1357f..b7306f6 100644 --- a/test/test_ot.py +++ b/test/test_ot.py @@ -170,27 +170,6 @@ def test_emd_empty(): np.testing.assert_allclose(w, 0) -def test_emd_sparse(): - n = 100 - rng = np.random.RandomState(0) - - x = rng.randn(n, 2) - x2 = rng.randn(n, 2) - - M = ot.dist(x, x2) - - G = ot.emd([], [], M, dense=True) - - Gs = ot.emd([], [], M, dense=False) - - ws = ot.emd2([], [], M, dense=False) - - # check G is the same - np.testing.assert_allclose(G, Gs.todense()) - # check value - np.testing.assert_allclose(Gs.multiply(M).sum(), ws, rtol=1e-6) - - def test_emd2_multi(): n = 500 # nb bins @@ -222,12 +201,7 @@ def test_emd2_multi(): emdn = ot.emd2(a, b, M) ot.toc('multi proc : {} s') - ot.tic() - emdn2 = ot.emd2(a, b, M, dense=False) - ot.toc('multi proc : {} s') - np.testing.assert_allclose(emd1, emdn) - np.testing.assert_allclose(emd1, emdn2, rtol=1e-6) # emd loss multipro proc with log ot.tic() diff --git a/test/test_utils.py b/test/test_utils.py index 640598d..db9cda6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -36,10 +36,10 @@ def test_tic_toc(): t2 = ot.toq() # test timing - np.testing.assert_allclose(0.5, t, rtol=1e-2, atol=1e-2) + np.testing.assert_allclose(0.5, t, rtol=1e-1, atol=1e-1) # test toc vs toq - np.testing.assert_allclose(t, t2, rtol=1e-2, atol=1e-2) + np.testing.assert_allclose(t, t2, rtol=1e-1, atol=1e-1) def test_kernel(): -- cgit v1.2.3