Diffstat (limited to 'ot/lp')
-rw-r--r--  ot/lp/EMD.h                        5
-rw-r--r--  ot/lp/EMD_wrapper.cpp            191
-rw-r--r--  ot/lp/__init__.py                602
-rw-r--r--  ot/lp/emd_wrap.pyx               150
-rw-r--r--  ot/lp/network_simplex_simple.h     2
5 files changed, 895 insertions, 55 deletions
diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h
index f42e222..2adaace 100644
--- a/ot/lp/EMD.h
+++ b/ot/lp/EMD.h
@@ -32,4 +32,9 @@ enum ProblemType {
int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter);
+int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter);
+
+
#endif
diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp
index fc7ca63..28e4af2 100644
--- a/ot/lp/EMD_wrapper.cpp
+++ b/ot/lp/EMD_wrapper.cpp
@@ -17,13 +17,13 @@
int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
double* alpha, double* beta, double *cost, int maxIter) {
-// beware M and C anre strored in row major C style!!!
- int n, m, i, cur;
+ // beware M and C are stored in row major C style!!!
+ int n, m, i, cur;
typedef FullBipartiteDigraph Digraph;
- DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+ DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
- // Get the number of non zero coordinates for r and c
+ // Get the number of non zero coordinates for r and c
n=0;
for (int i=0; i<n1; i++) {
double val=*(X+i);
@@ -105,3 +105,186 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
return ret;
}
+
+
+int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter) {
+ // beware M and C are stored in row major C style!!!
+
+ // Get the number of non zero coordinates for r and c and vectors
+ int n, m, i, cur;
+
+ typedef FullBipartiteDigraph Digraph;
+ DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+
+ // Get the number of non zero coordinates for r and c
+ n=0;
+ for (int i=0; i<n1; i++) {
+ double val=*(X+i);
+ if (val>0) {
+ n++;
+ }else if(val<0){
+ return INFEASIBLE;
+ }
+ }
+ m=0;
+ for (int i=0; i<n2; i++) {
+ double val=*(Y+i);
+ if (val>0) {
+ m++;
+ }else if(val<0){
+ return INFEASIBLE;
+ }
+ }
+
+ // Define the graph
+
+ std::vector<int> indI(n), indJ(m);
+ std::vector<double> weights1(n), weights2(m);
+ Digraph di(n, m);
+ NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, maxIter);
+
+ // Set supply and demand, don't account for 0 values (faster)
+
+ cur=0;
+ for (int i=0; i<n1; i++) {
+ double val=*(X+i);
+ if (val>0) {
+ weights1[ cur ] = val;
+ indI[cur++]=i;
+ }
+ }
+
+ // Demand is actually negative supply...
+
+ cur=0;
+ for (int i=0; i<n2; i++) {
+ double val=*(Y+i);
+ if (val>0) {
+ weights2[ cur ] = -val;
+ indJ[cur++]=i;
+ }
+ }
+
+ // Define the graph
+ net.supplyMap(&weights1[0], n, &weights2[0], m);
+
+ // Set the cost of each edge
+ for (int i=0; i<n; i++) {
+ for (int j=0; j<m; j++) {
+ double val=*(D+indI[i]*n2+indJ[j]);
+ net.setCost(di.arcFromId(i*m+j), val);
+ }
+ }
+
+
+ // Solve the problem with the network simplex algorithm
+
+ int ret=net.run();
+ if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) {
+ *cost = 0;
+ Arc a; di.first(a);
+ cur=0;
+ for (; a != INVALID; di.next(a)) {
+ int i = di.source(a);
+ int j = di.target(a);
+ double flow = net.flow(a);
+ if (flow>0)
+ {
+ *cost += flow * (*(D+indI[i]*n2+indJ[j-n]));
+
+ *(G+cur) = flow;
+ *(iG+cur) = indI[i];
+ *(jG+cur) = indJ[j-n];
+ *(alpha + indI[i]) = -net.potential(i);
+ *(beta + indJ[j-n]) = net.potential(j);
+ cur++;
+ }
+ }
+ *nG=cur; // number of nonzero flows returned
+
+ }
+
+
+ return ret;
+}
+
+int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y,
+ long *iD, long *jD, double *D, long nD,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter) {
+ // beware M and C are stored in row major C style!!!
+
+ // Get the number of non zero coordinates for r and c and vectors
+ int n, m, cur;
+
+ typedef FullBipartiteDigraph Digraph;
+ DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+
+ n=n1;
+ m=n2;
+
+
+ // Define the graph
+
+
+ std::vector<double> weights2(m);
+ Digraph di(n, m);
+ NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, maxIter);
+
+ // Set supply and demand, don't account for 0 values (faster)
+
+
+ // Demand is actually negative supply...
+
+ cur=0;
+ for (int i=0; i<n2; i++) {
+ double val=*(Y+i);
+ if (val>0) {
+ weights2[ i ] = -val; // index by i: all n2 target nodes are kept here
+ }
+ }
+
+ // Define the graph
+ net.supplyMap(X, n, &weights2[0], m);
+
+ // Set the cost of each edge
+ for (int k=0; k<nD; k++) {
+ int i = iD[k];
+ int j = jD[k];
+ net.setCost(di.arcFromId(i*m+j), D[k]);
+
+ }
+
+
+ // Solve the problem with the network simplex algorithm
+
+ int ret=net.run();
+ if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) {
+ *cost = net.totalCost();
+ Arc a; di.first(a);
+ cur=0;
+ for (; a != INVALID; di.next(a)) {
+ int i = di.source(a);
+ int j = di.target(a);
+ double flow = net.flow(a);
+ if (flow>0)
+ {
+
+ *(G+cur) = flow;
+ *(iG+cur) = i;
+ *(jG+cur) = j-n;
+ *(alpha + i) = -net.potential(i);
+ *(beta + j-n) = net.potential(j);
+ cur++;
+ }
+ }
+ *nG=cur; // number of nonzero flows returned
+
+ }
+
+
+ return ret;
+}
+
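Note: the sparse wrapper above fills at most n1 + n2 - 1 (iG, jG, G) triplets and reports the actual count in nG; on the Python side these triplets map directly onto a SciPy COO matrix. A minimal sketch with made-up triplets (illustrative values, not solver output):

    import numpy as np
    from scipy.sparse import coo_matrix

    # hypothetical triplets as EMD_wrap_return_sparse would fill them:
    # row indices iG, column indices jG, flow values Gv, nG valid entries
    iG = np.array([0, 0, 1], dtype=np.int64)
    jG = np.array([0, 1, 1], dtype=np.int64)
    Gv = np.array([0.3, 0.2, 0.5])
    nG = 3
    G = coo_matrix((Gv[:nG], (iG[:nG], jG[:nG])), shape=(2, 2))
    print(G.toarray())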
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 02cbd8c..cdd505d 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -1,6 +1,9 @@
# -*- coding: utf-8 -*-
"""
Solvers for the original linear program OT problem
+
+
+
"""
# Author: Remi Flamary <remi.flamary@unice.fr>
@@ -8,22 +11,171 @@ Solvers for the original linear program OT problem
# License: MIT License
import multiprocessing
-
+import sys
import numpy as np
+from scipy.sparse import coo_matrix
from .import cvx
# import compiled emd
-from .emd_wrap import emd_c, check_result
+from .emd_wrap import emd_c, check_result, emd_1d_sorted
from ..utils import parmap
from .cvx import barycenter
from ..utils import dist
-__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx']
+__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
+ 'emd_1d', 'emd2_1d', 'wasserstein_1d']
-def emd(a, b, M, numItermax=100000, log=False):
- """Solves the Earth Movers distance problem and returns the OT matrix
+def center_ot_dual(alpha0, beta0, a=None, b=None):
+ r"""Center dual OT potentials w.r.t. theirs weights
+
+ The main idea of this function is to find unique dual potentials
+ that ensure some kind of centering/fairness. The main idea is to find dual potentials that lead to the same final objective value for both source and targets (see below for more details). It will help having
+ stability when multiple calling of the OT solver with small changes.
+
+ Basically we add another constraint to the potential that will not
+ change the objective value but will ensure unicity. The constraint
+ is the following:
+
+ .. math::
+ \alpha^T a= \beta^T b
+
+ in addition to the OT problem constraints.
+
+ Since :math:`\sum_i a_i=\sum_j b_j`, this can be done by adding a
+ constant to :math:`\alpha_0` and subtracting it from :math:`\beta_0`.
+
+ .. math::
+ c = \frac{\beta_0^T b - \alpha_0^T a}{1^T b + 1^T a}
+
+ \alpha = \alpha_0 + c
+
+ \beta = \beta_0 - c
+
+ Parameters
+ ----------
+ alpha0 : (ns,) numpy.ndarray, float64
+ Source dual potential
+ beta0 : (nt,) numpy.ndarray, float64
+ Target dual potential
+ a : (ns,) numpy.ndarray, float64
+ Source histogram (uniform weights if None)
+ b : (nt,) numpy.ndarray, float64
+ Target histogram (uniform weights if None)
+
+ Returns
+ -------
+ alpha : (ns,) numpy.ndarray, float64
+ Source centered dual potential
+ beta : (nt,) numpy.ndarray, float64
+ Target centered dual potential
+
+ """
+ # if no weights are provided, use uniform
+ if a is None:
+ a = np.ones(alpha0.shape[0]) / alpha0.shape[0]
+ if b is None:
+ b = np.ones(beta0.shape[0]) / beta0.shape[0]
+
+ # compute constant that balances the weighted sums of the duals
+ c = (b.dot(beta0) - a.dot(alpha0)) / (a.sum() + b.sum())
+
+ # update duals
+ alpha = alpha0 + c
+ beta = beta0 - c
+
+ return alpha, beta
+
+
+def estimate_dual_null_weights(alpha0, beta0, a, b, M):
+ r"""Estimate feasible values for 0-weighted dual potentials
+
+ The feasible values are computed efficiently but rather coarsely.
+
+ .. warning::
+ This function is necessary because the C++ solver in emd_c
+ discards all samples in the distributions with
+ zero weights. This means that while the primal variable (transport
+ matrix) is exact, the solver only returns feasible dual potentials
+ for the samples with nonzero weights.
+
+ First we compute the constraint violations:
+
+ .. math::
+ V = \alpha \mathbf{1}^T + \mathbf{1} \beta^T - M
+
+ Next we compute the maximum violation per row (for alpha) and
+ per column (for beta)
+
+ .. math::
+ v^a_i=\max_j V_{i,j}
+
+ v^b_j=\max_i V_{i,j}
+
+ Finally we update the dual potentials of the zero-weight samples if a
+ constraint is violated
+
+ .. math::
+ \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0
+
+ \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0
+
+ In the end the dual potentials are centered using function
+ :ref:`center_ot_dual`.
+
+ Note that all those updates do not change the objective value of the
+ solution but provide dual potentials that do not violate the constraints.
+
+ Parameters
+ ----------
+ alpha0 : (ns,) numpy.ndarray, float64
+ Source dual potential
+ beta0 : (nt,) numpy.ndarray, float64
+ Target dual potential
+ a : (ns,) numpy.ndarray, float64
+ Source distribution
+ b : (nt,) numpy.ndarray, float64
+ Target distribution
+ M : (ns,nt) numpy.ndarray, float64
+ Loss matrix (c-order array with type float64)
+
+ Returns
+ -------
+ alpha : (ns,) numpy.ndarray, float64
+ Source corrected dual potential
+ beta : (nt,) numpy.ndarray, float64
+ Target corrected dual potential
+
+ """
+
+ # boolean masks of the nonzero weights
+ asel = a != 0
+ bsel = b != 0
+
+ # compute dual constraints violation
+ constraint_violation = alpha0[:, None] + beta0[None, :] - M
+
+ # Compute largest violation per row and per column
+ aviol = np.max(constraint_violation, 1)
+ bviol = np.max(constraint_violation, 0)
+
+ # remove the positive violations on the zero-weight samples only
+ alpha_up = -1 * ~asel * np.maximum(aviol, 0)
+ beta_up = -1 * ~bsel * np.maximum(bviol, 0)
+
+ alpha = alpha0 + alpha_up
+ beta = beta0 + beta_up
+
+ return center_ot_dual(alpha, beta, a, b)
+
+
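As a quick sanity check of the helpers above, centering makes the two weighted sums of the potentials match; a small sketch using the function defined in this file (toy values, not solver output):

    import numpy as np
    from ot.lp import center_ot_dual

    alpha0 = np.array([0.0, 1.0])
    beta0 = np.array([2.0, 3.0])
    a = np.array([0.5, 0.5])
    b = np.array([0.5, 0.5])

    alpha, beta = center_ot_dual(alpha0, beta0, a, b)
    # the added constraint alpha^T a == beta^T b now holds
    assert np.isclose(alpha.dot(a), beta.dot(b))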
+def emd(a, b, M, numItermax=100000, log=False, dense=True, center_dual=True):
+ r"""Solves the Earth Movers distance problem and returns the OT matrix
.. math::
@@ -37,26 +189,37 @@ def emd(a, b, M, numItermax=100000, log=False):
- M is the metric cost matrix
- a and b are the sample weights
+ .. warning::
+ Note that the M matrix needs to be a C-order numpy.array in float64
+ format.
+
Uses the algorithm proposed in [1]_
Parameters
----------
- a : (ns,) ndarray, float64
- Source histogram (uniform weigth if empty list)
- b : (nt,) ndarray, float64
- Target histogram (uniform weigth if empty list)
- M : (ns,nt) ndarray, float64
- loss matrix
+ a : (ns,) numpy.ndarray, float64
+ Source histogram (uniform weight if empty list)
+ b : (nt,) numpy.ndarray, float64
+ Target histogram (uniform weight if empty list)
+ M : (ns,nt) numpy.ndarray, float64
+ Loss matrix (c-order array with type float64)
numItermax : int, optional (default=100000)
The maximum number of iterations before stopping the optimization
algorithm if it has not converged.
- log: boolean, optional (default=False)
+ log: bool, optional (default=False)
If True, returns a dictionary containing the cost and dual
variables. Otherwise returns only the optimal transportation matrix.
+ dense: boolean, optional (default=True)
+ If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format.
+ center_dual: boolean, optional (default=True)
+ If True, centers the dual potential using function
+ :ref:`center_ot_dual`.
Returns
-------
- gamma: (ns x nt) ndarray
+ gamma: (ns x nt) numpy.ndarray
Optimal transportation matrix for the given parameters
log: dict
If input log is true, a dictionary containing the cost and dual
@@ -74,8 +237,8 @@ def emd(a, b, M, numItermax=100000, log=False):
>>> b=[.5,.5]
>>> M=[[0.,1.],[1.,0.]]
>>> ot.emd(a,b,M)
- array([[ 0.5, 0. ],
- [ 0. , 0.5]])
+ array([[0.5, 0. ],
+ [0. , 0.5]])
References
----------
@@ -94,13 +257,37 @@ def emd(a, b, M, numItermax=100000, log=False):
b = np.asarray(b, dtype=np.float64)
M = np.asarray(M, dtype=np.float64)
- # if empty array given then use unifor distributions
+ # if empty array given then use uniform distributions
if len(a) == 0:
a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
if len(b) == 0:
b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1]
- G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+ assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \
+ "Dimension mismatch, check dimensions of M with a and b"
+
+ asel = a != 0
+ bsel = b != 0
+
+ if dense:
+ G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense)
+
+ if center_dual:
+ u, v = center_ot_dual(u, v, a, b)
+
+ if np.any(~asel) or np.any(~bsel):
+ u, v = estimate_dual_null_weights(u, v, a, b, M)
+
+ else:
+ Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense)
+ G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0]))
+
+ if center_dual:
+ u, v = center_ot_dual(u, v, a, b)
+
+ if np.any(~asel) or np.any(~bsel):
+ u, v = estimate_dual_null_weights(u, v, a, b, M)
+
result_code_string = check_result(result_code)
if log:
log = {}
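A usage sketch of the new dense flag, reusing the toy problem from the docstring above; the sparse return carries the same plan as a scipy.sparse.coo_matrix:

    import numpy as np
    import ot
    from scipy.sparse import coo_matrix

    a = np.array([0.5, 0.5])
    b = np.array([0.5, 0.5])
    M = np.array([[0.0, 1.0], [1.0, 0.0]])

    G_dense = ot.emd(a, b, M)                # dense (2, 2) ndarray, as before
    G_sparse = ot.emd(a, b, M, dense=False)  # sparse COO representation
    assert isinstance(G_sparse, coo_matrix)
    np.testing.assert_allclose(G_dense, G_sparse.toarray())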
@@ -114,8 +301,9 @@ def emd(a, b, M, numItermax=100000, log=False):
def emd2(a, b, M, processes=multiprocessing.cpu_count(),
- numItermax=100000, log=False, return_matrix=False):
- """Solves the Earth Movers distance problem and returns the loss
+ numItermax=100000, log=False, dense=True, return_matrix=False,
+ center_dual=True):
+ r"""Solves the Earth Movers distance problem and returns the loss
.. math::
\gamma = arg\min_\gamma <\gamma,M>_F
@@ -128,16 +316,22 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
- M is the metric cost matrix
- a and b are the sample weights
+ .. warning::
+ Note that the M matrix needs to be a C-order numpy.array in float64
+ format.
+
Uses the algorithm proposed in [1]_
Parameters
----------
- a : (ns,) ndarray, float64
- Source histogram (uniform weigth if empty list)
- b : (nt,) ndarray, float64
- Target histogram (uniform weigth if empty list)
- M : (ns,nt) ndarray, float64
- loss matrix
+ a : (ns,) numpy.ndarray, float64
+ Source histogram (uniform weight if empty list)
+ b : (nt,) numpy.ndarray, float64
+ Target histogram (uniform weight if empty list)
+ M : (ns,nt) numpy.ndarray, float64
+ Loss matrix (c-order array with type float64)
+ processes : int, optional (default=number of CPUs)
+ Number of processes used for multiple emd computations (not used on Windows)
numItermax : int, optional (default=100000)
The maximum number of iterations before stopping the optimization
algorithm if it has not converged.
@@ -146,12 +340,19 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
variables. Otherwise returns only the optimal transportation cost.
return_matrix: boolean, optional (default=False)
If True, returns the optimal transportation matrix in the log.
+ dense: boolean, optional (default=True)
+ If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format.
+ center_dual: boolean, optional (default=True)
+ If True, centers the dual potential using function
+ :ref:`center_ot_dual`.
Returns
-------
gamma: (ns x nt) ndarray
Optimal transportation matrix for the given parameters
- log: dict
+ log: dict
If input log is true, a dictionary containing the cost and dual
variables and exit status
@@ -187,27 +388,61 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
b = np.asarray(b, dtype=np.float64)
M = np.asarray(M, dtype=np.float64)
- # if empty array given then use unifor distributions
+ # problem with pickling forks on Windows
+ if sys.platform.endswith('win32'):
+ processes = 1
+
+ # if empty array given then use uniform distributions
if len(a) == 0:
a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
if len(b) == 0:
b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1]
+ assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \
+ "Dimension mismatch, check dimensions of M with a and b"
+
+ asel = a != 0
+
if log or return_matrix:
def f(b):
- G, cost, u, v, resultCode = emd_c(a, b, M, numItermax)
- result_code_string = check_result(resultCode)
+ bsel = b != 0
+ if dense:
+ G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense)
+ else:
+ Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense)
+ G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0]))
+
+ if center_dual:
+ u, v = center_ot_dual(u, v, a, b)
+
+ if np.any(~asel) or np.any(~bsel):
+ u, v = estimate_dual_null_weights(u, v, a, b, M)
+
+ result_code_string = check_result(result_code)
log = {}
if return_matrix:
log['G'] = G
log['u'] = u
log['v'] = v
log['warning'] = result_code_string
- log['result_code'] = resultCode
+ log['result_code'] = result_code
return [cost, log]
else:
def f(b):
- G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+ bsel = b != 0
+ if dense:
+ G, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense)
+ else:
+ Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax, dense)
+ G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0]))
+
+ if center_dual:
+ u, v = center_ot_dual(u, v, a, b)
+
+ if np.any(~asel) or np.any(~bsel):
+ u, v = estimate_dual_null_weights(u, v, a, b, M)
+
+ result_code_string = check_result(result_code)
check_result(result_code)
return cost
@@ -215,9 +450,12 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
return f(b)
nb = b.shape[1]
- res = parmap(f, [b[:, i] for i in range(nb)], processes)
- return res
+ if processes > 1:
+ res = parmap(f, [b[:, i] for i in range(nb)], processes)
+ else:
+ res = list(map(f, [b[:, i].copy() for i in range(nb)]))
+ return res
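When b is a 2-D array, each column is treated as a separate target histogram and a list of costs is returned; with the change above the computation falls back to a sequential map when processes == 1. A usage sketch (the expected values follow from this toy cost matrix):

    import numpy as np
    import ot

    a = np.array([0.5, 0.5])
    M = np.array([[0.0, 1.0], [1.0, 0.0]])
    # two target histograms stacked as columns
    b = np.array([[0.5, 1.0],
                  [0.5, 0.0]])

    costs = ot.emd2(a, b, M, processes=1)  # one EMD value per column of b
    # expected: [0.0, 0.5] for this toy problem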
def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None):
@@ -231,9 +469,9 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
Parameters
----------
- measures_locations : list of (k_i,d) np.ndarray
+ measures_locations : list of (k_i,d) numpy.ndarray
The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list)
- measures_weights : list of (k_i,) np.ndarray
+ measures_weights : list of (k_i,) numpy.ndarray
Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure
X_init : (k,d) np.ndarray
@@ -246,7 +484,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
numItermax : int, optional
Max number of iterations
stopThr : float, optional
- Stop threshol on error (>0)
+ Stop threshold on error (>0)
verbose : bool, optional
Print information along iterations
log : bool, optional
@@ -272,7 +510,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
k = X_init.shape[0]
d = X_init.shape[1]
if b is None:
- b = np.ones((k,))/k
+ b = np.ones((k,)) / k
if weights is None:
weights = np.ones((N,)) / N
@@ -283,7 +521,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
displacement_square_norm = stopThr + 1.
- while ( displacement_square_norm > stopThr and iter_count < numItermax ):
+ while (displacement_square_norm > stopThr and iter_count < numItermax):
T_sum = np.zeros((k, d))
@@ -293,7 +531,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
T_i = emd(b, measure_weights_i, M_i)
T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i)
- displacement_square_norm = np.sum(np.square(T_sum-X))
+ displacement_square_norm = np.sum(np.square(T_sum - X))
if log:
displacement_square_norms.append(displacement_square_norm)
@@ -308,4 +546,288 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
log_dict['displacement_square_norms'] = displacement_square_norms
return X, log_dict
else:
- return X \ No newline at end of file
+ return X
+
+
+def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+ log=False):
+ r"""Solves the Earth Movers distance problem between 1d measures and returns
+ the OT matrix
+
+
+ .. math::
+ \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+ where :
+
+ - d is the metric
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (nt, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ metric: str, optional (default='sqeuclidean')
+ Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+ Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply if metric='minkowski'
+ dense: boolean, optional (default=True)
+ If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format. Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ log: boolean, optional (default=False)
+ If True, returns a dictionary containing the cost.
+ Otherwise returns only the optimal transportation matrix.
+
+ Returns
+ -------
+ gamma: (ns, nt) ndarray
+ Optimal transportation matrix for the given parameters
+ log: dict
+ If input log is True, a dictionary containing the cost
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function emd_1d accepts lists and
+ performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.emd_1d(x_a, x_b, a, b)
+ array([[0. , 0.5],
+ [0.5, 0. ]])
+ >>> ot.emd_1d(x_a, x_b)
+ array([[0. , 0.5],
+ [0.5, 0. ]])
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2018). "Computational Optimal
+ Transport".
+
+ See Also
+ --------
+ ot.lp.emd : EMD for multidimensional distributions
+ ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the
+ transportation matrix)
+ """
+ a = np.asarray(a, dtype=np.float64)
+ b = np.asarray(b, dtype=np.float64)
+ x_a = np.asarray(x_a, dtype=np.float64)
+ x_b = np.asarray(x_b, dtype=np.float64)
+
+ assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \
+ "emd_1d should only be used with monodimensional data"
+ assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \
+ "emd_1d should only be used with monodimensional data"
+
+ # if empty array given then use uniform distributions
+ if a.ndim == 0 or len(a) == 0:
+ a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0]
+ if b.ndim == 0 or len(b) == 0:
+ b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]
+
+ x_a_1d = x_a.reshape((-1, ))
+ x_b_1d = x_b.reshape((-1, ))
+ perm_a = np.argsort(x_a_1d)
+ perm_b = np.argsort(x_b_1d)
+
+ G_sorted, indices, cost = emd_1d_sorted(a, b,
+ x_a_1d[perm_a], x_b_1d[perm_b],
+ metric=metric, p=p)
+ G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
+ shape=(a.shape[0], b.shape[0]))
+ if dense:
+ G = G.toarray()
+ if log:
+ log = {'cost': cost}
+ return G, log
+ return G
+
+
+def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+ log=False):
+ r"""Solves the Earth Movers distance problem between 1d measures and returns
+ the loss
+
+
+ .. math::
+ \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+ where :
+
+ - d is the metric
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (nt, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ metric: str, optional (default='sqeuclidean')
+ Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+ Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply if metric='minkowski'
+ dense: boolean, optional (default=True)
+ If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format. Only used if log is set to True. Due to implementation details,
+ this function runs faster when dense is set to False.
+ log: boolean, optional (default=False)
+ If True, returns a dictionary containing the transportation matrix.
+ Otherwise returns only the loss.
+
+ Returns
+ -------
+ loss: float
+ Cost associated to the optimal transportation
+ log: dict
+ If input log is True, a dictionary containing the Optimal transportation
+ matrix for the given parameters
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function emd2_1d accepts lists and
+ performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.emd2_1d(x_a, x_b, a, b)
+ 0.5
+ >>> ot.emd2_1d(x_a, x_b)
+ 0.5
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2018). "Computational Optimal
+ Transport".
+
+ See Also
+ --------
+ ot.lp.emd2 : EMD for multidimensional distributions
+ ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix
+ instead of the cost)
+ """
+ # If we do not return G (log==False), then we do not need to cast it to dense
+ # (useless overhead)
+ G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p,
+ dense=dense and log, log=True)
+ cost = log_emd['cost']
+ if log:
+ log_emd = {'G': G}
+ return cost, log_emd
+ return cost
+
+
+def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.):
+ r"""Solves the p-Wasserstein distance problem between 1d measures and returns
+ the distance
+
+ .. math::
+ \min_\gamma \left( \sum_i \sum_j \gamma_{ij} \|x_a[i] - x_b[j]\|^p \right)^{1/p}
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+
+ where :
+
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (nt, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ p: float, optional (default=1.0)
+ The order of the p-Wasserstein distance to be computed
+
+ Returns
+ -------
+ dist: float
+ p-Wasserstein distance
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function wasserstein_1d accepts
+ lists and performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.wasserstein_1d(x_a, x_b, a, b)
+ 0.5
+ >>> ot.wasserstein_1d(x_a, x_b)
+ 0.5
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2018). "Computational Optimal
+ Transport".
+
+ See Also
+ --------
+ ot.lp.emd_1d : EMD for 1d distributions
+ """
+ cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+ dense=False, log=False)
+ return np.power(cost_emd, 1. / p)
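Since wasserstein_1d is implemented as emd2_1d with the 'minkowski' metric followed by a 1/p power, a quick consistency check one might run (using the top-level aliases assumed by the doctests above):

    import numpy as np
    import ot

    x_a = [2., 0.]
    x_b = [0., 3.]

    # the 2-Wasserstein distance equals the square root of the transport
    # cost computed with the squared Euclidean ground metric
    w2 = ot.wasserstein_1d(x_a, x_b, p=2.)
    cost = ot.emd2_1d(x_a, x_b, metric='sqeuclidean')
    assert np.isclose(w2, np.sqrt(cost))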
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 83ee6aa..d345fd4 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -10,13 +10,19 @@ Cython linker with C solver
import numpy as np
cimport numpy as np
+from ..utils import dist
+
cimport cython
+cimport libc.math as math
import warnings
cdef extern from "EMD.h":
int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter)
+ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter)
cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED
@@ -34,9 +40,11 @@ def check_result(result_code):
return message
+
+
@cython.boundscheck(False)
@cython.wraparound(False)
-def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter):
+def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense):
"""
Solves the Earth Movers distance problem and returns the optimal transport matrix
@@ -55,33 +63,50 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
- M is the metric cost matrix
- a and b are the sample weights
+ .. warning::
+ Note that the M matrix needs to be a C-order :py:class:`numpy.ndarray`
+
+ .. warning::
+ The C++ solver discards all samples in the distributions with
+ zero weights. This means that while the primal variable (transport
+ matrix) is exact, the solver only returns feasible dual potentials
+ for the samples with nonzero weights.
+
Parameters
----------
- a : (ns,) ndarray, float64
+ a : (ns,) numpy.ndarray, float64
source histogram
- b : (nt,) ndarray, float64
+ b : (nt,) numpy.ndarray, float64
target histogram
- M : (ns,nt) ndarray, float64
+ M : (ns,nt) numpy.ndarray, float64
loss matrix
max_iter : int
The maximum number of iterations before stopping the optimization
algorithm if it has not converged.
-
+ dense : bool
+ Return a sparse transport matrix if set to False
Returns
-------
- gamma: (ns x nt) ndarray
+ gamma: (ns x nt) numpy.ndarray
Optimal transportation matrix for the given parameters
"""
cdef int n1= M.shape[0]
cdef int n2= M.shape[1]
+ cdef int nmax=n1+n2-1
+ cdef int result_code = 0
+ cdef int nG=0
cdef double cost=0
- cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([n1, n2])
cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1)
cdef np.ndarray[double, ndim=1, mode="c"] beta=np.zeros(n2)
+ cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([0, 0])
+
+ cdef np.ndarray[double, ndim=1, mode="c"] Gv=np.zeros(0)
+ cdef np.ndarray[long, ndim=1, mode="c"] iG=np.zeros(0,dtype=np.int)
+ cdef np.ndarray[long, ndim=1, mode="c"] jG=np.zeros(0,dtype=np.int)
if not len(a):
a=np.ones((n1,))/n1
@@ -89,7 +114,112 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
if not len(b):
b=np.ones((n2,))/n2
- # calling the function
- cdef int result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+ if dense:
+ # init OT matrix
+ G=np.zeros([n1, n2])
+
+ # calling the function
+ result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+
+ return G, cost, alpha, beta, result_code
+
+
+ else:
+
+ # init sparse OT matrix
+ Gv=np.zeros(nmax)
+ iG=np.zeros(nmax,dtype=np.int)
+ jG=np.zeros(nmax,dtype=np.int)
+
+
+ result_code = EMD_wrap_return_sparse(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <long*> iG.data, <long*> jG.data, <double*> Gv.data, <long*> &nG, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
- return G, cost, alpha, beta, result_code
+
+ return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
+ np.ndarray[double, ndim=1, mode="c"] v_weights,
+ np.ndarray[double, ndim=1, mode="c"] u,
+ np.ndarray[double, ndim=1, mode="c"] v,
+ str metric='sqeuclidean',
+ double p=1.):
+ r"""
+ Solves the Earth Movers distance problem between sorted 1d measures and
+ returns the OT matrix and the associated cost
+
+ Parameters
+ ----------
+ u_weights : (ns,) ndarray, float64
+ Source histogram
+ v_weights : (nt,) ndarray, float64
+ Target histogram
+ u : (ns,) ndarray, float64
+ Source dirac locations (on the real line)
+ v : (nt,) ndarray, float64
+ Target dirac locations (on the real line)
+ metric: str, optional (default='sqeuclidean')
+ Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+ Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply if metric='minkowski'
+
+ Returns
+ -------
+ gamma: (n, ) ndarray, float64
+ Values in the Optimal transportation matrix
+ indices: (n, 2) ndarray, int64
+ Indices of the values stored in gamma for the Optimal transportation
+ matrix
+ cost : float
+ Cost associated to the optimal transportation
+ """
+ cdef double cost = 0.
+ cdef int n = u_weights.shape[0]
+ cdef int m = v_weights.shape[0]
+
+ cdef int i = 0
+ cdef double w_i = u_weights[0]
+ cdef int j = 0
+ cdef double w_j = v_weights[0]
+
+ cdef double m_ij = 0.
+
+ cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ),
+ dtype=np.float64)
+ cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2),
+ dtype=np.int)
+ cdef int cur_idx = 0
+ while i < n and j < m:
+ if metric == 'sqeuclidean':
+ m_ij = (u[i] - v[j]) * (u[i] - v[j])
+ elif metric == 'cityblock' or metric == 'euclidean':
+ m_ij = math.fabs(u[i] - v[j])
+ elif metric == 'minkowski':
+ m_ij = math.pow(math.fabs(u[i] - v[j]), p)
+ else:
+ m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)),
+ metric=metric)[0, 0]
+ if w_i < w_j or j == m - 1:
+ cost += m_ij * w_i
+ G[cur_idx] = w_i
+ indices[cur_idx, 0] = i
+ indices[cur_idx, 1] = j
+ i += 1
+ w_j -= w_i
+ w_i = u_weights[i]
+ else:
+ cost += m_ij * w_j
+ G[cur_idx] = w_j
+ indices[cur_idx, 0] = i
+ indices[cur_idx, 1] = j
+ j += 1
+ w_i -= w_j
+ w_j = v_weights[j]
+ cur_idx += 1
+ return G[:cur_idx], indices[:cur_idx], cost
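The loop above is the classical greedy matching on sorted supports; an equivalent pure-Python sketch, useful for checking the Cython version on small inputs (not part of the library, 'sqeuclidean' metric only):

    import numpy as np

    def emd_1d_sorted_py(a, b, u, v):
        # a, b: histograms with equal total mass; u, v: sorted support points
        i, j = 0, 0
        w_i, w_j = a[0], b[0]
        cost = 0.
        flows = []  # (i, j, flow) triplets of the transport plan
        while True:
            m_ij = (u[i] - v[j]) ** 2  # 'sqeuclidean' ground metric
            if w_i < w_j or j == len(v) - 1:
                cost += m_ij * w_i
                flows.append((i, j, w_i))
                w_j -= w_i
                i += 1
                if i == len(u):
                    break
                w_i = a[i]
            else:
                cost += m_ij * w_j
                flows.append((i, j, w_j))
                w_i -= w_j
                j += 1
                w_j = b[j]
        return flows, cost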
diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h
index 7c6a4ce..498e921 100644
--- a/ot/lp/network_simplex_simple.h
+++ b/ot/lp/network_simplex_simple.h
@@ -686,7 +686,7 @@ namespace lemon {
/// \see resetParams(), reset()
ProblemType run() {
#if DEBUG_LVL>0
- std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED\n";
+ std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED << "\n" ;
#endif
if (!init()) return INFEASIBLE;