summaryrefslogtreecommitdiff
path: root/ot/lp
diff options
context:
space:
mode:
authorKilian <kilian.fatras@gmail.com>2020-01-07 13:16:54 +0100
committerGitHub <noreply@github.com>2020-01-07 13:16:54 +0100
commit40154746a5a6cab0b6f17c284967eb8303fcc3f6 (patch)
tree2622569a36aab5fc2eb2d2adc45a16f861a65dc5 /ot/lp
parenta9bbc2cfdffd22ceee3256102e470df6c25338f3 (diff)
parentc5039bcafde999114283f7e59fb03e176027d740 (diff)
Merge branch 'master' into emd_dimension
Diffstat (limited to 'ot/lp')
-rw-r--r--ot/lp/EMD.h5
-rw-r--r--ot/lp/EMD_wrapper.cpp191
-rw-r--r--ot/lp/__init__.py39
-rw-r--r--ot/lp/emd_wrap.pyx42
-rw-r--r--ot/lp/network_simplex_simple.h2
5 files changed, 261 insertions, 18 deletions
diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h
index f42e222..2adaace 100644
--- a/ot/lp/EMD.h
+++ b/ot/lp/EMD.h
@@ -32,4 +32,9 @@ enum ProblemType {
int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter);
+int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter);
+
+
#endif
diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp
index fc7ca63..28e4af2 100644
--- a/ot/lp/EMD_wrapper.cpp
+++ b/ot/lp/EMD_wrapper.cpp
@@ -17,13 +17,13 @@
int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
double* alpha, double* beta, double *cost, int maxIter) {
-// beware M and C anre strored in row major C style!!!
- int n, m, i, cur;
+ // beware M and C anre strored in row major C style!!!
+ int n, m, i, cur;
typedef FullBipartiteDigraph Digraph;
- DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+ DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
- // Get the number of non zero coordinates for r and c
+ // Get the number of non zero coordinates for r and c
n=0;
for (int i=0; i<n1; i++) {
double val=*(X+i);
@@ -105,3 +105,186 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
return ret;
}
+
+
+int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter) {
+ // beware M and C anre strored in row major C style!!!
+
+ // Get the number of non zero coordinates for r and c and vectors
+ int n, m, i, cur;
+
+ typedef FullBipartiteDigraph Digraph;
+ DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+
+ // Get the number of non zero coordinates for r and c
+ n=0;
+ for (int i=0; i<n1; i++) {
+ double val=*(X+i);
+ if (val>0) {
+ n++;
+ }else if(val<0){
+ return INFEASIBLE;
+ }
+ }
+ m=0;
+ for (int i=0; i<n2; i++) {
+ double val=*(Y+i);
+ if (val>0) {
+ m++;
+ }else if(val<0){
+ return INFEASIBLE;
+ }
+ }
+
+ // Define the graph
+
+ std::vector<int> indI(n), indJ(m);
+ std::vector<double> weights1(n), weights2(m);
+ Digraph di(n, m);
+ NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, maxIter);
+
+ // Set supply and demand, don't account for 0 values (faster)
+
+ cur=0;
+ for (int i=0; i<n1; i++) {
+ double val=*(X+i);
+ if (val>0) {
+ weights1[ cur ] = val;
+ indI[cur++]=i;
+ }
+ }
+
+ // Demand is actually negative supply...
+
+ cur=0;
+ for (int i=0; i<n2; i++) {
+ double val=*(Y+i);
+ if (val>0) {
+ weights2[ cur ] = -val;
+ indJ[cur++]=i;
+ }
+ }
+
+ // Define the graph
+ net.supplyMap(&weights1[0], n, &weights2[0], m);
+
+ // Set the cost of each edge
+ for (int i=0; i<n; i++) {
+ for (int j=0; j<m; j++) {
+ double val=*(D+indI[i]*n2+indJ[j]);
+ net.setCost(di.arcFromId(i*m+j), val);
+ }
+ }
+
+
+ // Solve the problem with the network simplex algorithm
+
+ int ret=net.run();
+ if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) {
+ *cost = 0;
+ Arc a; di.first(a);
+ cur=0;
+ for (; a != INVALID; di.next(a)) {
+ int i = di.source(a);
+ int j = di.target(a);
+ double flow = net.flow(a);
+ if (flow>0)
+ {
+ *cost += flow * (*(D+indI[i]*n2+indJ[j-n]));
+
+ *(G+cur) = flow;
+ *(iG+cur) = indI[i];
+ *(jG+cur) = indJ[j-n];
+ *(alpha + indI[i]) = -net.potential(i);
+ *(beta + indJ[j-n]) = net.potential(j);
+ cur++;
+ }
+ }
+ *nG=cur; // nb of value +1 for numpy indexing
+
+ }
+
+
+ return ret;
+}
+
+int EMD_wrap_all_sparse(int n1, int n2, double *X, double *Y,
+ long *iD, long *jD, double *D, long nD,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter) {
+ // beware M and C anre strored in row major C style!!!
+
+ // Get the number of non zero coordinates for r and c and vectors
+ int n, m, cur;
+
+ typedef FullBipartiteDigraph Digraph;
+ DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+
+ n=n1;
+ m=n2;
+
+
+ // Define the graph
+
+
+ std::vector<double> weights2(m);
+ Digraph di(n, m);
+ NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, maxIter);
+
+ // Set supply and demand, don't account for 0 values (faster)
+
+
+ // Demand is actually negative supply...
+
+ cur=0;
+ for (int i=0; i<n2; i++) {
+ double val=*(Y+i);
+ if (val>0) {
+ weights2[ cur ] = -val;
+ }
+ }
+
+ // Define the graph
+ net.supplyMap(X, n, &weights2[0], m);
+
+ // Set the cost of each edge
+ for (int k=0; k<nD; k++) {
+ int i = iD[k];
+ int j = jD[k];
+ net.setCost(di.arcFromId(i*m+j), D[k]);
+
+ }
+
+
+ // Solve the problem with the network simplex algorithm
+
+ int ret=net.run();
+ if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) {
+ *cost = net.totalCost();
+ Arc a; di.first(a);
+ cur=0;
+ for (; a != INVALID; di.next(a)) {
+ int i = di.source(a);
+ int j = di.target(a);
+ double flow = net.flow(a);
+ if (flow>0)
+ {
+
+ *(G+cur) = flow;
+ *(iG+cur) = i;
+ *(jG+cur) = j-n;
+ *(alpha + i) = -net.potential(i);
+ *(beta + j-n) = net.potential(j);
+ cur++;
+ }
+ }
+ *nG=cur; // nb of value +1 for numpy indexing
+
+ }
+
+
+ return ret;
+}
+
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 4cce41c..eabdd3a 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -27,7 +27,7 @@ __all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
'emd_1d', 'emd2_1d', 'wasserstein_1d']
-def emd(a, b, M, numItermax=100000, log=False):
+def emd(a, b, M, numItermax=100000, log=False, dense=True):
r"""Solves the Earth Movers distance problem and returns the OT matrix
@@ -62,6 +62,10 @@ def emd(a, b, M, numItermax=100000, log=False):
log: bool, optional (default=False)
If True, returns a dictionary containing the cost and dual
variables. Otherwise returns only the optimal transportation matrix.
+ dense: boolean, optional (default=True)
+ If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format.
Returns
-------
@@ -103,6 +107,7 @@ def emd(a, b, M, numItermax=100000, log=False):
b = np.asarray(b, dtype=np.float64)
M = np.asarray(M, dtype=np.float64)
+
# if empty array given then use uniform distributions
if len(a) == 0:
a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
@@ -112,7 +117,12 @@ def emd(a, b, M, numItermax=100000, log=False):
assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \
"Dimension mismatch, check dimensions of M with a and b"
- G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+ if dense:
+ G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense)
+ else:
+ Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense)
+ G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0]))
+
result_code_string = check_result(result_code)
if log:
log = {}
@@ -126,7 +136,7 @@ def emd(a, b, M, numItermax=100000, log=False):
def emd2(a, b, M, processes=multiprocessing.cpu_count(),
- numItermax=100000, log=False, return_matrix=False):
+ numItermax=100000, log=False, dense=True, return_matrix=False):
r"""Solves the Earth Movers distance problem and returns the loss
.. math::
@@ -164,6 +174,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
variables. Otherwise returns only the optimal transportation cost.
return_matrix: boolean, optional (default=False)
If True, returns the optimal transportation matrix in the log.
+ dense: boolean, optional (default=True)
+ If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format.
Returns
-------
@@ -220,19 +234,30 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
if log or return_matrix:
def f(b):
- G, cost, u, v, resultCode = emd_c(a, b, M, numItermax)
- result_code_string = check_result(resultCode)
+ if dense:
+ G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense)
+ else:
+ Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense)
+ G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0]))
+
+ result_code_string = check_result(result_code)
log = {}
if return_matrix:
log['G'] = G
log['u'] = u
log['v'] = v
log['warning'] = result_code_string
- log['result_code'] = resultCode
+ log['result_code'] = result_code
return [cost, log]
else:
def f(b):
- G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+ if dense:
+ G, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense)
+ else:
+ Gv, iG, jG, cost, u, v, result_code = emd_c(a, b, M, numItermax,dense)
+ G = coo_matrix((Gv, (iG, jG)), shape=(a.shape[0], b.shape[0]))
+
+ result_code_string = check_result(result_code)
check_result(result_code)
return cost
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 2b6c495..c0d7128 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -20,6 +20,9 @@ import warnings
cdef extern from "EMD.h":
int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter)
+ int EMD_wrap_return_sparse(int n1, int n2, double *X, double *Y, double *D,
+ long *iG, long *jG, double *G, long * nG,
+ double* alpha, double* beta, double *cost, int maxIter)
cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED
@@ -39,7 +42,7 @@ def check_result(result_code):
@cython.boundscheck(False)
@cython.wraparound(False)
-def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter):
+def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter, bint dense):
"""
Solves the Earth Movers distance problem and returns the optimal transport matrix
@@ -72,7 +75,8 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
max_iter : int
The maximum number of iterations before stopping the optimization
algorithm if it has not converged.
-
+ dense : bool
+ Return a sparse transport matrix if set to False
Returns
-------
@@ -82,12 +86,19 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
"""
cdef int n1= M.shape[0]
cdef int n2= M.shape[1]
+ cdef int nmax=n1+n2-1
+ cdef int result_code = 0
+ cdef int nG=0
cdef double cost=0
- cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([n1, n2])
cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1)
cdef np.ndarray[double, ndim=1, mode="c"] beta=np.zeros(n2)
+ cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([0, 0])
+
+ cdef np.ndarray[double, ndim=1, mode="c"] Gv=np.zeros(0)
+ cdef np.ndarray[long, ndim=1, mode="c"] iG=np.zeros(0,dtype=np.int)
+ cdef np.ndarray[long, ndim=1, mode="c"] jG=np.zeros(0,dtype=np.int)
if not len(a):
a=np.ones((n1,))/n1
@@ -95,10 +106,29 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
if not len(b):
b=np.ones((n2,))/n2
- # calling the function
- cdef int result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+ if dense:
+ # init OT matrix
+ G=np.zeros([n1, n2])
+
+ # calling the function
+ result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+
+ return G, cost, alpha, beta, result_code
+
+
+ else:
+
+ # init sparse OT matrix
+ Gv=np.zeros(nmax)
+ iG=np.zeros(nmax,dtype=np.int)
+ jG=np.zeros(nmax,dtype=np.int)
+
+
+ result_code = EMD_wrap_return_sparse(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <long*> iG.data, <long*> jG.data, <double*> Gv.data, <long*> &nG, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+
+
+ return Gv[:nG], iG[:nG], jG[:nG], cost, alpha, beta, result_code
- return G, cost, alpha, beta, result_code
@cython.boundscheck(False)
diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h
index 7c6a4ce..498e921 100644
--- a/ot/lp/network_simplex_simple.h
+++ b/ot/lp/network_simplex_simple.h
@@ -686,7 +686,7 @@ namespace lemon {
/// \see resetParams(), reset()
ProblemType run() {
#if DEBUG_LVL>0
- std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED\n";
+ std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED << "\n" ;
#endif
if (!init()) return INFEASIBLE;