Merge tag '0.8.0' into dfsg/latest

author: Gard Spreemann <gspr@nonempty.org> 2021-11-09 17:05:13 +0100
committer: Gard Spreemann <gspr@nonempty.org> 2021-11-09 17:05:13 +0100
commit: a9fdc844907decddf54bed3ebeea8d8b2cf0fc5c (patch)
tree: 449a03fce8fafb78b6badd12b6e633f1e5d73a64 /ot/lp
parent: a16b9471d7114ec08977479b7249efe747702b97 (diff)
parent: f1628794d521a8dfa00af383b5e06cd6d34af619 (diff)
10 files changed, 2763 insertions, 537 deletions
diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h
index c0fe7a3..8a1f9ac 100644
--- a/ot/lp/EMD.h
+++ b/ot/lp/EMD.h
@@ -18,19 +18,18 @@
 
 #include <iostream>
 #include <vector>
-#include "network_simplex_simple.h"
 
-using namespace lemon;
 typedef unsigned int node_id_type;
 
 enum ProblemType {
     INFEASIBLE,
     OPTIMAL,
     UNBOUNDED,
-	MAX_ITER_REACHED
+    MAX_ITER_REACHED
 };
 
 int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter);
+int EMD_wrap_omp(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter, int numThreads);
 
 
 
diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp
index bc873ed..2bdc172 100644
--- a/ot/lp/EMD_wrapper.cpp
+++ b/ot/lp/EMD_wrapper.cpp
@@ -12,16 +12,22 @@
  *
  */
 
+
+#include "network_simplex_simple.h"
+#include "network_simplex_simple_omp.h"
 #include "EMD.h"
+#include <cstdint>
 
 
 int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
                 double* alpha, double* beta, double *cost, int maxIter)  {
-    // beware M and C anre strored in row major C style!!!
-     int n, m, i, cur;
+    // beware D and G are stored in row major C style!!!
+
+    using namespace lemon;
+    int n, m, cur;
 
     typedef FullBipartiteDigraph Digraph;
-    DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+    DIGRAPH_TYPEDEFS(Digraph);
 
     // Get the number of non zero coordinates for r and c
     n=0;
@@ -48,7 +54,7 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
     std::vector<int> indI(n), indJ(m);
     std::vector<double> weights1(n), weights2(m);
     Digraph di(n, m);
-    NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, n*m, maxIter);
+    NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, ((int64_t)n)*((int64_t)m), maxIter);
 
     // Set supply and demand, don't account for 0 values (faster)
 
@@ -76,10 +82,12 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
     net.supplyMap(&weights1[0], n, &weights2[0], m);
 
     // Set the cost of each edge
+    int64_t idarc = 0;
     for (int i=0; i<n; i++) {
         for (int j=0; j<m; j++) {
             double val=*(D+indI[i]*n2+indJ[j]);
-            net.setCost(di.arcFromId(i*m+j), val);
+            net.setCost(di.arcFromId(idarc), val);
+            ++idarc;
         }
     }
 
@@ -87,12 +95,13 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
     // Solve the problem with the network simplex algorithm
 
     int ret=net.run();
+    int i, j;
     if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) {
         *cost = 0;
         Arc a; di.first(a);
         for (; a != INVALID; di.next(a)) {
-            int i = di.source(a);
-            int j = di.target(a);
+            i = di.source(a);
+            j = di.target(a);
             double flow = net.flow(a);
             *cost += flow * (*(D+indI[i]*n2+indJ[j-n]));
             *(G+indI[i]*n2+indJ[j-n]) = flow;
@@ -106,3 +115,104 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
     return ret;
 }
 
+
+
+
+
+
+
+int EMD_wrap_omp(int n1, int n2, double *X, double *Y, double *D, double *G,
+             double* alpha, double* beta, double *cost, int maxIter, int numThreads)  {
+    // beware D and G are stored in row major C style!!!
+
+    using namespace lemon_omp;
+    int n, m, cur;
+
+    typedef FullBipartiteDigraph Digraph;
+    DIGRAPH_TYPEDEFS(Digraph);
+
+    // Get the number of non zero coordinates for r and c
+    n=0;
+    for (int i=0; i<n1; i++) {
+        double val=*(X+i);
+        if (val>0) {
+            n++;
+        }else if(val<0){
+            return INFEASIBLE;
+        }
+    }
+    m=0;
+    for (int i=0; i<n2; i++) {
+        double val=*(Y+i);
+        if (val>0) {
+            m++;
+        }else if(val<0){
+            return INFEASIBLE;
+        }
+    }
+
+    // Define the graph
+
+    std::vector<int> indI(n), indJ(m);
+    std::vector<double> weights1(n), weights2(m);
+    Digraph di(n, m);
+    NetworkSimplexSimple<Digraph,double,double, node_id_type> net(di, true, n+m, ((int64_t)n)*((int64_t)m), maxIter, numThreads);
+
+    // Set supply and demand, don't account for 0 values (faster)
+
+    cur=0;
+    for (int i=0; i<n1; i++) {
+        double val=*(X+i);
+        if (val>0) {
+            weights1[ cur ] = val;
+            indI[cur++]=i;
+        }
+    }
+
+    // Demand is actually negative supply...
+
+    cur=0;
+    for (int i=0; i<n2; i++) {
+        double val=*(Y+i);
+        if (val>0) {
+            weights2[ cur ] = -val;
+            indJ[cur++]=i;
+        }
+    }
+
+
+    net.supplyMap(&weights1[0], n, &weights2[0], m);
+
+    // Set the cost of each edge
+    int64_t idarc = 0;
+    for (int i=0; i<n; i++) {
+        for (int j=0; j<m; j++) {
+            double val=*(D+indI[i]*n2+indJ[j]);
+            net.setCost(di.arcFromId(idarc), val);
+            ++idarc;
+        }
+    }
+
+
+    // Solve the problem with the network simplex algorithm
+
+    int ret=net.run();
+    int i, j;
+    if (ret==(int)net.OPTIMAL || ret==(int)net.MAX_ITER_REACHED) {
+        *cost = 0;
+        Arc a; di.first(a);
+        for (; a != INVALID; di.next(a)) {
+            i = di.source(a);
+            j = di.target(a);
+            double flow = net.flow(a);
+            *cost += flow * (*(D+indI[i]*n2+indJ[j-n]));
+            *(G+indI[i]*n2+indJ[j-n]) = flow;
+            *(alpha + indI[i]) = -net.potential(i);
+            *(beta + indJ[j-n]) = net.potential(j);
+        }
+
+    }
+
+
+    return ret;
+}
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 514a607..5da897d 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -8,25 +8,50 @@ Solvers for the original linear program OT problem
 #
 # License: MIT License
 
+import os
 import multiprocessing
 import sys
 
 import numpy as np
-from scipy.sparse import coo_matrix
+import warnings
 
 from . import cvx
 from .cvx import barycenter
+
 # import compiled emd
 from .emd_wrap import emd_c, check_result, emd_1d_sorted
-from ..utils import dist
+from .solver_1d import emd_1d, emd2_1d, wasserstein_1d
+
+from ..utils import dist, list_to_array
 from ..utils import parmap
+from ..backend import get_backend
 
-__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
+__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx', 'emd_1d_sorted',
            'emd_1d', 'emd2_1d', 'wasserstein_1d']
 
 
+def check_number_threads(numThreads):
+    """Checks whether or not the requested number of threads has a valid value.
+
+    Parameters
+    ----------
+    numThreads : int or str
+        The requested number of threads, should either be a strictly positive integer or "max" or None
+
+    Returns
+    -------
+    numThreads : int
+        Corrected number of threads
+    """
+    if (numThreads is None) or (isinstance(numThreads, str) and numThreads.lower() == 'max'):
+        return -1
+    if (not isinstance(numThreads, int)) or numThreads < 1:
+        raise ValueError('numThreads should either be "max" or a strictly positive integer')
+    return numThreads
+
+
 def center_ot_dual(alpha0, beta0, a=None, b=None):
-    r"""Center dual OT potentials w.r.t. theirs weights
+    r"""Center dual OT potentials w.r.t. their weights
 
     The main idea of this function is to find unique dual potentials
     that ensure some kind of centering/fairness. The main idea is to find dual potentials that lead to the same final objective value for both source and targets (see below for more details). It will help having
@@ -37,7 +62,7 @@ def center_ot_dual(alpha0, beta0, a=None, b=None):
     is the following:
 
     .. math::
-        \alpha^T a= \beta^T b
+        \alpha^T \mathbf{a} = \beta^T \mathbf{b}
 
     in addition to the OT problem constraints.
 
@@ -45,11 +70,11 @@ def center_ot_dual(alpha0, beta0, a=None, b=None):
     a constant from both  :math:`\alpha_0` and :math:`\beta_0`.
 
     .. math::
-        c=\frac{\beta0^T b-\alpha_0^T a}{1^Tb+1^Ta}
+        c &= \frac{\beta_0^T \mathbf{b} - \alpha_0^T \mathbf{a}}{\mathbf{1}^T \mathbf{b} + \mathbf{1}^T \mathbf{a}}
 
-        \alpha=\alpha_0+c
+        \alpha &= \alpha_0 + c
 
-        \beta=\beta0+c
+        \beta &= \beta_0 + c
 
     Parameters
     ----------
@@ -92,35 +117,35 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M):
     The feasible values are computed efficiently but rather coarsely.
 
     .. warning::
-        This function is necessary because the C++ solver in emd_c
-        discards all samples in the distributions with 
-        zeros weights. This means that while the primal variable (transport 
+        This function is necessary because the C++ solver in `emd_c`
+        discards all samples in the distributions with
+        zero weights. This means that while the primal variable (transport
         matrix) is exact, the solver only returns feasible dual potentials
-        on the samples with weights different from zero. 
+        on the samples with weights different from zero.
 
     First we compute the constraints violations:
 
     .. math::
-        V=\alpha+\beta^T-M
+        \mathbf{V} = \alpha + \beta^T - \mathbf{M}
 
-    Next we compute the max amount of violation per row (alpha) and
-    columns (beta)
+    Next we compute the max amount of violation per row (:math:`\alpha`) and
+    columns (:math:`\beta`)
 
     .. math::
-        v^a_i=\max_j V_{i,j}
+        \mathbf{v^a}_i = \max_j \mathbf{V}_{i,j}
 
-        v^b_j=\max_i V_{i,j}
+        \mathbf{v^b}_j = \max_i \mathbf{V}_{i,j}
 
     Finally we update the dual potential with 0 weights if a
     constraint is violated
 
     .. math::
-        \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0
+        \alpha_i = \alpha_i - \mathbf{v^a}_i \quad \text{ if } \mathbf{a}_i=0 \text{ and } \mathbf{v^a}_i>0
 
-        \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0
+        \beta_j = \beta_j - \mathbf{v^b}_j \quad \text{ if } \mathbf{b}_j=0 \text{ and } \mathbf{v^b}_j > 0
 
     In the end the dual potentials are centered using function
-    :ref:`center_ot_dual`.
+    :py:func:`ot.lp.center_ot_dual`.
 
     Note that all those updates do not change the objective value of the
     solution but provide dual potentials that do not violate the constraints.
@@ -172,54 +197,62 @@ def estimate_dual_null_weights(alpha0, beta0, a, b, M):
     return center_ot_dual(alpha, beta, a, b)
 
 
-def emd(a, b, M, numItermax=100000, log=False, center_dual=True):
+def emd(a, b, M, numItermax=100000, log=False, center_dual=True, numThreads=1):
     r"""Solves the Earth Movers distance problem and returns the OT matrix
 
 
     .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F
+        \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F
 
-        s.t. \gamma 1 = a
+        s.t. \ \gamma \mathbf{1} = \mathbf{a}
 
-             \gamma^T 1= b
+             \gamma^T \mathbf{1} = \mathbf{b}
+
+             \gamma \geq 0
 
-             \gamma\geq 0
     where :
 
-    - M is the metric cost matrix
-    - a and b are the sample weights
+    - :math:`\mathbf{M}` is the metric cost matrix
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights
 
-    .. warning::
-        Note that the M matrix needs to be a C-order numpy.array in float64
-        format.
+    .. warning:: Note that the :math:`\mathbf{M}` matrix in numpy needs to be a C-order
+        numpy.array in float64 format. It will be converted if not in this
+        format.
+
+    .. note:: This function is backend-compatible and will work on arrays
+        from all compatible backends.
 
-    Uses the algorithm proposed in [1]_
+    Uses the algorithm proposed in :ref:`[1] <references-emd>`.
 
     Parameters
     ----------
-    a : (ns,) numpy.ndarray, float64
+    a : (ns,) array-like, float
         Source histogram (uniform weight if empty list)
-    b : (nt,) numpy.ndarray, float64
+    b : (nt,) array-like, float
         Target histogram (uniform weight if empty list)
-    M : (ns,nt) numpy.ndarray, float64
-        Loss matrix (c-order array with type float64)
+    M : (ns,nt) array-like, float
+        Loss matrix (c-order array in numpy with type float64)
     numItermax : int, optional (default=100000)
         The maximum number of iterations before stopping the optimization
         algorithm if it has not converged.
     log: bool, optional (default=False)
-        If True, returns a dictionary containing the cost and dual
-        variables. Otherwise returns only the optimal transportation matrix.
+        If True, returns a dictionary containing the cost and dual variables.
+        Otherwise returns only the optimal transportation matrix.
     center_dual: boolean, optional (default=True)
         If True, centers the dual potential using function
         :ref:`center_ot_dual`.
+    numThreads: int or "max", optional (default=1, i.e. OpenMP is not used)
+        If compiled with OpenMP, chooses the number of threads to parallelize.
+        "max" selects the highest number possible.
 
     Returns
     -------
-    gamma: (ns x nt) numpy.ndarray
-        Optimal transportation matrix for the given parameters
-    log: dict
-        If input log is true, a dictionary containing the cost and dual
-        variables and exit status
+    gamma: array-like, shape (ns, nt)
+        Optimal transportation matrix for the given
+        parameters
+    log: dict, optional
+        If input log is true, a dictionary containing the
+        cost and dual variables and exit status
 
 
     Examples
@@ -232,26 +265,39 @@ def emd(a, b, M, numItermax=100000, log=False, center_dual=True):
     >>> a=[.5,.5]
     >>> b=[.5,.5]
     >>> M=[[0.,1.],[1.,0.]]
-    >>> ot.emd(a,b,M)
+    >>> ot.emd(a, b, M)
     array([[0.5, 0. ],
            [0. , 0.5]])
 
+
+    .. _references-emd:
     References
     ----------
-
-    .. [1] Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W.
-        (2011, December).  Displacement interpolation using Lagrangian mass
-        transport. In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p.
-        158). ACM.
+    .. [1] Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W. (2011,
+        December).  Displacement interpolation using Lagrangian mass transport.
+        In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p. 158). ACM.
 
     See Also
     --------
     ot.bregman.sinkhorn : Entropic regularized OT
-    ot.optim.cg : General regularized OT"""
+    ot.optim.cg : General regularized OT
+    """
+
+    # convert to numpy if list
+    a, b, M = list_to_array(a, b, M)
+
+    a0, b0, M0 = a, b, M
+    nx = get_backend(M0, a0, b0)
 
+    # convert to numpy
+    M = nx.to_numpy(M)
+    a = nx.to_numpy(a)
+    b = nx.to_numpy(b)
+
+    # ensure float64
     a = np.asarray(a, dtype=np.float64)
     b = np.asarray(b, dtype=np.float64)
-    M = np.asarray(M, dtype=np.float64)
+    M = np.asarray(M, dtype=np.float64, order='C')
 
     # if empty array given then use uniform distributions
     if len(a) == 0:
@@ -262,81 +308,91 @@ def emd(a, b, M, numItermax=100000, log=False, center_dual=True):
     assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \
         "Dimension mismatch, check dimensions of M with a and b"
 
+    # ensure that same mass
+    np.testing.assert_almost_equal(a.sum(0),
+                                   b.sum(0), err_msg='a and b vector must have the same sum')
+    b = b * a.sum() / b.sum()
+
     asel = a != 0
     bsel = b != 0
 
-    G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+    numThreads = check_number_threads(numThreads)
+
+    G, cost, u, v, result_code = emd_c(a, b, M, numItermax, numThreads)
 
     if center_dual:
         u, v = center_ot_dual(u, v, a, b)
 
     if np.any(~asel) or np.any(~bsel):
         u, v = estimate_dual_null_weights(u, v, a, b, M)
-    
+
     result_code_string = check_result(result_code)
     if log:
         log = {}
         log['cost'] = cost
-        log['u'] = u
-        log['v'] = v
+        log['u'] = nx.from_numpy(u, type_as=a0)
+        log['v'] = nx.from_numpy(v, type_as=b0)
         log['warning'] = result_code_string
         log['result_code'] = result_code
-        return G, log
-    return G
+        return nx.from_numpy(G, type_as=M0), log
+    return nx.from_numpy(G, type_as=M0)
 
 
-def emd2(a, b, M, processes=multiprocessing.cpu_count(),
+def emd2(a, b, M, processes=1,
          numItermax=100000, log=False, return_matrix=False,
-         center_dual=True):
+         center_dual=True, numThreads=1):
     r"""Solves the Earth Movers distance problem and returns the loss
 
     .. math::
-        \min_\gamma <\gamma,M>_F
+        \min_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F
+
+        s.t. \ \gamma \mathbf{1} = \mathbf{a}
 
-        s.t. \gamma 1 = a
+             \gamma^T \mathbf{1} = \mathbf{b}
 
-             \gamma^T 1= b
+             \gamma \geq 0
 
-             \gamma\geq 0
     where :
 
-    - M is the metric cost matrix
-    - a and b are the sample weights
+    - :math:`\mathbf{M}` is the metric cost matrix
+    - :math:`\mathbf{a}` and :math:`\mathbf{b}` are the sample weights
 
-    .. warning::
-        Note that the M matrix needs to be a C-order numpy.array in float64
-        format.
+    .. note:: This function is backend-compatible and will work on arrays
+        from all compatible backends.
 
-    Uses the algorithm proposed in [1]_
+    Uses the algorithm proposed in :ref:`[1] <references-emd2>`.
 
     Parameters
     ----------
-    a : (ns,) numpy.ndarray, float64
+    a : (ns,) array-like, float64
         Source histogram (uniform weight if empty list)
-    b : (nt,) numpy.ndarray, float64
+    b : (nt,) array-like, float64
         Target histogram (uniform weight if empty list)
-    M : (ns,nt) numpy.ndarray, float64
-        Loss matrix (c-order array with type float64)
-    processes : int, optional (default=nb cpu)
-        Nb of processes used for multiple emd computation (not used on windows)
+    M : (ns,nt) array-like, float64
+        Loss matrix (c-order array in numpy with type float64)
+    processes : int, optional (default=1)
+        Nb of processes used for multiple emd computation (deprecated)
     numItermax : int, optional (default=100000)
         The maximum number of iterations before stopping the optimization
         algorithm if it has not converged.
     log: boolean, optional (default=False)
-        If True, returns a dictionary containing the cost and dual
+        If True, returns a dictionary containing dual
         variables. Otherwise returns only the optimal transportation cost.
     return_matrix: boolean, optional (default=False)
         If True, returns the optimal transportation matrix in the log.
     center_dual: boolean, optional (default=True)
         If True, centers the dual potential using function
         :ref:`center_ot_dual`.
+    numThreads: int or "max", optional (default=1, i.e. OpenMP is not used)
+        If compiled with OpenMP, chooses the number of threads to parallelize.
+        "max" selects the highest number possible.
 
     Returns
     -------
-    gamma: (ns x nt) ndarray
-        Optimal transportation matrix for the given parameters
-    log: dictnp
-        If input log is true, a dictionary containing the cost and dual
+    W: float, array-like
+        Optimal transportation loss for the given parameters
+    log: dict
+        If input log is true, a dictionary containing dual
         variables and exit status
 
 
@@ -354,9 +410,10 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     >>> ot.emd2(a,b,M)
     0.0
 
+
+    .. _references-emd2:
     References
     ----------
-
     .. [1] Bonneel, N., Van De Panne, M., Paris, S., & Heidrich, W.
         (2011, December).  Displacement interpolation using Lagrangian mass
         transport. In ACM Transactions on Graphics (TOG) (Vol. 30, No. 6, p.
@@ -365,15 +422,22 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     See Also
     --------
     ot.bregman.sinkhorn : Entropic regularized OT
-    ot.optim.cg : General regularized OT"""
+    ot.optim.cg : General regularized OT
+    """
+
+    a, b, M = list_to_array(a, b, M)
+
+    a0, b0, M0 = a, b, M
+    nx = get_backend(M0, a0, b0)
+
+    # convert to numpy
+    M = nx.to_numpy(M)
+    a = nx.to_numpy(a)
+    b = nx.to_numpy(b)
 
     a = np.asarray(a, dtype=np.float64)
     b = np.asarray(b, dtype=np.float64)
-    M = np.asarray(M, dtype=np.float64)
-
-    # problem with pikling Forks
-    if sys.platform.endswith('win32'):
-        processes = 1
+    M = np.asarray(M, dtype=np.float64, order='C')
 
     # if empty array given then use uniform distributions
     if len(a) == 0:
@@ -386,11 +450,13 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
 
     asel = a != 0
 
+    numThreads = check_number_threads(numThreads)
+
     if log or return_matrix:
         def f(b):
             bsel = b != 0
-            
-            G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+
+            G, cost, u, v, result_code = emd_c(a, b, M, numItermax, numThreads)
 
             if center_dual:
                 u, v = center_ot_dual(u, v, a, b)
@@ -400,17 +466,20 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
 
             result_code_string = check_result(result_code)
             log = {}
+            G = nx.from_numpy(G, type_as=M0)
             if return_matrix:
                 log['G'] = G
-            log['u'] = u
-            log['v'] = v
+            log['u'] = nx.from_numpy(u, type_as=a0)
+            log['v'] = nx.from_numpy(v, type_as=b0)
             log['warning'] = result_code_string
             log['result_code'] = result_code
+            cost = nx.set_gradients(nx.from_numpy(cost, type_as=M0),
+                                    (a0, b0, M0), (log['u'], log['v'], G))
             return [cost, log]
     else:
         def f(b):
             bsel = b != 0
-            G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+            G, cost, u, v, result_code = emd_c(a, b, M, numItermax, numThreads)
 
             if center_dual:
                 u, v = center_ot_dual(u, v, a, b)
@@ -418,6 +487,11 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
             if np.any(~asel) or np.any(~bsel):
                 u, v = estimate_dual_null_weights(u, v, a, b, M)
 
+            G = nx.from_numpy(G, type_as=M0)
+            cost = nx.set_gradients(nx.from_numpy(cost, type_as=M0),
+                                    (a0, b0, M0), (nx.from_numpy(u, type_as=a0),
+                                                   nx.from_numpy(v, type_as=b0), G))
+
             check_result(result_code)
             return cost
 
@@ -426,35 +500,53 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     nb = b.shape[1]
 
     if processes > 1:
-        res = parmap(f, [b[:, i] for i in range(nb)], processes)
-    else:
-        res = list(map(f, [b[:, i].copy() for i in range(nb)]))
+        warnings.warn(
+            "The 'processes' parameter has been deprecated. "
+            "Multiprocessing should be done outside of POT."
+        )
+    res = list(map(f, [b[:, i].copy() for i in range(nb)]))
 
     return res
 
 
 def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100,
-                            stopThr=1e-7, verbose=False, log=None):
-    """
-    Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance)
+                            stopThr=1e-7, verbose=False, log=None, numThreads=1):
+    r"""
+    Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance), formally:
+
+    .. math::
+        \min_\mathbf{X} \quad \sum_{i=1}^N w_i W_2^2(\mathbf{b}, \mathbf{X}, \mathbf{a}_i, \mathbf{X}_i)
+
+    where :
+
+    - :math:`w \in (0, 1)^{N}` are the barycenter weights and sum to one
+    - the :math:`\mathbf{a}_i \in \mathbb{R}^{k_i}` are the empirical measures weights and sum to one for each :math:`i`
+    - the :math:`\mathbf{X}_i \in \mathbb{R}^{k_i \times d}` are the empirical measures atoms locations
+    - :math:`\mathbf{b} \in \mathbb{R}^{k}` is the desired weights vector of the barycenter
+
+    This problem is considered in :ref:`[1] <references-free-support-barycenter>` (Algorithm 2).
+    This implementation differs from that algorithm in two ways:
 
-    The function solves the Wasserstein barycenter problem when the barycenter measure is constrained to be supported on k atoms.
-    This problem is considered in [1] (Algorithm 2). There are two differences with the following codes:
     - we do not optimize over the weights
-    - we do not do line search for the locations updates, we use i.e. theta = 1 in [1] (Algorithm 2). This can be seen as a discrete implementation of the fixed-point algorithm of [2] proposed in the continuous setting.
+    - we do not do line search for the locations updates, i.e. we use :math:`\theta = 1` in
+      :ref:`[1] <references-free-support-barycenter>` (Algorithm 2). This can be seen as a discrete
+      implementation of the fixed-point algorithm of
+      :ref:`[2] <references-free-support-barycenter>` proposed in the continuous setting.
 
     Parameters
     ----------
-    measures_locations : list of (k_i,d) numpy.ndarray
-        The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list)
-    measures_weights : list of (k_i,) numpy.ndarray
-        Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure
+    measures_locations : list of N (k_i,d) numpy.ndarray
+        The discrete support of a measure supported on :math:`k_i` locations of a `d`-dimensional space
+        (:math:`k_i` can be different for each element of the list)
+    measures_weights : list of N (k_i,) numpy.ndarray
+        Numpy arrays where each numpy array has :math:`k_i` non-negatives values summing to one
+        representing the weights of each discrete input measure
 
     X_init : (k,d) np.ndarray
-        Initialization of the support locations (on k atoms) of the barycenter
+        Initialization of the support locations (on `k` atoms) of the barycenter
     b : (k,) np.ndarray
         Initialization of the weights of the barycenter (non-negatives, sum to 1)
-    weights : (k,) np.ndarray
+    weights : (N,) np.ndarray
         Initialization of the coefficients of the barycenter (non-negatives, sum to 1)
 
     numItermax : int, optional
@@ -465,15 +557,20 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
         Print information along iterations
     log : bool, optional
         record log if True
+    numThreads: int or "max", optional (default=1, i.e. OpenMP is not used)
+        If compiled with OpenMP, chooses the number of threads to parallelize.
+        "max" selects the highest number possible.
+
 
     Returns
     -------
     X : (k,d) np.ndarray
         Support locations (on k atoms) of the barycenter
 
+
+    .. _references-free-support-barycenter:
     References
     ----------
-
     .. [1] Cuturi, Marco, and Arnaud Doucet. "Fast computation of Wasserstein barycenters." International Conference on Machine Learning. 2014.
 
     .. [2]  Álvarez-Esteban, Pedro C., et al. "A fixed-point approach to barycenters in Wasserstein space." Journal of Mathematical Analysis and Applications 441.2 (2016): 744-762.
@@ -504,7 +601,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
         for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights,
                                                                       weights.tolist()):
             M_i = dist(X, measure_locations_i)
-            T_i = emd(b, measure_weights_i, M_i)
+            T_i = emd(b, measure_weights_i, M_i, numThreads=numThreads)
             T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i)
 
         displacement_square_norm = np.sum(np.square(T_sum - X))
@@ -523,287 +620,3 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
         return X, log_dict
     else:
         return X
-
-
-def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
-           log=False):
-    r"""Solves the Earth Movers distance problem between 1d measures and returns
-    the OT matrix
-
-
-    .. math::
-        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
-
-        s.t. \gamma 1 = a,
-             \gamma^T 1= b,
-             \gamma\geq 0
-    where :
-
-    - d is the metric
-    - x_a and x_b are the samples
-    - a and b are the sample weights
-
-    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
-
-    Uses the algorithm detailed in [1]_
-
-    Parameters
-    ----------
-    x_a : (ns,) or (ns, 1) ndarray, float64
-        Source dirac locations (on the real line)
-    x_b : (nt,) or (ns, 1) ndarray, float64
-        Target dirac locations (on the real line)
-    a : (ns,) ndarray, float64, optional
-        Source histogram (default is uniform weight)
-    b : (nt,) ndarray, float64, optional
-        Target histogram (default is uniform weight)
-    metric: str, optional (default='sqeuclidean')
-        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
-        Due to implementation details, this function runs faster when
-        `'sqeuclidean'`, `'cityblock'`,  or `'euclidean'` metrics are used.
-    p: float, optional (default=1.0)
-         The p-norm to apply for if metric='minkowski'
-    dense: boolean, optional (default=True)
-        If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
-        Otherwise returns a sparse representation using scipy's `coo_matrix`
-        format. Due to implementation details, this function runs faster when
-        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
-        are used.
-    log: boolean, optional (default=False)
-        If True, returns a dictionary containing the cost.
-        Otherwise returns only the optimal transportation matrix.
-
-    Returns
-    -------
-    gamma: (ns, nt) ndarray
-        Optimal transportation matrix for the given parameters
-    log: dict
-        If input log is True, a dictionary containing the cost
-
-
-    Examples
-    --------
-
-    Simple example with obvious solution. The function emd_1d accepts lists and
-    performs automatic conversion to numpy arrays
-
-    >>> import ot
-    >>> a=[.5, .5]
-    >>> b=[.5, .5]
-    >>> x_a = [2., 0.]
-    >>> x_b = [0., 3.]
-    >>> ot.emd_1d(x_a, x_b, a, b)
-    array([[0. , 0.5],
-           [0.5, 0. ]])
-    >>> ot.emd_1d(x_a, x_b)
-    array([[0. , 0.5],
-           [0.5, 0. ]])
-
-    References
-    ----------
-
-    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
-        Transport", 2018.
-
-    See Also
-    --------
-    ot.lp.emd : EMD for multidimensional distributions
-    ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the
-        transportation matrix)
-    """
-    a = np.asarray(a, dtype=np.float64)
-    b = np.asarray(b, dtype=np.float64)
-    x_a = np.asarray(x_a, dtype=np.float64)
-    x_b = np.asarray(x_b, dtype=np.float64)
-
-    assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \
-        "emd_1d should only be used with monodimensional data"
-    assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \
-        "emd_1d should only be used with monodimensional data"
-
-    # if empty array given then use uniform distributions
-    if a.ndim == 0 or len(a) == 0:
-        a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0]
-    if b.ndim == 0 or len(b) == 0:
-        b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]
-
-    x_a_1d = x_a.reshape((-1,))
-    x_b_1d = x_b.reshape((-1,))
-    perm_a = np.argsort(x_a_1d)
-    perm_b = np.argsort(x_b_1d)
-
-    G_sorted, indices, cost = emd_1d_sorted(a[perm_a], b[perm_b],
-                                            x_a_1d[perm_a], x_b_1d[perm_b],
-                                            metric=metric, p=p)
-    G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
-                   shape=(a.shape[0], b.shape[0]))
-    if dense:
-        G = G.toarray()
-    if log:
-        log = {'cost': cost}
-        return G, log
-    return G
-
-
-def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
-            log=False):
-    r"""Solves the Earth Movers distance problem between 1d measures and returns
-    the loss
-
-
-    .. math::
-        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
-
-        s.t. \gamma 1 = a,
-             \gamma^T 1= b,
-             \gamma\geq 0
-    where :
-
-    - d is the metric
-    - x_a and x_b are the samples
-    - a and b are the sample weights
-
-    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
-
-    Uses the algorithm detailed in [1]_
-
-    Parameters
-    ----------
-    x_a : (ns,) or (ns, 1) ndarray, float64
-        Source dirac locations (on the real line)
-    x_b : (nt,) or (ns, 1) ndarray, float64
-        Target dirac locations (on the real line)
-    a : (ns,) ndarray, float64, optional
-        Source histogram (default is uniform weight)
-    b : (nt,) ndarray, float64, optional
-        Target histogram (default is uniform weight)
-    metric: str, optional (default='sqeuclidean')
-        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
-        Due to implementation details, this function runs faster when
-        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
-        are used.
-    p: float, optional (default=1.0)
-         The p-norm to apply for if metric='minkowski'
-    dense: boolean, optional (default=True)
-        If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
-        Otherwise returns a sparse representation using scipy's `coo_matrix`
-        format. Only used if log is set to True. Due to implementation details,
-        this function runs faster when dense is set to False.
-    log: boolean, optional (default=False)
-        If True, returns a dictionary containing the transportation matrix.
-        Otherwise returns only the loss.
-
-    Returns
-    -------
-    loss: float
-        Cost associated to the optimal transportation
-    log: dict
-        If input log is True, a dictionary containing the Optimal transportation
-        matrix for the given parameters
-
-
-    Examples
-    --------
-
-    Simple example with obvious solution. The function emd2_1d accepts lists and
-    performs automatic conversion to numpy arrays
-
-    >>> import ot
-    >>> a=[.5, .5]
-    >>> b=[.5, .5]
-    >>> x_a = [2., 0.]
-    >>> x_b = [0., 3.]
-    >>> ot.emd2_1d(x_a, x_b, a, b)
-    0.5
-    >>> ot.emd2_1d(x_a, x_b)
-    0.5
-
-    References
-    ----------
-
-    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
-        Transport", 2018.
-
-    See Also
-    --------
-    ot.lp.emd2 : EMD for multidimensional distributions
-    ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix
-        instead of the cost)
-    """
-    # If we do not return G (log==False), then we should not to cast it to dense
-    # (useless overhead)
-    G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p,
-                        dense=dense and log, log=True)
-    cost = log_emd['cost']
-    if log:
-        log_emd = {'G': G}
-        return cost, log_emd
-    return cost
-
-
-def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.):
-    r"""Solves the p-Wasserstein distance problem between 1d measures and returns
-    the distance
-
-    .. math::
-        \min_\gamma \left( \sum_i \sum_j \gamma_{ij} \|x_a[i] - x_b[j]\|^p \right)^{1/p}
-
-        s.t. \gamma 1 = a,
-             \gamma^T 1= b,
-             \gamma\geq 0
-
-    where :
-
-    - x_a and x_b are the samples
-    - a and b are the sample weights
-
-    Uses the algorithm detailed in [1]_
-
-    Parameters
-    ----------
-    x_a : (ns,) or (ns, 1) ndarray, float64
-        Source dirac locations (on the real line)
-    x_b : (nt,) or (ns, 1) ndarray, float64
-        Target dirac locations (on the real line)
-    a : (ns,) ndarray, float64, optional
-        Source histogram (default is uniform weight)
-    b : (nt,) ndarray, float64, optional
-        Target histogram (default is uniform weight)
-    p: float, optional (default=1.0)
-         The order of the p-Wasserstein distance to be computed
-
-    Returns
-    -------
-    dist: float
-        p-Wasserstein distance
-
-
-    Examples
-    --------
-
-    Simple example with obvious solution. The function wasserstein_1d accepts
-    lists and performs automatic conversion to numpy arrays
-
-    >>> import ot
-    >>> a=[.5, .5]
-    >>> b=[.5, .5]
-    >>> x_a = [2., 0.]
-    >>> x_b = [0., 3.]
-    >>> ot.wasserstein_1d(x_a, x_b, a, b)
-    0.5
-    >>> ot.wasserstein_1d(x_a, x_b)
-    0.5
-
-    References
-    ----------
-
-    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
-        Transport", 2018.
-
-    See Also
-    --------
-    ot.lp.emd_1d : EMD for 1d distributions
-    """
-    cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
-                       dense=False, log=False)
-    return np.power(cost_emd, 1. / p)
diff --git a/ot/lp/cvx.py b/ot/lp/cvx.py
index 8e763be..869d450 100644
--- a/ot/lp/cvx.py
+++ b/ot/lp/cvx.py
@@ -27,7 +27,7 @@ def scipy_sparse_to_spmatrix(A):
 
 
 def barycenter(A, M, weights=None, verbose=False, log=False, solver='interior-point'):
-    """Compute the Wasserstein barycenter of distributions A
+    r"""Compute the Wasserstein barycenter of distributions A
 
      The function solves the following optimization problem [16]:
 
@@ -76,7 +76,6 @@ def barycenter(A, M, weights=None, verbose=False, log=False, solver='interior-po
     .. [16] Agueh, M., & Carlier, G. (2011). Barycenters in the Wasserstein space. SIAM Journal on Mathematical Analysis, 43(2), 904-924.
 
 
-
     """
 
     if weights is None:
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index c167964..42e08f4 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -20,6 +20,7 @@ import warnings
 
 cdef extern from "EMD.h":
     int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) nogil
+    int EMD_wrap_omp(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter, int numThreads) nogil
     cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED
 
 
@@ -38,7 +39,7 @@ def check_result(result_code):
  
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"]  b, np.ndarray[double, ndim=2, mode="c"]  M, int max_iter):
+def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"]  b, np.ndarray[double, ndim=2, mode="c"]  M, int max_iter, int numThreads):
     """
         Solves the Earth Movers distance problem and returns the optimal transport matrix
 
@@ -97,8 +98,6 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
     cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([0, 0])
 
     cdef np.ndarray[double, ndim=1, mode="c"] Gv=np.zeros(0)
-    cdef np.ndarray[long, ndim=1, mode="c"] iG=np.zeros(0,dtype=np.int)
-    cdef np.ndarray[long, ndim=1, mode="c"] jG=np.zeros(0,dtype=np.int)
 
     if not len(a):
         a=np.ones((n1,))/n1
@@ -111,8 +110,10 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
 
     # calling the function
     with nogil:
-        result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
-
+        if numThreads == 1:
+            result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter)
+        else:
+            result_code = EMD_wrap_omp(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter, numThreads)
     return G, cost, alpha, beta, result_code
 
 
@@ -157,22 +158,22 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
         cost associated to the optimal transportation
     """
     cdef double cost = 0.
-    cdef int n = u_weights.shape[0]
-    cdef int m = v_weights.shape[0]
+    cdef Py_ssize_t n = u_weights.shape[0]
+    cdef Py_ssize_t m = v_weights.shape[0]
 
-    cdef int i = 0
+    cdef Py_ssize_t i = 0
     cdef double w_i = u_weights[0]
-    cdef int j = 0
+    cdef Py_ssize_t j = 0
     cdef double w_j = v_weights[0]
 
     cdef double m_ij = 0.
 
     cdef np.ndarray[double, ndim=1, mode="c"] G = np.zeros((n + m - 1, ),
                                                            dtype=np.float64)
-    cdef np.ndarray[long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2),
-                                                              dtype=np.int)
-    cdef int cur_idx = 0
-    while i < n and j < m:
+    cdef np.ndarray[long long, ndim=2, mode="c"] indices = np.zeros((n + m - 1, 2),
+                                                              dtype=np.int64)
+    cdef Py_ssize_t cur_idx = 0
+    while True:
         if metric == 'sqeuclidean':
             m_ij = (u[i] - v[j]) * (u[i] - v[j])
         elif metric == 'cityblock' or metric == 'euclidean':
@@ -188,6 +189,8 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
             indices[cur_idx, 0] = i
             indices[cur_idx, 1] = j
             i += 1
+            if i == n:
+                break
             w_j -= w_i
             w_i = u_weights[i]
         else:
@@ -196,7 +199,10 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
             indices[cur_idx, 0] = i
             indices[cur_idx, 1] = j
             j += 1
+            if j == m:
+                break
             w_i -= w_j
             w_j = v_weights[j]
         cur_idx += 1
+    cur_idx += 1
     return G[:cur_idx], indices[:cur_idx], cost
diff --git a/ot/lp/full_bipartitegraph.h b/ot/lp/full_bipartitegraph.h
index 87a1bec..713ccb5 100644
--- a/ot/lp/full_bipartitegraph.h
+++ b/ot/lp/full_bipartitegraph.h
@@ -23,10 +23,10 @@
  *
  */
 
-#ifndef LEMON_FULL_BIPARTITE_GRAPH_H
-#define LEMON_FULL_BIPARTITE_GRAPH_H
+#pragma once
 
 #include "core.h"
+#include <cstdint>
 
 ///\ingroup graphs
 ///\file
@@ -44,16 +44,16 @@ namespace lemon {
     //class Node;
 	typedef int Node;
     //class Arc;
-	typedef long long Arc;
+	typedef int64_t Arc;
 
   protected:
 
     int _node_num;
-    long long _arc_num;
+    int64_t _arc_num;
 	
     FullBipartiteDigraphBase() {}
 
-    void construct(int n1, int n2) { _node_num = n1+n2; _arc_num = n1 * n2; _n1=n1; _n2=n2;}
+    void construct(int n1, int n2) { _node_num = n1+n2; _arc_num = (int64_t)n1 * (int64_t)n2; _n1=n1; _n2=n2;}
 
   public:
 
@@ -65,25 +65,25 @@ namespace lemon {
 
     Arc arc(const Node& s, const Node& t) const {
 		if (s<_n1 && t>=_n1)
-			return Arc(s * _n2 + (t-_n1) );
+            return Arc((int64_t)s * (int64_t)_n2 + (int64_t)(t-_n1) );
 		else
 			return Arc(-1);
     }
 
     int nodeNum() const { return _node_num; }
-    long long arcNum() const { return _arc_num; }
+    int64_t arcNum() const { return _arc_num; }
 
     int maxNodeId() const { return _node_num - 1; }
-    long long maxArcId() const { return _arc_num - 1; }
+    int64_t maxArcId() const { return _arc_num - 1; }
 
     Node source(Arc arc) const { return arc / _n2; }
     Node target(Arc arc) const { return (arc % _n2) + _n1; }
 
     static int id(Node node) { return node; }
-    static long long id(Arc arc) { return arc; }
+    static int64_t id(Arc arc) { return arc; }
 
     static Node nodeFromId(int id) { return Node(id);}
-    static Arc arcFromId(int id) { return Arc(id);}
+    static Arc arcFromId(int64_t id) { return Arc(id);}
 
 
     Arc findArc(Node s, Node t, Arc prev = -1) const {
@@ -136,7 +136,7 @@ namespace lemon {
   ///
   /// \brief A directed full graph class.
   ///
-  /// FullBipartiteDigraph is a simple and fast implmenetation of directed full
+  /// FullBipartiteDigraph is a simple and fast implementation of directed full
   /// (complete) graphs. It contains an arc from each node to each node
   /// (including a loop for each node), therefore the number of arcs
   /// is the square of the number of nodes.
@@ -203,13 +203,10 @@ namespace lemon {
     /// \brief Number of nodes.
     int nodeNum() const { return Parent::nodeNum(); }
     /// \brief Number of arcs.
-    long long arcNum() const { return Parent::arcNum(); }
+    int64_t arcNum() const { return Parent::arcNum(); }
   };
 
 
 
 
 } //namespace lemon
-
-
-#endif //LEMON_FULL_GRAPH_H
diff --git a/ot/lp/full_bipartitegraph_omp.h b/ot/lp/full_bipartitegraph_omp.h
new file mode 100644
index 0000000..8cbed0b
--- /dev/null
+++ b/ot/lp/full_bipartitegraph_omp.h
@@ -0,0 +1,234 @@
+/* -*- mode: C++; indent-tabs-mode: nil; -*-
+ *
+ * This file has been adapted by Nicolas Bonneel (2013), 
+ * from full_graph.h from LEMON, a generic C++ optimization library,
+ * to implement a lightweight fully connected bipartite graph. A previous
+ * version of this file is used as part of the Displacement Interpolation 
+ * project, 
+ * Web: http://www.cs.ubc.ca/labs/imager/tr/2011/DisplacementInterpolation/
+ * 
+ *
+ **** Original file Copyright Notice :
+ * Copyright (C) 2003-2010
+ * Egervary Jeno Kombinatorikus Optimalizalasi Kutatocsoport
+ * (Egervary Research Group on Combinatorial Optimization, EGRES).
+ *
+ * Permission to use, modify and distribute this software is granted
+ * provided that this copyright notice appears in all copies. For
+ * precise terms see the accompanying LICENSE file.
+ *
+ * This software is provided "AS IS" with no warranty of any kind,
+ * express or implied, and with no claim as to its suitability for any
+ * purpose.
+ *
+ */
+
+#pragma once
+
+#include <cstdint>
+
+///\ingroup graphs
+///\file
+///\brief FullBipartiteDigraph and FullBipartiteGraph classes.
+
+
+namespace lemon_omp {
+
+	///This \c \#define creates convenient type definitions for the following
+	///types of \c Digraph: \c Node,  \c NodeIt, \c Arc, \c ArcIt, \c InArcIt,
+	///\c OutArcIt, \c BoolNodeMap, \c IntNodeMap, \c DoubleNodeMap,
+	///\c BoolArcMap, \c IntArcMap, \c DoubleArcMap.
+	///
+	///\note If the graph type is a dependent type, i.e. the graph type depends
+	///on a template parameter, then use \c TEMPLATE_DIGRAPH_TYPEDEFS()
+	///macro.
+#define DIGRAPH_TYPEDEFS(Digraph)                                       \
+  typedef Digraph::Node Node;                                           \
+  typedef Digraph::Arc Arc;                                             \
+
+
+	///Create convenience typedefs for the digraph types and iterators
+
+	///\see DIGRAPH_TYPEDEFS
+	///
+	///\note Use this macro if the graph type is a dependent type,
+	///i.e. the graph type depends on a template parameter.
+#define TEMPLATE_DIGRAPH_TYPEDEFS(Digraph)                              \
+  typedef typename Digraph::Node Node;                                  \
+  typedef typename Digraph::Arc Arc;                                    \
+
+
+  class FullBipartiteDigraphBase {
+  public:
+
+    typedef FullBipartiteDigraphBase Digraph;
+
+    //class Node;
+	typedef int Node;
+    //class Arc;
+	typedef int64_t Arc;
+
+  protected:
+
+    int _node_num;
+	int64_t _arc_num;
+	
+    FullBipartiteDigraphBase() {}
+
+    void construct(int n1, int n2) { _node_num = n1+n2; _arc_num = (int64_t)n1 * (int64_t)n2; _n1=n1; _n2=n2;}
+
+  public:
+
+	int _n1, _n2;
+
+
+    Node operator()(int ix) const { return Node(ix); }
+    static int index(const Node& node) { return node; }
+
+    Arc arc(const Node& s, const Node& t) const {
+		if (s<_n1 && t>=_n1)
+			return Arc((int64_t)s * (int64_t)_n2 + (int64_t)(t-_n1) );
+		else
+			return Arc(-1);
+    }
+
+    int nodeNum() const { return _node_num; }
+	int64_t arcNum() const { return _arc_num; }
+
+    int maxNodeId() const { return _node_num - 1; }
+	int64_t maxArcId() const { return _arc_num - 1; }
+
+    Node source(Arc arc) const { return arc / _n2; }
+    Node target(Arc arc) const { return (arc % _n2) + _n1; }
+
+    static int id(Node node) { return node; }
+    static int64_t id(Arc arc) { return arc; }
+
+    static Node nodeFromId(int id) { return Node(id);}
+    static Arc arcFromId(int64_t id) { return Arc(id);}
+
+
+    Arc findArc(Node s, Node t, Arc prev = -1) const {
+      return prev == -1 ? arc(s, t) : -1;
+    }
+
+    void first(Node& node) const {
+      node = _node_num - 1;
+    }
+
+    static void next(Node& node) {
+      --node;
+    }
+
+    void first(Arc& arc) const {
+      arc = _arc_num - 1;
+    }
+
+    static void next(Arc& arc) {
+      --arc;
+    }
+
+    void firstOut(Arc& arc, const Node& node) const {
+		if (node>=_n1)
+			arc = -1;
+		else
+			arc = (node + 1) * _n2 - 1;
+    }
+
+    void nextOut(Arc& arc) const {
+      if (arc % _n2 == 0) arc = 0;
+      --arc;
+    }
+
+    void firstIn(Arc& arc, const Node& node) const {
+		if (node<_n1)
+			arc = -1;
+		else
+			arc = _arc_num + node - _node_num;
+    }
+
+    void nextIn(Arc& arc) const {
+      arc -= _n2;
+      if (arc < 0) arc = -1;
+    }
+
+  };
+
+  /// \ingroup graphs
+  ///
+  /// \brief A directed full graph class.
+  ///
+  /// FullBipartiteDigraph is a simple and fast implementation of directed full
+  /// (complete) graphs. It contains an arc from each node to each node
+  /// (including a loop for each node), therefore the number of arcs
+  /// is the square of the number of nodes.
+  /// This class is completely static and it needs constant memory space.
+  /// Thus you can neither add nor delete nodes or arcs, however
+  /// the structure can be resized using resize().
+  ///
+  /// This type fully conforms to the \ref concepts::Digraph "Digraph concept".
+  /// Most of its member functions and nested classes are documented
+  /// only in the concept class.
+  ///
+  /// This class provides constant time counting for nodes and arcs.
+  ///
+  /// \note FullBipartiteDigraph and FullBipartiteGraph classes are very similar,
+  /// but there are two differences. While this class conforms only
+  /// to the \ref concepts::Digraph "Digraph" concept, FullBipartiteGraph
+  /// conforms to the \ref concepts::Graph "Graph" concept,
+  /// moreover FullBipartiteGraph does not contain a loop for each
+  /// node as this class does.
+  ///
+  /// \sa FullBipartiteGraph
+  class FullBipartiteDigraph : public FullBipartiteDigraphBase {
+    typedef FullBipartiteDigraphBase Parent;
+
+  public:
+
+    /// \brief Default constructor.
+    ///
+    /// Default constructor. The number of nodes and arcs will be zero.
+    FullBipartiteDigraph() { construct(0,0); }
+
+    /// \brief Constructor
+    ///
+    /// Constructor.
+    /// \param n1,n2 The numbers of nodes in the first and second part of the bipartite graph.
+    FullBipartiteDigraph(int n1, int n2) { construct(n1, n2); }
+
+
+    /// \brief Returns the node with the given index.
+    ///
+    /// Returns the node with the given index. Since this structure is
+    /// completely static, the nodes can be indexed with integers from
+    /// the range <tt>[0..nodeNum()-1]</tt>.
+    /// The index of a node is the same as its ID.
+    /// \sa index()
+    Node operator()(int ix) const { return Parent::operator()(ix); }
+
+    /// \brief Returns the index of the given node.
+    ///
+    /// Returns the index of the given node. Since this structure is
+    /// completely static, the nodes can be indexed with integers from
+    /// the range <tt>[0..nodeNum()-1]</tt>.
+    /// The index of a node is the same as its ID.
+    /// \sa operator()()
+    static int index(const Node& node) { return Parent::index(node); }
+
+    /// \brief Returns the arc connecting the given nodes.
+    ///
+    /// Returns the arc connecting the given nodes.
+    /*Arc arc(Node u, Node v) const {
+      return Parent::arc(u, v);
+    }*/
+
+    /// \brief Number of nodes.
+    int nodeNum() const { return Parent::nodeNum(); }
+    /// \brief Number of arcs.
+	int64_t arcNum() const { return Parent::arcNum(); }
+  };
+
+
+
+
+} //namespace lemon_omp
diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h
index 5d93040..3b46b9b 100644
--- a/ot/lp/network_simplex_simple.h
+++ b/ot/lp/network_simplex_simple.h
@@ -25,15 +25,17 @@
  *
  */
 
-#ifndef LEMON_NETWORK_SIMPLEX_SIMPLE_H
-#define LEMON_NETWORK_SIMPLEX_SIMPLE_H
+#pragma once
+#undef DEBUG_LVL
 #define DEBUG_LVL 0
 
 #if DEBUG_LVL>0
 #include <iomanip>
 #endif
 
-
+#undef EPSILON
+#undef _EPSILON
+#undef MAX_DEBUG_ITER
 #define EPSILON 2.2204460492503131e-15
 #define _EPSILON 1e-8
 #define MAX_DEBUG_ITER 100000
@@ -50,6 +52,7 @@
 #include <vector>
 #include <limits>
 #include <algorithm>
+#include <iostream>
 #include <cstdio>
 #ifdef HASHMAP
 #include <hash_map>
@@ -63,6 +66,8 @@
 //#include "sparse_array_n.h"
 #include "full_bipartitegraph.h"
 
+#undef INVALIDNODE
+#undef INVALID
 #define INVALIDNODE -1
 #define INVALID (-1)
 
@@ -76,16 +81,16 @@ namespace lemon {
 	class SparseValueVector
 	{
 	public:
-		SparseValueVector(int n=0)
+		SparseValueVector(size_t n=0)
 		{
 		}
-		void resize(int n=0){};
-		T operator[](const int id) const
+		void resize(size_t n=0){};
+        T operator[](const size_t id) const
 		{
 #ifdef HASHMAP
-			typename stdext::hash_map<int,T>::const_iterator it = data.find(id);
+            typename stdext::hash_map<size_t,T>::const_iterator it = data.find(id);
 #else
-			typename std::map<int,T>::const_iterator it = data.find(id);
+            typename std::map<size_t,T>::const_iterator it = data.find(id);
 #endif
 			if (it==data.end())
 				return 0;
@@ -93,16 +98,16 @@ namespace lemon {
 				return it->second;
 		}
 
-		ProxyObject<T> operator[](const int id)
+		ProxyObject<T> operator[](const size_t id)
 		{
 			return ProxyObject<T>( this, id );
 		}
 
         //private:
 #ifdef HASHMAP
-		stdext::hash_map<int,T> data;
+        stdext::hash_map<size_t,T> data;
 #else
-		std::map<int,T> data;
+        std::map<size_t,T> data;
 #endif
 
 	};
@@ -110,7 +115,7 @@ namespace lemon {
 	template <typename T>
 	class ProxyObject {
 	public:
-		ProxyObject( SparseValueVector<T> *v, int idx ){_v=v; _idx=idx;};
+        ProxyObject( SparseValueVector<T> *v, size_t idx ){_v=v; _idx=idx;};
 		ProxyObject<T> & operator=( const T &v ) {
 			// If we get here, we know that operator[] was called to perform a write access,
 			// so we can insert an item in the vector if needed
@@ -123,9 +128,9 @@ namespace lemon {
 			// If we get here, we know that operator[] was called to perform a read access,
 			// so we can simply return the existing object
 #ifdef HASHMAP
-			typename stdext::hash_map<int,T>::iterator it = _v->data.find(_idx);
+            typename stdext::hash_map<size_t,T>::iterator it = _v->data.find(_idx);
 #else
-			typename std::map<int,T>::iterator it = _v->data.find(_idx);
+            typename std::map<size_t,T>::iterator it = _v->data.find(_idx);
 #endif
 			if (it==_v->data.end())
 				return 0;
@@ -137,9 +142,9 @@ namespace lemon {
 		{
 			if (val==0) return;
 #ifdef HASHMAP
-			typename stdext::hash_map<int,T>::iterator it = _v->data.find(_idx);
+            typename stdext::hash_map<size_t,T>::iterator it = _v->data.find(_idx);
 #else
-			typename std::map<int,T>::iterator it = _v->data.find(_idx);
+            typename std::map<size_t,T>::iterator it = _v->data.find(_idx);
 #endif
 			if (it==_v->data.end())
 				_v->data[_idx] = val;
@@ -156,9 +161,9 @@ namespace lemon {
 		{
 			if (val==0) return;
 #ifdef HASHMAP
-			typename stdext::hash_map<int,T>::iterator it = _v->data.find(_idx);
+            typename stdext::hash_map<size_t,T>::iterator it = _v->data.find(_idx);
 #else
-			typename std::map<int,T>::iterator it = _v->data.find(_idx);
+            typename std::map<size_t,T>::iterator it = _v->data.find(_idx);
 #endif
 			if (it==_v->data.end())
 				_v->data[_idx] = -val;
@@ -173,7 +178,7 @@ namespace lemon {
 		}
 
 		SparseValueVector<T> *_v;
-		int _idx;
+        size_t _idx;
 	};
 
 
@@ -204,7 +209,7 @@ namespace lemon {
     ///
     /// \tparam GR The digraph type the algorithm runs on.
     /// \tparam V The number type used for flow amounts, capacity bounds
-    /// and supply values in the algorithm. By default, it is \c int.
+    /// and supply values in the algorithm. By default, it is \c int.
     /// \tparam C The number type used for costs and potentials in the
     /// algorithm. By default, it is the same as \c V.
     ///
@@ -214,7 +219,7 @@ namespace lemon {
     /// \note %NetworkSimplexSimple provides five different pivot rule
     /// implementations, from which the most efficient one is used
     /// by default. For more information, see \ref PivotRule.
-    template <typename GR, typename V = int, typename C = V, typename NodesType = unsigned short int>
+    template <typename GR, typename V = int, typename C = V, typename NodesType = unsigned short int, typename ArcsType = int64_t>
     class NetworkSimplexSimple
     {
     public:
@@ -228,7 +233,7 @@ namespace lemon {
         /// mixed order in the internal data structure.
         /// In special cases, it could lead to better overall performance,
         /// but it is usually slower. Therefore it is disabled by default.
-        NetworkSimplexSimple(const GR& graph, bool arc_mixing, int nbnodes, long long nb_arcs,int maxiters) :
+        NetworkSimplexSimple(const GR& graph, bool arc_mixing, int nbnodes, ArcsType nb_arcs, size_t maxiters) :
         _graph(graph),  //_arc_id(graph),
         _arc_mixing(arc_mixing), _init_nb_nodes(nbnodes), _init_nb_arcs(nb_arcs),
         MAX(std::numeric_limits<Value>::max()),
@@ -288,11 +293,11 @@ namespace lemon {
 
     private:
 
-        int max_iter;
+        size_t max_iter;
         TEMPLATE_DIGRAPH_TYPEDEFS(GR);
 
         typedef std::vector<int> IntVector;
-        typedef std::vector<NodesType> UHalfIntVector;
+        typedef std::vector<ArcsType> ArcVector;
         typedef std::vector<Value> ValueVector;
         typedef std::vector<Cost> CostVector;
         //	typedef SparseValueVector<Cost> CostVector;
@@ -315,9 +320,9 @@ namespace lemon {
         // Data related to the underlying digraph
         const GR &_graph;
         int _node_num;
-        int _arc_num;
-        int _all_arc_num;
-        int _search_arc_num;
+        ArcsType _arc_num;
+        ArcsType _all_arc_num;
+        ArcsType _search_arc_num;
 
         // Parameters of the problem
         SupplyType _stype;
@@ -325,9 +330,9 @@ namespace lemon {
 
         inline int _node_id(int n) const {return _node_num-n-1;} ;
 
-	    //IntArcMap _arc_id;
-        UHalfIntVector _source;
-        UHalfIntVector _target;
+// 	    IntArcMap _arc_id;
+        IntVector _source;  // keep nodes as integers
+        IntVector _target;
         bool _arc_mixing;
     public:
         // Node and arc data
@@ -341,7 +346,7 @@ namespace lemon {
     private:
         // Data for storing the spanning tree structure
         IntVector _parent;
-        IntVector _pred;
+        ArcVector _pred;
         IntVector _thread;
         IntVector _rev_thread;
         IntVector _succ_num;
@@ -349,17 +354,17 @@ namespace lemon {
         IntVector _dirty_revs;
         BoolVector _forward;
         StateVector _state;
-        int _root;
+        ArcsType _root;
 
         // Temporary data used in the current pivot iteration
-        int in_arc, join, u_in, v_in, u_out, v_out;
-        int first, second, right, last;
-        int stem, par_stem, new_stem;
+        ArcsType in_arc, join, u_in, v_in, u_out, v_out;
+        ArcsType first, second, right, last;
+        ArcsType stem, par_stem, new_stem;
         Value delta;
 
         const Value MAX;
 
-        int mixingCoeff;
+        ArcsType mixingCoeff;
 
     public:
 
@@ -373,27 +378,27 @@ namespace lemon {
     private:
 
         // thank you to DVK and MizardX from StackOverflow for this function!
-        inline int sequence(int k) const {
-            int smallv = (k > num_total_big_subsequence_numbers) & 1;
+        inline ArcsType sequence(ArcsType k) const {
+            ArcsType smallv = (k > num_total_big_subsequence_numbers) & 1;
 
             k -= num_total_big_subsequence_numbers * smallv;
-            int subsequence_length2 = subsequence_length- smallv;
-            int subsequence_num = (k / subsequence_length2) + num_big_subseqiences * smallv;
-            int subsequence_offset = (k % subsequence_length2) * mixingCoeff;
+            ArcsType subsequence_length2 = subsequence_length- smallv;
+            ArcsType subsequence_num = (k / subsequence_length2) + num_big_subseqiences * smallv;
+            ArcsType subsequence_offset = (k % subsequence_length2) * mixingCoeff;
 
             return subsequence_offset + subsequence_num;
         }
-        int subsequence_length;
-        int num_big_subseqiences;
-        int num_total_big_subsequence_numbers;
+        ArcsType subsequence_length;
+        ArcsType num_big_subseqiences;
+        ArcsType num_total_big_subsequence_numbers;
 
-        inline int getArcID(const Arc &arc) const
+        inline ArcsType getArcID(const Arc &arc) const
         {
             //int n = _arc_num-arc._id-1;
-            int n = _arc_num-GR::id(arc)-1;
+            ArcsType n = _arc_num-GR::id(arc)-1;
 
-            //int a = mixingCoeff*(n%mixingCoeff) + n/mixingCoeff;
-            //int b = _arc_id[arc];
+            //ArcsType a = mixingCoeff*(n%mixingCoeff) + n/mixingCoeff;
+            //ArcsType b = _arc_id[arc];
             if (_arc_mixing)
                 return sequence(n);
             else
@@ -401,16 +406,16 @@ namespace lemon {
         }
 
         // finally unused because too slow
-        inline int getSource(const int arc) const
+        inline ArcsType getSource(const ArcsType arc) const
         {
-            //int a = _source[arc];
+            //ArcsType a = _source[arc];
             //return a;
 
-            int n = _arc_num-arc-1;
+            ArcsType n = _arc_num-arc-1;
             if (_arc_mixing)
                 n = mixingCoeff*(n%mixingCoeff) + n/mixingCoeff;
 
-            int b;
+            ArcsType b;
             if (n>=0)
                 b = _node_id(_graph.source(GR::arcFromId( n ) ));
             else
@@ -436,17 +441,17 @@ namespace lemon {
         private:
 
             // References to the NetworkSimplexSimple class
-            const UHalfIntVector  &_source;
-            const UHalfIntVector  &_target;
+            const IntVector  &_source;
+            const IntVector  &_target;
             const CostVector &_cost;
             const StateVector &_state;
             const CostVector &_pi;
-            int &_in_arc;
-            int _search_arc_num;
+            ArcsType &_in_arc;
+            ArcsType _search_arc_num;
 
             // Pivot rule data
-            int _block_size;
-            int _next_arc;
+            ArcsType _block_size;
+            ArcsType _next_arc;
             NetworkSimplexSimple &_ns;
 
         public:
@@ -460,17 +465,16 @@ namespace lemon {
             {
                 // The main parameters of the pivot rule
                 const double BLOCK_SIZE_FACTOR = 1.0;
-                const int MIN_BLOCK_SIZE = 10;
+                const ArcsType MIN_BLOCK_SIZE = 10;
 
-                _block_size = std::max( int(BLOCK_SIZE_FACTOR *
-                                            std::sqrt(double(_search_arc_num))),
-                                       MIN_BLOCK_SIZE );
+                _block_size = std::max(ArcsType(BLOCK_SIZE_FACTOR * std::sqrt(double(_search_arc_num))), MIN_BLOCK_SIZE);
             }
+
             // Find next entering arc
             bool findEnteringArc() {
                 Cost c, min = 0;
-                int e;
-                int cnt = _block_size;
+                ArcsType e;
+                ArcsType cnt = _block_size;
                 double a;
                     for (e = _next_arc; e != _search_arc_num; ++e) {
                         c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]);
@@ -516,7 +520,7 @@ namespace lemon {
 
 
         int _init_nb_nodes;
-        long long _init_nb_arcs;
+        ArcsType _init_nb_arcs;
 
         /// \name Parameters
         /// The parameters of the algorithm can be specified using these
@@ -736,7 +740,7 @@ namespace lemon {
             for (int i = 0; i != _node_num; ++i) {
                 _supply[i] = 0;
             }
-            for (int i = 0; i != _arc_num; ++i) {
+            for (ArcsType i = 0; i != _arc_num; ++i) {
                 _cost[i] = 1;
             }
             _stype = GEQ;
@@ -745,7 +749,7 @@ namespace lemon {
 
 
 
-        int divid (int x, int y)
+        int64_t divid (int64_t x, int64_t y)
         {
             return (x-x%y)/y;
         }
@@ -775,7 +779,7 @@ namespace lemon {
             _node_num = _init_nb_nodes;
             _arc_num = _init_nb_arcs;
             int all_node_num = _node_num + 1;
-            int max_arc_num = _arc_num + 2 * _node_num;
+            ArcsType max_arc_num = _arc_num + 2 * _node_num;
 
             _source.resize(max_arc_num);
             _target.resize(max_arc_num);
@@ -798,13 +802,13 @@ namespace lemon {
             //_arc_mixing=false;
             if (_arc_mixing) {
                 // Store the arcs in a mixed order
-                int k = std::max(int(std::sqrt(double(_arc_num))), 10);
+                const ArcsType k = std::max(ArcsType(std::sqrt(double(_arc_num))), ArcsType(10));
                 mixingCoeff = k;
                 subsequence_length = _arc_num / mixingCoeff + 1;
                 num_big_subseqiences = _arc_num % mixingCoeff;
                 num_total_big_subsequence_numbers = subsequence_length * num_big_subseqiences;
 
-                int i = 0, j = 0;
+                ArcsType i = 0, j = 0;
                 Arc a; _graph.first(a);
                 for (; a != INVALID; _graph.next(a)) {
                     _source[i] = _node_id(_graph.source(a));
@@ -814,7 +818,7 @@ namespace lemon {
                 }
             } else {
                 // Store the arcs in the original order
-                int i = 0;
+                ArcsType i = 0;
                 Arc a; _graph.first(a);
                 for (; a != INVALID; _graph.next(a), ++i) {
                     _source[i] = _node_id(_graph.source(a));
@@ -856,7 +860,7 @@ namespace lemon {
          Number totalCost() const {
          Number c = 0;
          for (ArcIt a(_graph); a != INVALID; ++a) {
-         int i = getArcID(a);
+         int64_t i = getArcID(a);
          c += Number(_flow[i]) * Number(_cost[i]);
          }
          return c;
@@ -867,15 +871,15 @@ namespace lemon {
             Number c = 0;
 
             /*#ifdef HASHMAP
-             typename stdext::hash_map<int, Value>::const_iterator it;
+             typename stdext::hash_map<int64_t, Value>::const_iterator it;
              #else
-             typename std::map<int, Value>::const_iterator it;
+             typename std::map<int64_t, Value>::const_iterator it;
              #endif
              for (it = _flow.data.begin(); it!=_flow.data.end(); ++it)
              c += Number(it->second) * Number(_cost[it->first]);
              return c;*/
 
-            for (unsigned long i=0; i<_flow.size(); i++)
+            for (ArcsType i=0; i<_flow.size(); i++)
                 c += _flow[i] * Number(_cost[i]);
             return c;
 
@@ -944,14 +948,14 @@ namespace lemon {
         // Initialize internal data structures
         bool init() {
             if (_node_num == 0) return false;
-            
+
             // Check the sum of supply values
             _sum_supply = 0;
             for (int i = 0; i != _node_num; ++i) {
                 _sum_supply += _supply[i];
             }
             if ( fabs(_sum_supply) > _EPSILON ) return false;
-            
+
 			_sum_supply = 0;
 
             // Initialize artifical cost
@@ -960,14 +964,14 @@ namespace lemon {
                 ART_COST = std::numeric_limits<Cost>::max() / 2 + 1;
             } else {
                 ART_COST = 0;
-                for (int i = 0; i != _arc_num; ++i) {
+                for (ArcsType i = 0; i != _arc_num; ++i) {
                     if (_cost[i] > ART_COST) ART_COST = _cost[i];
                 }
                 ART_COST = (ART_COST + 1) * _node_num;
             }
 
             // Initialize arc maps
-            for (int i = 0; i != _arc_num; ++i) {
+            for (ArcsType i = 0; i != _arc_num; ++i) {
                 //_flow[i] = 0; //by default, the sparse matrix is empty
                 _state[i] = STATE_LOWER;
             }
@@ -988,7 +992,7 @@ namespace lemon {
                 // EQ supply constraints
                 _search_arc_num = _arc_num;
                 _all_arc_num = _arc_num + _node_num;
-                for (int u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
+                for (ArcsType u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
                     _parent[u] = _root;
                     _pred[u] = e;
                     _thread[u] = u + 1;
@@ -1016,8 +1020,8 @@ namespace lemon {
             else if (_sum_supply > 0) {
                 // LEQ supply constraints
                 _search_arc_num = _arc_num + _node_num;
-                int f = _arc_num + _node_num;
-                for (int u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
+                ArcsType f = _arc_num + _node_num;
+                for (ArcsType u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
                     _parent[u] = _root;
                     _thread[u] = u + 1;
                     _rev_thread[u + 1] = u;
@@ -1054,8 +1058,8 @@ namespace lemon {
             else {
                 // GEQ supply constraints
                 _search_arc_num = _arc_num + _node_num;
-                int f = _arc_num + _node_num;
-                for (int u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
+                ArcsType f = _arc_num + _node_num;
+                for (ArcsType u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
                     _parent[u] = _root;
                     _thread[u] = u + 1;
                     _rev_thread[u + 1] = u;
@@ -1120,9 +1124,9 @@ namespace lemon {
                 second = _source[in_arc];
             }
             delta = INF;
-            int result = 0;
+            char result = 0;
             Value d;
-            int e;
+            ArcsType e;
 
             // Search the cycle along the path form the first node to the root
             for (int u = first; u != join; u = _parent[u]) {
@@ -1239,7 +1243,7 @@ namespace lemon {
 
             // Update _rev_thread using the new _thread values
             for (int i = 0; i != int(_dirty_revs.size()); ++i) {
-                u = _dirty_revs[i];
+                int u = _dirty_revs[i];
                 _rev_thread[_thread[u]] = u;
             }
 
@@ -1257,7 +1261,7 @@ namespace lemon {
                 u = w;
             }
             _pred[u_in] = in_arc;
-            _forward[u_in] = ((unsigned int)u_in == _source[in_arc]);
+            _forward[u_in] = (u_in == _source[in_arc]);
             _succ_num[u_in] = old_succ_num;
 
             // Set limits for updating _last_succ form v_in and v_out
@@ -1328,7 +1332,7 @@ namespace lemon {
             if (_sum_supply > 0) total -= _sum_supply;
             if (total <= 0) return true;
 
-            IntVector arc_vector;
+            ArcVector arc_vector;
             if (_sum_supply >= 0) {
                 if (supply_nodes.size() == 1 && demand_nodes.size() == 1) {
                     // Perform a reverse graph search from the sink to the source
@@ -1345,7 +1349,7 @@ namespace lemon {
                         Arc a; _graph.firstIn(a, v);
                         for (; a != INVALID; _graph.nextIn(a)) {
                             if (reached[u = _graph.source(a)]) continue;
-                            int j = getArcID(a);
+                            ArcsType j = getArcID(a);
                             if (INF >= total) {
                                 arc_vector.push_back(j);
                                 reached[u] = true;
@@ -1355,7 +1359,7 @@ namespace lemon {
                     }
                 } else {
                     // Find the min. cost incomming arc for each demand node
-                    for (int i = 0; i != int(demand_nodes.size()); ++i) {
+                    for (int i = 0; i != demand_nodes.size(); ++i) {
                         Node v = demand_nodes[i];
                         Cost c, min_cost = std::numeric_limits<Cost>::max();
                         Arc min_arc = INVALID;
@@ -1393,7 +1397,7 @@ namespace lemon {
             }
 
             // Perform heuristic initial pivots
-            for (int i = 0; i != int(arc_vector.size()); ++i) {
+            for (ArcsType i = 0; i != arc_vector.size(); ++i) {
                 in_arc = arc_vector[i];
                 // l'erreur est probablement ici...
                 if (_state[in_arc] * (_cost[in_arc] + _pi[_source[in_arc]] -
@@ -1423,7 +1427,7 @@ namespace lemon {
             // Perform heuristic initial pivots
             if (!initialPivots()) return UNBOUNDED;
 
-            int iter_number=0;
+            size_t iter_number=0;
             //pivot.setDantzig(true);
             // Execute the Network Simplex algorithm
             while (pivot.findEnteringArc()) {
@@ -1443,7 +1447,7 @@ namespace lemon {
                     double a;
                     a= (fabs(_pi[_source[in_arc]])>=fabs(_pi[_target[in_arc]])) ? fabs(_pi[_source[in_arc]]) : fabs(_pi[_target[in_arc]]);
                     a=a>=fabs(_cost[in_arc])?a:fabs(_cost[in_arc]);
-                    for (int i=0; i<_flow.size(); i++) {
+                    for (int64_t i=0; i<_flow.size(); i++) {
                         sumFlow+=_state[i]*_flow[i];
                     }
                     std::cout << "Sum of the flow " << std::setprecision(20) << sumFlow << "\n" << iter_number << " iterations, current cost=" << curCost << "\nReduced cost=" << _state[in_arc] * (_cost[in_arc] + _pi[_source[in_arc]] -_pi[_target[in_arc]]) << "\nPrecision = "<< -EPSILON*(a) << "\n";
@@ -1482,12 +1486,12 @@ namespace lemon {
                 double a;
                 a= (fabs(_pi[_source[in_arc]])>=fabs(_pi[_target[in_arc]])) ? fabs(_pi[_source[in_arc]]) : fabs(_pi[_target[in_arc]]);
                 a=a>=fabs(_cost[in_arc])?a:fabs(_cost[in_arc]);
-                for (int i=0; i<_flow.size(); i++) {
+                for (int64_t i=0; i<_flow.size(); i++) {
                     sumFlow+=_state[i]*_flow[i];
                 }
-            
+
                 std::cout << "Sum of the flow " << std::setprecision(20) << sumFlow << "\n" << niter << " iterations, current cost=" << curCost << "\nReduced cost=" << _state[in_arc] * (_cost[in_arc] + _pi[_source[in_arc]] -_pi[_target[in_arc]]) << "\nPrecision = "<< -EPSILON*(a) << "\n";
-            
+
                 std::cout << "Arc in = (" << _node_id(_source[in_arc]) << ", " << _node_id(_target[in_arc]) <<")\n";
                 std::cout << "Supplies = (" << _supply[_source[in_arc]] << ", " << _supply[_target[in_arc]] << ")\n";
 
@@ -1505,9 +1509,9 @@ namespace lemon {
 #endif
             // Check feasibility
 			if( retVal == OPTIMAL){
-                for (int e = _search_arc_num; e != _all_arc_num; ++e) {
+                for (ArcsType e = _search_arc_num; e != _all_arc_num; ++e) {
                     if (_flow[e] != 0){
-                        if (abs(_flow[e]) > EPSILON)
+                        if (fabs(_flow[e]) > _EPSILON) // change of the original code following issue #126
                             return INFEASIBLE;
                         else
                             _flow[e]=0;
@@ -1521,20 +1525,20 @@ namespace lemon {
             if (_sum_supply == 0) {
                 if (_stype == GEQ) {
                     Cost max_pot = -std::numeric_limits<Cost>::max();
-                    for (int i = 0; i != _node_num; ++i) {
+                    for (ArcsType i = 0; i != _node_num; ++i) {
                         if (_pi[i] > max_pot) max_pot = _pi[i];
                     }
                     if (max_pot > 0) {
-                        for (int i = 0; i != _node_num; ++i)
+                        for (ArcsType i = 0; i != _node_num; ++i)
                             _pi[i] -= max_pot;
                     }
                 } else {
                     Cost min_pot = std::numeric_limits<Cost>::max();
-                    for (int i = 0; i != _node_num; ++i) {
+                    for (ArcsType i = 0; i != _node_num; ++i) {
                         if (_pi[i] < min_pot) min_pot = _pi[i];
                     }
                     if (min_pot < 0) {
-                        for (int i = 0; i != _node_num; ++i)
+                        for (ArcsType i = 0; i != _node_num; ++i)
                             _pi[i] -= min_pot;
                     }
                 }
@@ -1548,5 +1552,3 @@ namespace lemon {
     ///@}
 
 } //namespace lemon
-
-#endif //LEMON_NETWORK_SIMPLEX_H
diff --git a/ot/lp/network_simplex_simple_omp.h b/ot/lp/network_simplex_simple_omp.h
new file mode 100644
index 0000000..87e4c05
--- /dev/null
+++ b/ot/lp/network_simplex_simple_omp.h
@@ -0,0 +1,1699 @@
+/* -*- mode: C++; indent-tabs-mode: nil; -*-
+*
+*
+* This file has been adapted by Nicolas Bonneel (2013),
+* from network_simplex.h from LEMON, a generic C++ optimization library,
+* to implement a lightweight network simplex for mass transport, more
+* memory efficient than the original file. A previous version of this file
+* is used as part of the Displacement Interpolation project,
+* Web: http://www.cs.ubc.ca/labs/imager/tr/2011/DisplacementInterpolation/
+*
+* Revisions:
+* March 2015: added OpenMP parallelization
+* March 2017: included Antoine Rolet's trick to make it more robust
+* April 2018: IMPORTANT bug fix + uses 64bit integers (slightly slower but less risks of overflows), updated to a newer version of the algo by LEMON, sparse flow by default + minor edits.
+*
+*
+**** Original file Copyright Notice :
+*
+* Copyright (C) 2003-2010
+* Egervary Jeno Kombinatorikus Optimalizalasi Kutatocsoport
+* (Egervary Research Group on Combinatorial Optimization, EGRES).
+*
+* Permission to use, modify and distribute this software is granted
+* provided that this copyright notice appears in all copies. For
+* precise terms see the accompanying LICENSE file.
+*
+* This software is provided "AS IS" with no warranty of any kind,
+* express or implied, and with no claim as to its suitability for any
+* purpose.
+*
+*/
+
+#pragma once
+#undef DEBUG_LVL
+#define DEBUG_LVL 0
+
+#if DEBUG_LVL>0
+#include <iomanip>
+#endif
+
+#undef EPSILON
+#undef _EPSILON
+#undef MAX_DEBUG_ITER
+#define EPSILON std::numeric_limits<Cost>::epsilon()*10
+#define _EPSILON 1e-8
+#define MAX_DEBUG_ITER 100000
+
+/// \ingroup min_cost_flow_algs
+///
+/// \file
+/// \brief Network Simplex algorithm for finding a minimum cost flow.
+
+// if your compiler has trouble with std::unordered_map, just comment out the following line to use a slower std::map instead
+#define HASHMAP        // now handled with unorderedmaps instead of stdext::hash_map. Should be better supported.
+
+#define SPARSE_FLOW    // a sparse flow vector will be 10-15% slower for small problems but uses less memory and becomes faster for large problems (40k total nodes)
+
+#include <vector>
+#include <limits>
+#include <algorithm>
+#include <iostream>
+#ifdef HASHMAP
+#include <unordered_map>
+#else
+#include <map>
+#endif
+//#include "core.h"
+//#include "lmath.h"
+
+#ifdef OMP
+#include <omp.h>
+#endif
+#include <cmath>
+
+
+//#include "sparse_array_n.h"
+#include "full_bipartitegraph_omp.h"
+
+#undef INVALIDNODE
+#undef INVALID
+#define INVALIDNODE -1
+#define INVALID (-1)
+
+namespace lemon_omp {
+
+    int64_t max_threads = -1;  // cache for omp_get_max_threads(): stays -1 until a solver constructor fills it (OMP builds only). NOTE(review): non-inline definition in a header - confirm it is included from a single translation unit.
+
+	template <typename T>
+	class ProxyObject;
+
+	template<typename T>
+	class SparseValueVector
+	{
+	public:
+		// Map-backed sparse vector: an index with no stored entry reads as zero.
+		//
+		// The size arguments below are accepted and ignored, so that the class
+		// can be used where a standard vector would be constructed or resized.
+		SparseValueVector(size_t n = 0) {}
+		void resize(size_t n = 0) {}
+
+		// Read access: the value stored at id, or zero when there is none.
+		T operator[](const size_t id) const
+		{
+			const auto entry = data.find(id);
+			if (entry != data.end())
+				return entry->second;
+			return 0;
+		}
+
+		// Non-const access: returns a proxy bound to this vector and to id.
+		ProxyObject<T> operator[](const size_t id)
+		{
+			return ProxyObject<T>(this, id);
+		}
+
+		//private:
+#ifdef HASHMAP
+		std::unordered_map<size_t, T> data;
+#else
+		std::map<size_t, T> data;
+#endif
+
+	};
+
+	template <typename T>
+	class ProxyObject {  // handle returned by the non-const SparseValueVector<T>::operator[]
+	public:
+		ProxyObject(SparseValueVector<T> *v, size_t idx) { _v = v; _idx = idx; };
+		ProxyObject<T> & operator=(const T &v) {
+			// Write access. A non-zero value is inserted or overwritten; assigning zero
+			// erases the entry so that no previously stored value survives the assignment.
+			if (v != 0)
+				_v->data[_idx] = v;
+			else
+				_v->data.erase(_idx);
+			return *this;
+		}
+		// Read access: yields the stored value, or zero when the index has no entry.
+		operator T() {
+#ifdef HASHMAP
+			typename std::unordered_map<size_t, T>::iterator it = _v->data.find(_idx);
+#else
+			typename std::map<size_t, T>::iterator it = _v->data.find(_idx);
+#endif
+			if (it == _v->data.end())
+				return 0;
+			else
+				return it->second;
+		}
+		// Adds val to the slot; an entry whose value becomes zero is erased from the map.
+		void operator+=(T val)
+		{
+			if (val == 0) return;
+#ifdef HASHMAP
+			typename std::unordered_map<size_t, T>::iterator it = _v->data.find(_idx);
+#else
+			typename std::map<size_t, T>::iterator it = _v->data.find(_idx);
+#endif
+			if (it == _v->data.end())
+				_v->data[_idx] = val;
+			else
+			{
+				T sum = it->second + val;
+				if (sum == 0)
+					_v->data.erase(it);
+				else
+					it->second = sum;
+			}
+		}
+		void operator-=(T val)  // subtracts val; same erase-on-zero rule as operator+=
+		{
+			if (val == 0) return;
+#ifdef HASHMAP
+			typename std::unordered_map<size_t, T>::iterator it = _v->data.find(_idx);
+#else
+			typename std::map<size_t, T>::iterator it = _v->data.find(_idx);
+#endif
+			if (it == _v->data.end())
+				_v->data[_idx] = -val;
+			else
+			{
+				T sum = it->second - val;
+				if (sum == 0)
+					_v->data.erase(it);
+				else
+					it->second = sum;
+			}
+		}
+
+		SparseValueVector<T> *_v;
+		size_t _idx;
+	};
+
+
+
+	/// \addtogroup min_cost_flow_algs
+	/// @{
+
+	/// \brief Implementation of the primal Network Simplex algorithm
+	/// for finding a \ref min_cost_flow "minimum cost flow".
+	///
+	/// \ref NetworkSimplexSimple implements the primal Network Simplex algorithm
+	/// for finding a \ref min_cost_flow "minimum cost flow"
+	/// \ref amo93networkflows, \ref dantzig63linearprog,
+	/// \ref kellyoneill91netsimplex.
+	/// This algorithm is a highly efficient specialized version of the
+	/// linear programming simplex method directly for the minimum cost
+	/// flow problem.
+	///
+	/// In general, %NetworkSimplexSimple is the fastest implementation available
+	/// in LEMON for this problem.
+	/// Moreover, it supports both directions of the supply/demand inequality
+	/// constraints. For more information, see \ref SupplyType.
+	///
+	/// Most of the parameters of the problem (except for the digraph)
+	/// can be given using separate functions, and the algorithm can be
+	/// executed using the \ref run() function. If some parameters are not
+	/// specified, then default values will be used.
+	///
+	/// \tparam GR The digraph type the algorithm runs on.
+	/// \tparam V The number type used for flow amounts, capacity bounds
+	/// and supply values in the algorithm. By default, it is \c int.
+	/// \tparam C The number type used for costs and potentials in the
+	/// algorithm. By default, it is the same as \c V.
+	///
+	/// \warning Both number types must be signed and all input data must
+	/// be integer.
+	///
+	/// \note %NetworkSimplexSimple provides five different pivot rule
+	/// implementations, from which the most efficient one is used
+	/// by default. For more information, see \ref PivotRule.
+	template <typename GR, typename V = int, typename C = V, typename ArcsType = int64_t>
+	class NetworkSimplexSimple
+	{
+	public:
+
+		/// \brief Constructor.
+		///
+		/// \param graph The digraph the algorithm runs on.
+		/// \param arc_mixing Whether the arcs are stored in a mixed order in the
+		/// internal data structure (can help in special cases, usually slower).
+		/// \param nbnodes Initial number of nodes. \param nb_arcs Initial number of arcs.
+		/// \param maxiters Stored in max_iter.
+		/// \param numThreads Thread count request, honoured only when OMP is defined:
+		/// 1..max is used as given, -1 or more than max gives max, anything else gives 1.
+		NetworkSimplexSimple(const GR& graph, bool arc_mixing, int nbnodes, ArcsType nb_arcs, size_t maxiters = 0, int numThreads=-1) :
+			_graph(graph),  //_arc_id(graph),
+			_arc_mixing(arc_mixing), _init_nb_nodes(nbnodes), _init_nb_arcs(nb_arcs),
+			MAX(std::numeric_limits<Value>::max()),
+			INF(std::numeric_limits<Value>::has_infinity ?
+				std::numeric_limits<Value>::infinity() : MAX)
+		{
+			// Bring every internal structure to its initial state first
+			reset();
+			max_iter = maxiters;
+#ifdef OMP
+            if (max_threads < 0) {
+                max_threads = omp_get_max_threads();
+            }
+            if (numThreads == -1 || numThreads > max_threads) {
+                num_threads = max_threads;
+            } else if (numThreads > 0) {
+                num_threads = numThreads;
+            } else {
+                num_threads = 1;
+            }
+            omp_set_num_threads(num_threads);
+#else
+            num_threads = 1;
+#endif
+		}
+
+		/// The type of the flow amounts, capacity bounds and supply values
+		typedef V Value;
+		/// The type of the arc costs
+		typedef C Cost;
+
+	public:
+		/// \brief Problem type constants for the \c run() function.
+		///
+		/// Enum type containing the problem type constants that can be
+		/// returned by the \ref run() function of the algorithm.
+		enum ProblemType {
+			/// The problem has no feasible solution (flow).
+			INFEASIBLE,
+			/// The problem has an optimal solution (i.e. it is feasible and
+			/// bounded), and the algorithm has found an optimal flow and node
+			/// potentials (primal and dual solutions).
+			OPTIMAL,
+			/// The objective function of the problem is unbounded, i.e.
+			/// there is a directed cycle having negative total cost and
+			/// infinite upper bound.
+			UNBOUNDED,
+			/// The maximum number of iterations has been reached.
+			MAX_ITER_REACHED
+		};
+
+		/// \brief Constants for selecting the type of the supply constraints.
+		///
+		/// Enum type containing constants for selecting the supply type,
+		/// i.e. the direction of the inequalities in the supply/demand
+		/// constraints of the \ref min_cost_flow "minimum cost flow problem".
+		///
+		/// The default supply type is \c GEQ, the \c LEQ type can be
+		/// selected using \ref supplyType().
+		/// The equality form is a special case of both supply types.
+		enum SupplyType {
+			/// This option means that there are <em>"greater or equal"</em>
+			/// supply/demand constraints in the definition of the problem.
+			GEQ,
+			/// This option means that there are <em>"less or equal"</em>
+			/// supply/demand constraints in the definition of the problem.
+			LEQ
+		};
+
+
+
+	private:
+		size_t max_iter;
+		int num_threads;
+		TEMPLATE_DIGRAPH_TYPEDEFS(GR);
+
+		typedef std::vector<int> IntVector;
+		typedef std::vector<ArcsType> ArcVector;
+		typedef std::vector<Value> ValueVector;
+		typedef std::vector<Cost> CostVector;
+		//	typedef SparseValueVector<Cost> CostVector;
+		typedef std::vector<char> BoolVector;
+		// Note: vector<char> is used instead of vector<bool> for efficiency reasons
+
+		// State constants for arcs
+		enum ArcState {
+			STATE_UPPER = -1,
+			STATE_TREE = 0,
+			STATE_LOWER = 1
+		};
+
+		typedef std::vector<signed char> StateVector;
+		// Note: vector<signed char> is used instead of vector<ArcState> for
+		// efficiency reasons
+
+	private:
+
+		// Data related to the underlying digraph
+		const GR &_graph;
+		int _node_num;
+		ArcsType _arc_num;
+		ArcsType _all_arc_num;
+		ArcsType _search_arc_num;
+
+		// Parameters of the problem
+		SupplyType _stype;
+		Value _sum_supply;
+
+		inline int _node_id(int n) const { return _node_num - n - 1; };  // mirrors a node index: n -> _node_num - 1 - n (applying it twice gives n back)
+
+		//IntArcMap _arc_id;
+		IntVector _source;  // keep nodes as integers
+		IntVector _target;
+		bool _arc_mixing;
+
+		// Node and arc data
+		CostVector _cost;
+		ValueVector _supply;
+#ifdef SPARSE_FLOW
+		SparseValueVector<Value> _flow;
+#else
+		ValueVector _flow;
+#endif
+
+		CostVector _pi;
+
+		// Data for storing the spanning tree structure
+		IntVector _parent;
+		ArcVector _pred;
+		IntVector _thread;
+		IntVector _rev_thread;
+		IntVector _succ_num;
+		IntVector _last_succ;
+		IntVector _dirty_revs;
+		BoolVector _forward;
+		StateVector _state;
+		ArcsType _root;
+
+		// Temporary data used in the current pivot iteration
+		ArcsType in_arc, join, u_in, v_in, u_out, v_out;
+		ArcsType first, second, right, last;
+		ArcsType stem, par_stem, new_stem;
+		Value delta;
+
+		const Value MAX;
+
+		ArcsType mixingCoeff;
+
+	public:
+
+		/// \brief Constant for infinite upper bounds (capacities).
+		///
+		/// Constant for infinite upper bounds (capacities).
+		/// It is \c std::numeric_limits<Value>::infinity() if available,
+		/// \c std::numeric_limits<Value>::max() otherwise.
+		const Value INF;
+
+	private:
+
+		// thank you to DVK and MizardX from StackOverflow for this function!
+		inline ArcsType sequence(ArcsType k) const {
+			// 1 when k exceeds num_total_big_subsequence_numbers, 0 otherwise
+			const ArcsType in_small = (k > num_total_big_subsequence_numbers) ? 1 : 0;
+			const ArcsType len = subsequence_length - in_small;
+			const ArcsType rel = k - num_total_big_subsequence_numbers * in_small;
+			const ArcsType row = (rel / len) + num_big_subsequences * in_small;
+			const ArcsType col = (rel % len) * mixingCoeff;
+
+			return col + row;
+		}
+		ArcsType subsequence_length;
+		ArcsType num_big_subsequences;
+		ArcsType num_total_big_subsequence_numbers;
+
+		inline ArcsType getArcID(const Arc &arc) const
+		{
+			// Graph arc ids are mirrored: id 0 maps to _arc_num - 1, and so on.
+			const ArcsType mirrored = _arc_num - GR::id(arc) - 1;
+
+			// Without arc mixing the mirrored index is the result; with it,
+			// the index is additionally remapped by sequence().
+			if (!_arc_mixing)
+				return mirrored;
+
+			return sequence(mirrored);
+		}
+
+		// finally unused because too slow: derives the source node of internal arc `arc` from the graph
+		inline ArcsType getSource(const ArcsType arc) const
+		{
+			//ArcsType a = _source[arc];
+			//return a;
+			// (the commented-out lines above are the stored-array lookup this function would replace)
+			ArcsType n = _arc_num - arc - 1;  // same mirroring as getArcID applies to graph arc ids
+			if (_arc_mixing)
+				n = mixingCoeff*(n%mixingCoeff) + n / mixingCoeff;  // NOTE(review): presumably undoes the permutation of sequence() - confirm
+
+			ArcsType b;
+			if (n >= 0)  // holds exactly when arc < _arc_num: the arc exists in the graph
+				b = _node_id(_graph.source(GR::arcFromId(n)));
+			else
+			{
+				n = arc + 1 - _arc_num;  // arc >= _arc_num: 1-based offset past the graph arcs
+				if (n <= _node_num)
+					b = _node_num;
+				else
+					if (n >= _graph._n1)
+						b = _graph._n1;
+					else
+						b = _graph._n1 - n;
+			}
+
+			return b;
+		}
+
+
+
+		/* Implementation of the Block Search pivot rule.
+		 *
+		 * The candidate arcs [0, _search_arc_num) are examined in consecutive
+		 * blocks of _block_size arcs, starting at _next_arc and wrapping around.
+		 * Inside a block the reduced costs are evaluated in an OpenMP parallel
+		 * region; every thread keeps its own running minimum in minArray[t] /
+		 * arcId[t] and the per-thread results are merged serially afterwards.
+		 * The search stops after the first block whose best value passes the
+		 * relative tolerance test.
+		 */
+		class BlockSearchPivotRule
+		{
+		private:
+
+			// References to the NetworkSimplexSimple class
+			const IntVector  &_source;
+			const IntVector  &_target;
+			const CostVector &_cost;
+			const StateVector &_state;
+			const CostVector &_pi;
+			ArcsType &_in_arc; // alias of ns.in_arc: the chosen entering arc is written here
+			ArcsType _search_arc_num; // copied (not referenced) from ns at construction
+
+			// Pivot rule data
+			ArcsType _block_size; // number of arcs examined per block
+			ArcsType _next_arc; // arc at which the next search starts
+			NetworkSimplexSimple &_ns;
+
+		public:
+
+			// Constructor
+			BlockSearchPivotRule(NetworkSimplexSimple &ns) :
+				_source(ns._source), _target(ns._target),
+				_cost(ns._cost), _state(ns._state), _pi(ns._pi),
+				_in_arc(ns.in_arc), _search_arc_num(ns._search_arc_num),
+				_next_arc(0), _ns(ns)
+			{
+				// The main parameters of the pivot rule
+				const double BLOCK_SIZE_FACTOR = 1;
+				const ArcsType MIN_BLOCK_SIZE = 10;
+
+				_block_size = std::max(ArcsType(BLOCK_SIZE_FACTOR *	std::sqrt(double(_search_arc_num))), MIN_BLOCK_SIZE); // about sqrt(#candidate arcs), at least MIN_BLOCK_SIZE
+			}
+
+			/* Find next entering arc.
+			 *
+			 * Returns true when an arc was selected (its index is stored in
+			 * _in_arc), false when no block produced a value below the tolerance.
+			 * NOTE(review): the tolerance test reads _in_arc even if no thread
+			 * found a negative value during this call, in which case _in_arc
+			 * still holds whatever was stored in ns.in_arc before - confirm that
+			 * it is always a valid index at that point.
+			 */
+			bool findEnteringArc() {
+				Cost min_val = 0; // smallest value accepted so far in this call
+
+                ArcsType N = _ns.num_threads; // number of per-thread result slots
+
+				std::vector<Cost> minArray(N, 0); // per-thread minimum; not reset between blocks
+				std::vector<ArcsType> arcId(N); // arc that produced minArray[t]
+				ArcsType bs = (ArcsType)ceil(_block_size / (double)N); // chunk size handed to schedule(static, bs)
+
+				for (ArcsType i = 0; i < _search_arc_num; i += _block_size) {
+
+					ArcsType e;
+					int j;
+#pragma omp parallel
+					{
+#ifdef OMP
+						int t = omp_get_thread_num(); // NOTE(review): must stay below N; presumably the team size is capped to _ns.num_threads elsewhere - confirm
+#else
+						int t = 0;
+#endif
+
+#pragma omp for schedule(static, bs) lastprivate(e)
+						for (j = 0; j < std::min(i + _block_size, _search_arc_num) - i; j++) {
+							e = (_next_arc + i + j); if (e >= _search_arc_num) e -= _search_arc_num; // wrap around the end of the candidate range
+							Cost c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]); // reduced cost multiplied by the arc state
+							if (c < minArray[t]) {
+								minArray[t] = c;
+								arcId[t] = e;
+							}
+						}
+					}
+					// Serial merge of the per-thread minima
+					for (int j = 0; j < N; j++) {
+						if (minArray[j] < min_val) {
+							min_val = minArray[j];
+							_in_arc = arcId[j];
+						}
+					}
+					// Tolerance scale a = max(|pi[source]|, |pi[target]|, |cost|) of _in_arc
+					Cost a = std::abs(_pi[_source[_in_arc]]) > std::abs(_pi[_target[_in_arc]]) ? std::abs(_pi[_source[_in_arc]]) : std::abs(_pi[_target[_in_arc]]);
+					a = a > std::abs(_cost[_in_arc]) ? a : std::abs(_cost[_in_arc]);
+					if (min_val < -EPSILON*a) {
+						_next_arc = e; // lastprivate value: the last arc of the block just scanned
+						return true;
+					}
+				}
+
+				// Same tolerance test once more after all blocks were scanned
+				Cost a = fabs(_pi[_source[_in_arc]]) > fabs(_pi[_target[_in_arc]]) ? fabs(_pi[_source[_in_arc]]) : fabs(_pi[_target[_in_arc]]);
+				a = a > fabs(_cost[_in_arc]) ? a : fabs(_cost[_in_arc]);
+				if (min_val >= -EPSILON*a) return false;
+
+				return true;
+			}
+
+
+			// Find next entering arc (disabled alternative that keeps thread-local minima inside the parallel region)
+			/*bool findEnteringArc() {
+				Cost min_val = 0;
+				int N = omp_get_max_threads();
+				std::vector<Cost> minArray(N);
+				std::vector<ArcsType> arcId(N);
+
+				ArcsType bs = (ArcsType)ceil(_block_size / (double)N);
+				for (ArcsType i = 0; i < _search_arc_num; i += _block_size) {
+
+					ArcsType maxJ = std::min(i + _block_size, _search_arc_num) - i;
+					ArcsType j;
+#pragma omp parallel
+					{
+						int t = omp_get_thread_num();
+						Cost minV = 0;
+						ArcsType arcStart = _next_arc + i;
+						ArcsType arc = -1;
+#pragma omp for schedule(static, bs)
+						for (j = 0; j < maxJ; j++) {
+							ArcsType e = arcStart + j; if (e >= _search_arc_num) e -= _search_arc_num;
+							Cost c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]);
+							if (c < minV) {
+								minV = c;
+								arc = e;
+							}
+						}
+
+						minArray[t] = minV;
+						arcId[t] = arc;
+					}
+					for (int j = 0; j < N; j++) {
+						if (minArray[j] < min_val) {
+							min_val = minArray[j];
+							_in_arc = arcId[j];
+						}
+					}
+
+					//FIX by Antoine Rolet to avoid precision issues
+					Cost a = std::max(std::abs(_cost[_in_arc]), std::max(std::abs(_pi[_source[_in_arc]]), std::abs(_pi[_target[_in_arc]])));
+					if (min_val <-std::numeric_limits<Cost>::epsilon()*a) {
+						_next_arc = _next_arc + i + maxJ - 1;
+						if (_next_arc >= _search_arc_num) _next_arc -= _search_arc_num;
+						return true;
+					}
+				}
+
+				if (min_val >= 0) {
+					return false;
+				}
+
+				return true;
+			}*/
+			
+
+			/*bool findEnteringArc() {
+				Cost c, min = 0;
+				int cnt = _block_size;
+				int e, min_arc = _next_arc;
+				for (e = _next_arc; e < _search_arc_num; ++e) {
+					c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]);
+					if (c < min) {
+						min = c;
+						min_arc = e;
+
+					}
+					if (--cnt == 0) {
+						if (min < 0) break;
+						cnt = _block_size;
+
+					}
+
+				}
+				if (min == 0 || cnt > 0) {
+					for (e = 0; e < _next_arc; ++e) {
+						c = _state[e] * (_cost[e] + _pi[_source[e]] - _pi[_target[e]]);
+						if (c < min) {
+							min = c;
+							min_arc = e;
+
+						}
+						if (--cnt == 0) {
+							if (min < 0) break;
+							cnt = _block_size;
+
+						}
+
+					}
+
+				}
+				if (min >= 0) return false;
+				_in_arc = min_arc;
+				_next_arc = e;
+				return true;
+			}*/
+
+
+
+		}; //class BlockSearchPivotRule
+
+
+
+	public:
+
+
+
+		int _init_nb_nodes;
+		ArcsType _init_nb_arcs;
+
+		/// \name Parameters
+		/// The parameters of the algorithm can be specified using these
+		/// functions.
+
+		/// @{
+
+
+		/// \brief Assign a cost to every arc from a map.
+		///
+		/// Visits all arcs of the graph and copies \c map[a] into the internal
+		/// cost vector at the position given by getArcID().
+		/// Costs that are never set keep the value written by
+		/// \ref resetParams(), i.e. \c 1.
+		///
+		/// \param map An arc-indexed container of costs.
+		/// Its values must be convertible to the \c Cost type
+		/// of the algorithm.
+		///
+		/// \return <tt>(*this)</tt>
+		template<typename CostMap>
+		NetworkSimplexSimple& costMap(const CostMap& map) {
+			Arc arc;
+			_graph.first(arc);
+			while (arc != INVALID) {
+				_cost[getArcID(arc)] = map[arc];
+				_graph.next(arc);
+			}
+			return *this;
+		}
+
+
+		/// \brief Set the cost of a single arc.
+		///
+		/// Stores \c cost for \c arc in the internal cost vector.
+		/// Provided for memory reasons, so that no cost map has to be built.
+		///
+		/// \param arc The arc whose cost is set.
+		/// \param cost The cost assigned to \c arc.
+		///
+		/// \return <tt>(*this)</tt>
+		template<typename Value>
+		NetworkSimplexSimple& setCost(const Arc& arc, const Value cost) {
+			const ArcsType slot = getArcID(arc);
+			_cost[slot] = cost;
+			return *this;
+		}
+
+
+		/// \brief Assign a supply value to every node from a map.
+		///
+		/// Visits all nodes of the graph and copies \c map[n] into the
+		/// internal supply vector.
+		/// Supplies that are never set keep the value written by
+		/// \ref resetParams(), i.e. zero.
+		///
+		/// \param map A node-indexed container of supply values.
+		/// Its values must be convertible to the \c Value type
+		/// of the algorithm.
+		///
+		/// \return <tt>(*this)</tt>
+		template<typename SupplyMap>
+		NetworkSimplexSimple& supplyMap(const SupplyMap& map) {
+			Node node;
+			_graph.first(node);
+			while (node != INVALIDNODE) {
+				_supply[_node_id(node)] = map[node];
+				_graph.next(node);
+			}
+			return *this;
+		}
+		// Sets the supplies from two plain arrays: nodes below n1 read
+		// map1[n], all other nodes read map2[n - n1]. n2 is not used.
+		template<typename SupplyMap>
+		NetworkSimplexSimple& supplyMap(const SupplyMap* map1, int n1, const SupplyMap* map2, int n2) {
+			Node node;
+			_graph.first(node);
+			while (node != INVALIDNODE) {
+				_supply[_node_id(node)] = (node < n1) ? map1[node] : map2[node - n1];
+				_graph.next(node);
+			}
+			return *this;
+		}
+		// Sets a constant supply on each side: nodes below n1 receive val1,
+		// all other nodes receive val2. n2 is not used.
+		template<typename SupplyMap>
+		NetworkSimplexSimple& supplyMapAll(SupplyMap val1, int n1, SupplyMap val2, int n2) {
+			Node node;
+			_graph.first(node);
+			while (node != INVALIDNODE) {
+				_supply[_node_id(node)] = (node < n1) ? val1 : val2;
+				_graph.next(node);
+			}
+			return *this;
+		}
+
+		/// \brief Route a fixed amount of flow between two nodes.
+		///
+		/// Clears the supply of every node, then gives \c s the supply \c k
+		/// and \c t the supply \c -k. The effect is the same as calling
+		/// \ref supplyMap() with a map that is zero everywhere except on
+		/// these two nodes.
+		/// If neither this function nor \ref supplyMap() is used before
+		/// calling \ref run(), the supply of each node is zero.
+		///
+		/// \param s The source node.
+		/// \param t The target node.
+		/// \param k The required amount of flow from node \c s to node \c t
+		/// (i.e. the supply of \c s and the demand of \c t).
+		///
+		/// \return <tt>(*this)</tt>
+		NetworkSimplexSimple& stSupply(const Node& s, const Node& t, Value k) {
+			int node = 0;
+			while (node != _node_num) {
+				_supply[node] = 0;
+				++node;
+			}
+			_supply[_node_id(s)] = k;
+			_supply[_node_id(t)] = -k;
+			return *this;
+		}
+
+		/// \brief Choose how supply/demand constraints are interpreted.
+		///
+		/// Stores the requested constraint type. When this function is not
+		/// called, the value set by \ref resetParams() (\ref GEQ) is used.
+		///
+		/// For more information, see \ref SupplyType.
+		///
+		/// \param supply_type The constraint type to store.
+		///
+		/// \return <tt>(*this)</tt>
+		NetworkSimplexSimple& supplyType(SupplyType supply_type) {
+			this->_stype = supply_type;
+			return *this;
+		}
+
+		/// @}
+
+		/// \name Execution Control
+		/// The algorithm can be executed using \ref run().
+
+		/// @{
+
+		/// \brief Run the algorithm.
+		///
+		/// This function first builds the initial basis with init() and, if
+		/// that succeeds, executes the network simplex iterations through
+		/// start().
+		/// The parameters can be specified beforehand using \ref costMap(),
+		/// \ref setCost(), \ref supplyMap(), \ref stSupply() and
+		/// \ref supplyType().
+		/// For example,
+		/// \code
+		///   ns.costMap(cost).supplyMap(sup).run();
+		/// \endcode
+		///
+		/// This function can be called more than once. All the given parameters
+		/// are kept for the next call, unless \ref resetParams() or \ref reset()
+		/// is used, thus only the modified parameters have to be set again.
+		/// If the underlying digraph was also modified after the construction
+		/// of the class (or the last \ref reset() call), then the \ref reset()
+		/// function must be called.
+		///
+		/// This function takes no arguments; the pivot rule is chosen by
+		/// start().
+		///
+		/// \return \c INFEASIBLE if init() reports failure, otherwise the
+		/// value returned by start():
+		/// \n \c OPTIMAL if the problem has optimal solution
+		/// (i.e. it is feasible and bounded), and the algorithm has found
+		/// optimal flow and node potentials (primal and dual solutions),
+		/// \n \c UNBOUNDED if the objective function of the problem is
+		/// unbounded,
+		/// \n or any other \ref ProblemType value start() produces
+		/// (presumably \c MAX_ITER_REACHED when the iteration limit is hit;
+		/// confirm against start()).
+		///
+		/// \see ProblemType
+		/// \see resetParams(), reset()
+		ProblemType run() {
+#if DEBUG_LVL>0
+			// Print the numeric value of every status code once per run
+			std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED = " << MAX_ITER_REACHED << "\n" ;
+#endif
+			if (!init()) return INFEASIBLE;
+#if DEBUG_LVL>0
+			std::cout << "Init done, starting iterations\n";
+#endif
+
+			return start();
+		}
+
+		/// \brief Restore the default problem parameters.
+		///
+		/// After this call every node supply is zero, every arc cost is
+		/// \c 1 and the supply type is \ref GEQ, i.e. everything given
+		/// through \ref costMap(), \ref setCost(), \ref supplyMap(),
+		/// \ref stSupply() and \ref supplyType() is discarded.
+		///
+		/// It is useful for multiple \ref run() calls. Basically, all the given
+		/// parameters are kept for the next \ref run() call, unless
+		/// \ref resetParams() or \ref reset() is used.
+		/// If the underlying digraph was also modified after the construction
+		/// of the class or the last \ref reset() call, then the \ref reset()
+		/// function must be used, otherwise \ref resetParams() is sufficient.
+		///
+		/// For example,
+		/// \code
+		///   // First run
+		///   ns.costMap(cost).supplyMap(sup).run();
+		///
+		///   // Run again with modified cost map (resetParams() is not called,
+		///   // so only the cost map have to be set again)
+		///   cost[e] += 100;
+		///   ns.costMap(cost).run();
+		///
+		///   // Run again from scratch using resetParams()
+		///   ns.resetParams();
+		///   ns.costMap(cost).supplyMap(sup).run();
+		/// \endcode
+		///
+		/// \return <tt>(*this)</tt>
+		///
+		/// \see reset(), run()
+		NetworkSimplexSimple& resetParams() {
+			int node = 0;
+			while (node != _node_num) {
+				_supply[node] = 0;
+				++node;
+			}
+
+			ArcsType arc = 0;
+			while (arc != _arc_num) {
+				_cost[arc] = 1;
+				++arc;
+			}
+
+			_stype = GEQ;
+			return *this;
+		}
+
+
+		/// \brief Rebuild the internal data structures and restore the
+		/// default parameters.
+		///
+		/// The node and arc counts are taken from \c _init_nb_nodes and
+		/// \c _init_nb_arcs, all working vectors are resized accordingly
+		/// (one extra node slot, and two extra arc slots per node), the
+		/// source/target tables are refilled from the graph, and finally
+		/// \ref resetParams() is called.
+		///
+		/// It is useful for multiple \ref run() calls. Basically, all the given
+		/// parameters are kept for the next \ref run() call, unless
+		/// \ref resetParams() or \ref reset() is used.
+		/// If the underlying digraph was also modified after the construction
+		/// of the class or the last \ref reset() call, then the \ref reset()
+		/// function must be used, otherwise \ref resetParams() is sufficient.
+		///
+		/// See \ref resetParams() for examples.
+		///
+		/// \return <tt>(*this)</tt>
+		///
+		/// \see resetParams(), run()
+		NetworkSimplexSimple& reset() {
+			_node_num = _init_nb_nodes;
+			_arc_num = _init_nb_arcs;
+			const int node_slots = _node_num + 1;
+			const ArcsType arc_slots = _arc_num + 2 * _node_num;
+
+			// Arc-indexed vectors
+			_source.resize(arc_slots);
+			_target.resize(arc_slots);
+			_cost.resize(arc_slots);
+			_flow.resize(arc_slots);
+			_state.resize(arc_slots);
+
+			// Node-indexed vectors
+			_supply.resize(node_slots);
+			_pi.resize(node_slots);
+			_parent.resize(node_slots);
+			_pred.resize(node_slots);
+			_forward.resize(node_slots);
+			_thread.resize(node_slots);
+			_rev_thread.resize(node_slots);
+			_succ_num.resize(node_slots);
+			_last_succ.resize(node_slots);
+
+			if (_arc_mixing && _node_num > 1) {
+				// Store the arcs in a mixed order; the parameters below are
+				// the ones sequence() relies on
+				mixingCoeff = std::max(ArcsType(_arc_num / _node_num), ArcsType(3));
+				subsequence_length = _arc_num / mixingCoeff + 1;
+				num_big_subsequences = _arc_num % mixingCoeff;
+				num_total_big_subsequence_numbers = subsequence_length * num_big_subsequences;
+
+#pragma omp parallel for schedule(static)
+				for (Arc a = 0; a <= _graph.maxArcId(); a++) {   // arcs are addressed directly by id here
+					const ArcsType slot = sequence(_graph.maxArcId()-a);
+					_source[slot] = _node_id(_graph.source(a));
+					_target[slot] = _node_id(_graph.target(a));
+				}
+			} else {
+				// Store the arcs in the order the graph enumerates them
+				ArcsType slot = 0;
+				Arc a;
+				_graph.first(a);
+				while (a != INVALID) {
+					_source[slot] = _node_id(_graph.source(a));
+					_target[slot] = _node_id(_graph.target(a));
+					_graph.next(a);
+					++slot;
+				}
+			}
+
+			// Back to default supplies, costs and supply type
+			resetParams();
+			return *this;
+		}
+
+		/// @}
+
+		/// \name Query Functions
+		/// The results of the algorithm can be obtained using these
+		/// functions.\n
+		/// The \ref run() function must be called before using them.
+
+		/// @{
+
+		/// \brief Return the total cost of the found flow.
+		///
+		/// Sums flow times cost over the entries of the flow container:
+		/// over the stored entries when \c SPARSE_FLOW is defined, over
+		/// every index of the flow vector otherwise.
+		///
+		/// \note The return type of the function can be specified as a
+		/// template parameter. For example,
+		/// \code
+		///   ns.totalCost<double>();
+		/// \endcode
+		/// It is useful if the total cost cannot be stored in the \c Cost
+		/// type of the algorithm, which is the default return type of the
+		/// function.
+		///
+		/// \pre \ref run() must be called before using this function.
+		template <typename Number>
+		Number totalCost() const {
+			Number total = 0;
+
+#ifdef SPARSE_FLOW
+		#ifdef HASHMAP
+			typedef typename std::unordered_map<size_t, Value>::const_iterator FlowEntryIt;
+		#else
+			typedef typename std::map<size_t, Value>::const_iterator FlowEntryIt;
+		#endif
+			for (FlowEntryIt entry = _flow.data.begin(); entry != _flow.data.end(); ++entry)
+				total += Number(entry->second) * Number(_cost[entry->first]);
+#else
+			for (ArcsType arc = 0; arc<_flow.size(); arc++)
+				total += _flow[arc] * Number(_cost[arc]);
+#endif
+			return total;
+		}
+
+#ifndef DOXYGEN
+		// Convenience overload: total cost expressed in the Cost type.
+		Cost totalCost() const {
+			const Cost total = totalCost<Cost>();
+			return total;
+		}
+#endif
+
+		/// \brief Return the flow on the given arc.
+		///
+		/// Looks up the internal index of \c a with getArcID() and returns
+		/// the flow stored there.
+		///
+		/// \pre \ref run() must be called before using this function.
+		Value flow(const Arc& a) const {
+			const ArcsType slot = getArcID(a);
+			return _flow[slot];
+		}
+
+		/// \brief Copy the flow of every arc (the primal solution) into a map.
+		///
+		/// Visits all arcs of the graph and calls \c map.set(a, flow) for
+		/// each of them. The \c Value type of the algorithm must be
+		/// convertible to the \c Value type of the map.
+		///
+		/// \pre \ref run() must be called before using this function.
+		template <typename FlowMap>
+		void flowMap(FlowMap &map) const {
+			Arc arc;
+			_graph.first(arc);
+			while (arc != INVALID) {
+				map.set(arc, _flow[getArcID(arc)]);
+				_graph.next(arc);
+			}
+		}
+
+		/// \brief Return the potential (dual value) of the given node.
+		///
+		/// Reads the entry of the internal potential vector that belongs
+		/// to \c n.
+		///
+		/// \pre \ref run() must be called before using this function.
+		Cost potential(const Node& n) const {
+			const Cost dual_value = _pi[_node_id(n)];
+			return dual_value;
+		}
+
+		/// \brief Return the potential map (the dual solution).
+		///
+		/// This function copies the potential (dual value) of each node
+		/// into the given map by calling \c map.set(n, potential).
+		/// The \c Cost type of the algorithm must be convertible to the
+		/// \c Value type of the map.
+		///
+		/// \pre \ref run() must be called before using this function.
+		template <typename PotentialMap>
+		void potentialMap(PotentialMap &map) const {
+			Node n; _graph.first(n);
+			// Node iteration ends on INVALIDNODE, like every other node loop
+			// of this class (INVALID is the sentinel used for arcs).
+			for (; n != INVALIDNODE; _graph.next(n)) {
+				map.set(n, _pi[_node_id(n)]);
+			}
+		}
+
+		/// @}
+
+	private:
+
+		/* Initialize internal data structures.
+		 *
+		 * Builds the starting basis: all graph arcs are put at their lower
+		 * bound with zero flow, an artificial root node (index _node_num) is
+		 * added, and every real node is attached to the root through an
+		 * artificial arc that carries the node's supply or demand. Which
+		 * artificial arcs are created depends on the sign of the total supply.
+		 * Returns false only when the graph has no nodes, true otherwise.
+		 */
+		bool init() {
+			if (_node_num == 0) return false;
+
+			// Check the sum of supply values
+			_sum_supply = 0;
+			for (int i = 0; i != _node_num; ++i) {
+				_sum_supply += _supply[i];
+			}
+			/*if (!((_stype == GEQ && _sum_supply <= 0) ||
+				(_stype == LEQ && _sum_supply >= 0))) return false;*/
+
+
+			// Initialize artificial cost
+			Cost ART_COST;
+			if (std::numeric_limits<Cost>::is_exact) {
+				ART_COST = std::numeric_limits<Cost>::max() / 2 + 1;
+			} else {
+				ART_COST = 0;
+				for (ArcsType i = 0; i != _arc_num; ++i) {
+					if (_cost[i] > ART_COST) ART_COST = _cost[i];
+				}
+				ART_COST = (ART_COST + 1) * _node_num; // (largest positive arc cost + 1) times the node count
+			}
+
+			// Initialize arc maps
+			for (ArcsType i = 0; i != _arc_num; ++i) {
+#ifndef SPARSE_FLOW
+				_flow[i] = 0; //by default, the sparse matrix is empty
+#endif
+				_state[i] = STATE_LOWER;
+			}
+#ifdef SPARSE_FLOW
+			_flow = SparseValueVector<Value>();
+#endif
+
+			// Set data for the artificial root node
+			_root = _node_num;
+			_parent[_root] = -1;
+			_pred[_root] = -1;
+			_thread[_root] = 0;
+			_rev_thread[0] = _root;
+			_succ_num[_root] = _node_num + 1;
+			_last_succ[_root] = _root - 1;
+			_supply[_root] = -_sum_supply; // the root absorbs the imbalance
+			_pi[_root] = 0;
+
+			// Add artificial arcs and initialize the spanning tree data structure
+			if (_sum_supply == 0) {
+				// EQ supply constraints
+				_search_arc_num = _arc_num; // only the graph arcs are pivot candidates
+				_all_arc_num = _arc_num + _node_num;
+				for (ArcsType u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
+					_parent[u] = _root;
+					_pred[u] = e;
+					_thread[u] = u + 1;
+					_rev_thread[u + 1] = u;
+					_succ_num[u] = 1;
+					_last_succ[u] = u;
+					_state[e] = STATE_TREE;
+					if (_supply[u] >= 0) {
+						_forward[u] = true;
+						_pi[u] = 0;
+						_source[e] = u;
+						_target[e] = _root;
+						_flow[e] = _supply[u];
+						_cost[e] = 0;
+					} else {
+						_forward[u] = false;
+						_pi[u] = ART_COST;
+						_source[e] = _root;
+						_target[e] = u;
+						_flow[e] = -_supply[u];
+						_cost[e] = ART_COST;
+					}
+				}
+			} else if (_sum_supply > 0) {
+				// LEQ supply constraints
+				_search_arc_num = _arc_num + _node_num; // the first block of artificial arcs is searchable too
+				ArcsType f = _arc_num + _node_num; // next free slot in the second block of artificial arcs
+				for (ArcsType u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
+					_parent[u] = _root;
+					_thread[u] = u + 1;
+					_rev_thread[u + 1] = u;
+					_succ_num[u] = 1;
+					_last_succ[u] = u;
+					if (_supply[u] >= 0) {
+						_forward[u] = true;
+						_pi[u] = 0;
+						_pred[u] = e;
+						_source[e] = u;
+						_target[e] = _root;
+						_flow[e] = _supply[u];
+						_cost[e] = 0;
+						_state[e] = STATE_TREE;
+					} else {
+						_forward[u] = false;
+						_pi[u] = ART_COST;
+						_pred[u] = f;
+						_source[f] = _root;
+						_target[f] = u;
+						_flow[f] = -_supply[u];
+						_cost[f] = ART_COST;
+						_state[f] = STATE_TREE;
+						_source[e] = u;
+						_target[e] = _root;
+						//_flow[e] = 0;  //by default, the sparse matrix is empty
+						_cost[e] = 0;
+						_state[e] = STATE_LOWER;
+						++f;
+					}
+				}
+				_all_arc_num = f;
+			} else {
+				// GEQ supply constraints
+				_search_arc_num = _arc_num + _node_num; // the first block of artificial arcs is searchable too
+				ArcsType f = _arc_num + _node_num; // next free slot in the second block of artificial arcs
+				for (ArcsType u = 0, e = _arc_num; u != _node_num; ++u, ++e) {
+					_parent[u] = _root;
+					_thread[u] = u + 1;
+					_rev_thread[u + 1] = u;
+					_succ_num[u] = 1;
+					_last_succ[u] = u;
+					if (_supply[u] <= 0) {
+						_forward[u] = false;
+						_pi[u] = 0;
+						_pred[u] = e;
+						_source[e] = _root;
+						_target[e] = u;
+						_flow[e] = -_supply[u];
+						_cost[e] = 0;
+						_state[e] = STATE_TREE;
+					} else {
+						_forward[u] = true;
+						_pi[u] = -ART_COST;
+						_pred[u] = f;
+						_source[f] = u;
+						_target[f] = _root;
+						_flow[f] = _supply[u];
+						_state[f] = STATE_TREE;
+						_cost[f] = ART_COST;
+						_source[e] = _root;
+						_target[e] = u;
+						//_flow[e] = 0; //by default, the sparse matrix is empty
+						_cost[e] = 0;
+						_state[e] = STATE_LOWER;
+						++f;
+					}
+				}
+				_all_arc_num = f;
+			}
+
+			return true;
+		}
+
+		// Find the join node: starting from both end nodes of the entering
+		// arc, repeatedly move the side with the smaller subtree size to its
+		// parent until both sides meet. The meeting node is stored in join.
+		void findJoinNode() {
+			int left = _source[in_arc];
+			int right = _target[in_arc];
+			for (;;) {
+				if (left == right) break;
+				const bool climb_left = _succ_num[left] < _succ_num[right];
+				if (climb_left) {
+					left = _parent[left];
+				} else {
+					right = _parent[right];
+				}
+			}
+			join = left;
+		}
+
+		// Find the leaving arc of the cycle closed by the entering arc.
+		// Both paths from the end nodes of in_arc up to join are scanned; the
+		// smallest admissible change is stored in delta and the node whose
+		// predecessor arc attains it in u_out. Ties go to the second path
+		// (it compares with <=, the first path with <).
+		// Returns true if the leaving arc is not the same as the entering arc.
+		bool findLeavingArc() {
+			// Orient the cycle according to the state of the entering arc
+			const bool at_lower = (_state[in_arc] == STATE_LOWER);
+			if (at_lower) {
+				first = _source[in_arc];
+				second = _target[in_arc];
+			} else {
+				first = _target[in_arc];
+				second = _source[in_arc];
+			}
+
+			delta = INF;
+			int side = 0;   // 0: nothing found, 1: first path, 2: second path
+
+			// Path from the first node up to the join node
+			for (int node = first; node != join; node = _parent[node]) {
+				const ArcsType arc = _pred[node];
+				Value room;
+				if (_forward[node]) {
+					room = _flow[arc];
+				} else {
+					room = INF;
+				}
+				if (room < delta) {
+					delta = room;
+					u_out = node;
+					side = 1;
+				}
+			}
+
+			// Path from the second node up to the join node
+			for (int node = second; node != join; node = _parent[node]) {
+				const ArcsType arc = _pred[node];
+				Value room;
+				if (_forward[node]) {
+					room = INF;
+				} else {
+					room = _flow[arc];
+				}
+				if (room <= delta) {
+					delta = room;
+					u_out = node;
+					side = 2;
+				}
+			}
+
+			// The subtree to re-hang hangs below the path that holds u_out
+			if (side != 1) {
+				u_in = second;
+				v_in = first;
+			} else {
+				u_in = first;
+				v_in = second;
+			}
+			return side != 0;
+		}
+
+		// Change _flow and _state vectors after a pivot.
+		// Pushes delta units around the cycle (nothing is pushed when delta
+		// is not positive) and then updates the arc states: with change set
+		// the entering arc joins the tree and the leaving arc goes to a
+		// bound, otherwise the entering arc simply switches bound.
+		void changeFlow(bool change) {
+			if (delta > 0) {
+				const Value step = _state[in_arc] * delta;
+				const Value back = -step;
+				_flow[in_arc] += step;
+
+				int node = _source[in_arc];
+				while (node != join) {
+					_flow[_pred[node]] += _forward[node] ? back : step;
+					node = _parent[node];
+				}
+
+				node = _target[in_arc];
+				while (node != join) {
+					_flow[_pred[node]] += _forward[node] ? step : back;
+					node = _parent[node];
+				}
+			}
+
+			if (!change) {
+				_state[in_arc] = -_state[in_arc];
+				return;
+			}
+
+			_state[in_arc] = STATE_TREE;
+			if (_flow[_pred[u_out]] == 0) {
+				_state[_pred[u_out]] = STATE_LOWER;
+			} else {
+				_state[_pred[u_out]] = STATE_UPPER;
+			}
+		}
+
+		/* Update the tree structure after a pivot.
+		 *
+		 * The subtree rooted at u_out is cut from its old parent v_out and
+		 * re-hung below v_in through the entering arc, with u_in as its new
+		 * root. The function rewrites _parent, _pred and _forward along the
+		 * stem between u_in and u_out, splices the thread list (_thread /
+		 * _rev_thread) accordingly, and repairs _last_succ and _succ_num on
+		 * the affected paths. It reads in_arc, join, u_in, v_in and u_out and
+		 * sets v_out.
+		 */
+		void updateTreeStructure() {
+			int old_rev_thread = _rev_thread[u_out];
+			int old_succ_num = _succ_num[u_out];
+			int old_last_succ = _last_succ[u_out];
+			v_out = _parent[u_out];
+
+			// Check if u_in and u_out coincide
+			if (u_in == u_out) {
+				// Update _parent, _pred, _forward
+				_parent[u_in] = v_in;
+				_pred[u_in] = in_arc;
+				_forward[u_in] = (u_in == _source[in_arc]);
+
+				// Update _thread and _rev_thread
+				if (_thread[v_in] != u_out) {
+					ArcsType after = _thread[old_last_succ];
+					_thread[old_rev_thread] = after;
+					_rev_thread[after] = old_rev_thread;
+					after = _thread[v_in];
+					_thread[v_in] = u_out;
+					_rev_thread[u_out] = v_in;
+					_thread[old_last_succ] = after;
+					_rev_thread[after] = old_last_succ;
+				}
+			} else {
+				// Handle the case when old_rev_thread equals to v_in
+				// (it also means that join and v_out coincide)
+				int thread_continue = old_rev_thread == v_in ?
+					_thread[old_last_succ] : _thread[v_in];
+
+				// Update _thread and _parent along the stem nodes (i.e. the nodes
+				// between u_in and u_out, whose parent have to be changed)
+				int stem = u_in;              // the current stem node
+				int par_stem = v_in;          // the new parent of stem
+				int next_stem;                // the next stem node
+				int last = _last_succ[u_in];  // the last successor of stem
+				int before, after = _thread[last];
+				_thread[v_in] = u_in;
+				_dirty_revs.clear(); // nodes whose _thread entry changed; their _rev_thread is fixed below
+				_dirty_revs.push_back(v_in);
+				while (stem != u_out) {
+					// Insert the next stem node into the thread list
+					next_stem = _parent[stem];
+					_thread[last] = next_stem;
+					_dirty_revs.push_back(last);
+
+					// Remove the subtree of stem from the thread list
+					before = _rev_thread[stem];
+					_thread[before] = after;
+					_rev_thread[after] = before;
+
+					// Change the parent node and shift stem nodes
+					_parent[stem] = par_stem;
+					par_stem = stem;
+					stem = next_stem;
+
+					// Update last and after
+					last = _last_succ[stem] == _last_succ[par_stem] ?
+						_rev_thread[par_stem] : _last_succ[stem];
+					after = _thread[last];
+				}
+				_parent[u_out] = par_stem;
+				_thread[last] = thread_continue;
+				_rev_thread[thread_continue] = last;
+				_last_succ[u_out] = last;
+
+				// Remove the subtree of u_out from the thread list except for
+				// the case when old_rev_thread equals to v_in
+				if (old_rev_thread != v_in) {
+					_thread[old_rev_thread] = after;
+					_rev_thread[after] = old_rev_thread;
+				}
+
+				// Update _rev_thread using the new _thread values
+				for (int i = 0; i != int(_dirty_revs.size()); ++i) {
+					int u = _dirty_revs[i];
+					_rev_thread[_thread[u]] = u;
+				}
+
+				// Update _pred, _forward, _last_succ and _succ_num for the
+				// stem nodes from u_out to u_in
+				int tmp_sc = 0, tmp_ls = _last_succ[u_out];
+				for (int u = u_out, p = _parent[u]; u != u_in; u = p, p = _parent[u]) {
+					_pred[u] = _pred[p];
+					_forward[u] = !_forward[p]; // the arc is now traversed from its other end
+					tmp_sc += _succ_num[u] - _succ_num[p];
+					_succ_num[u] = tmp_sc;
+					_last_succ[p] = tmp_ls;
+				}
+				_pred[u_in] = in_arc;
+				_forward[u_in] = (u_in == _source[in_arc]);
+				_succ_num[u_in] = old_succ_num;
+			}
+
+			// Update _last_succ from v_in towards the root
+			int up_limit_out = _last_succ[join] == v_in ? join : -1;
+			int last_succ_out = _last_succ[u_out];
+			for (int u = v_in; u != -1 && _last_succ[u] == v_in; u = _parent[u]) {
+				_last_succ[u] = last_succ_out;
+			}
+
+			// Update _last_succ from v_out towards the root
+			if (join != old_rev_thread && v_in != old_rev_thread) {
+				for (int u = v_out; u != up_limit_out && _last_succ[u] == old_last_succ;
+					u = _parent[u]) {
+					_last_succ[u] = old_rev_thread;
+				}
+			} else if (last_succ_out != old_last_succ) {
+				for (int u = v_out; u != up_limit_out && _last_succ[u] == old_last_succ;
+					u = _parent[u]) {
+					_last_succ[u] = last_succ_out;
+				}
+			}
+
+			// Update _succ_num from v_in to join
+			for (int u = v_in; u != join; u = _parent[u]) {
+				_succ_num[u] += old_succ_num;
+			}
+			// Update _succ_num from v_out to join
+			for (int u = v_out; u != join; u = _parent[u]) {
+				_succ_num[u] -= old_succ_num;
+			}
+		}
+
+		// Shift the potentials of the subtree rooted at u_in by a constant:
+		// the subtree is visited by following _thread from u_in until the
+		// node that comes after its last successor.
+		void updatePotential() {
+			const Cost signed_cost = _forward[u_in] ? _cost[in_arc] : (-_cost[in_arc]);
+			const Cost sigma = _pi[v_in] - _pi[u_in] - signed_cost;
+
+			const int stop = _thread[_last_succ[u_in]];
+			int node = u_in;
+			while (node != stop) {
+				_pi[node] += sigma;
+				node = _thread[node];
+			}
+		}
+		
+
+		// Heuristic initial pivots.
+		//
+		// Collects a list of promising arcs and pivots on each of them once
+		// before the regular pivot rule takes over:
+		//  - non-negative supply sum, exactly one supply and one demand node:
+		//    the arcs met by a reverse graph search from the demand node;
+		//  - non-negative supply sum otherwise: the cheapest incoming arc of
+		//    every demand node;
+		//  - negative supply sum: the cheapest outgoing arc of every supply
+		//    node.
+		// Nodes for which no arc is found are marked with INVALID and dropped
+		// from the list before pivoting.
+		//
+		// Returns false if a pivot finds delta >= MAX (the caller reports the
+		// problem as unbounded), true otherwise.
+		bool initialPivots() {
+			Value curr, total = 0;
+			std::vector<Node> supply_nodes, demand_nodes;
+			Node u; _graph.first(u);
+			for (; u != INVALIDNODE; _graph.next(u)) {
+				curr = _supply[_node_id(u)];
+				if (curr > 0) {
+					total += curr;
+					supply_nodes.push_back(u);
+				} else if (curr < 0) {
+					demand_nodes.push_back(u);
+				}
+			}
+			if (_sum_supply > 0) total -= _sum_supply;
+			if (total <= 0) return true;
+
+			ArcVector arc_vector;
+			if (_sum_supply >= 0) {
+				if (supply_nodes.size() == 1 && demand_nodes.size() == 1) {
+					// Perform a reverse graph search from the sink to the source
+					//typename GR::template NodeMap<bool> reached(_graph, false);
+					BoolVector reached(_node_num, false);
+					Node s = supply_nodes[0], t = demand_nodes[0];
+					std::vector<Node> stack;
+					reached[t] = true;
+					stack.push_back(t);
+					while (!stack.empty()) {
+						Node u, v = stack.back();
+						stack.pop_back();
+						if (v == s) break;
+						Arc a; _graph.firstIn(a, v);
+						for (; a != INVALID; _graph.nextIn(a)) {
+							if (reached[u = _graph.source(a)]) continue;
+							ArcsType j = getArcID(a);
+							arc_vector.push_back(j);
+							reached[u] = true;
+							stack.push_back(u);
+						}
+					}
+				} else {
+					arc_vector.resize(demand_nodes.size());
+					// Find the min. cost incoming arc for each demand node
+#pragma omp parallel for
+					for (int i = 0; i < int(demand_nodes.size()); ++i) {
+						Node v = demand_nodes[i];
+						Cost min_cost = std::numeric_limits<Cost>::max();
+						Arc min_arc = INVALID;
+						Arc a; _graph.firstIn(a, v);
+						for (; a != INVALID; _graph.nextIn(a)) {
+							Cost c = _cost[getArcID(a)];
+							if (c < min_cost) {
+								min_cost = c;
+								min_arc = a;
+							}
+						}
+						// Keep the INVALID marker itself when no arc was found:
+						// getArcID(INVALID) would yield an ordinary-looking index
+						// that the clean-up below could not recognise.
+						if (min_arc != INVALID) {
+							arc_vector[i] = getArcID(min_arc);
+						} else {
+							arc_vector[i] = INVALID;
+						}
+					}
+					arc_vector.erase(std::remove(arc_vector.begin(), arc_vector.end(), INVALID), arc_vector.end());
+				}
+			} else {
+				arc_vector.resize(supply_nodes.size());
+				// Find the min. cost outgoing arc for each supply node
+#pragma omp parallel for
+				for (int i = 0; i < int(supply_nodes.size()); ++i) {
+					Node u = supply_nodes[i];
+					Cost min_cost = std::numeric_limits<Cost>::max();
+					Arc min_arc = INVALID;
+					Arc a; _graph.firstOut(a, u);
+					for (; a != INVALID; _graph.nextOut(a)) {
+						Cost c = _cost[getArcID(a)];
+						if (c < min_cost) {
+							min_cost = c;
+							min_arc = a;
+						}
+					}
+					// Same marker handling as for the demand nodes above
+					if (min_arc != INVALID) {
+						arc_vector[i] = getArcID(min_arc);
+					} else {
+						arc_vector[i] = INVALID;
+					}
+				}
+				arc_vector.erase(std::remove(arc_vector.begin(), arc_vector.end(), INVALID), arc_vector.end());
+			}
+
+			// Perform heuristic initial pivots
+			for (ArcsType i = 0; i != ArcsType(arc_vector.size()); ++i) {
+				in_arc = arc_vector[i];
+				// Skip arcs whose state-signed reduced cost is not negative
+				if (_state[in_arc] * (_cost[in_arc] + _pi[_source[in_arc]] -
+					_pi[_target[in_arc]]) >= 0) continue;
+				findJoinNode();
+				bool change = findLeavingArc();
+				if (delta >= MAX) return false;
+				changeFlow(change);
+				if (change) {
+					updateTreeStructure();
+					updatePotential();
+				}
+			}
+			return true;
+		}
+
+		// Execute the algorithm using BlockSearchPivotRule as the pivot rule
+		ProblemType start() {
+			return start<BlockSearchPivotRule>();
+		}
+
+		/// Runs the network simplex main loop, using PivotRuleImpl to pick entering arcs.
+		///
+		/// Returns UNBOUNDED when initialPivots() fails or a pivot finds delta >= MAX,
+		/// MAX_ITER_REACHED when max_iter > 0 and the pivot count exceeds it, INFEASIBLE
+		/// when an arc in [_search_arc_num, _all_arc_num) keeps |flow| > _EPSILON, and
+		/// OPTIMAL otherwise. Node potentials are shifted at the end when _sum_supply == 0.
+		template <typename PivotRuleImpl>
+		ProblemType start() {
+			PivotRuleImpl pivot(*this);
+			ProblemType retVal = OPTIMAL;
+
+			// Perform heuristic initial pivots
+			if (!initialPivots()) return UNBOUNDED;
+
+			size_t iter_number = 0;
+			// Execute the Network Simplex algorithm
+			while (pivot.findEnteringArc()) {
+				if ((++iter_number <= max_iter&&max_iter > 0) || max_iter<=0) { // max_iter <= 0 means no iteration cap
+#if DEBUG_LVL>0
+					if(iter_number>MAX_DEBUG_ITER)
+						break;
+					if(iter_number%1000==0||iter_number%1000==1){
+						Cost curCost=totalCost();
+						Value sumFlow=0;
+						Cost a;
+						a= (fabs(_pi[_source[in_arc]])>=fabs(_pi[_target[in_arc]])) ? fabs(_pi[_source[in_arc]]) : fabs(_pi[_target[in_arc]]);
+						a=a>=fabs(_cost[in_arc])?a:fabs(_cost[in_arc]);
+						for (int i=0; i<_flow.size(); i++) {
+							sumFlow+=_state[i]*_flow[i];
+						}
+						std::cout << "Sum of the flow " << std::setprecision(20) << sumFlow << "\n" << iter_number << " iterations, current cost=" << curCost << "\nReduced cost=" << _state[in_arc] * (_cost[in_arc] + _pi[_source[in_arc]] -_pi[_target[in_arc]]) << "\nPrecision = "<< -EPSILON*(a) << "\n";
+						std::cout << "Arc in = (" << _node_id(_source[in_arc]) << ", " << _node_id(_target[in_arc]) <<")\n";
+						std::cout << "Supplies = (" << _supply[_source[in_arc]] << ", " << _supply[_target[in_arc]] << ")\n";
+						std::cout << _cost[in_arc] << "\n";
+						std::cout << _pi[_source[in_arc]] << "\n";
+						std::cout << _pi[_target[in_arc]] << "\n";
+						std::cout << a << "\n";
+					}
+#endif
+
+					findJoinNode();
+					bool change = findLeavingArc();
+					if (delta >= MAX) return UNBOUNDED;
+					changeFlow(change);
+					if (change) {
+						updateTreeStructure();
+						updatePotential();
+					}
+
+#if DEBUG_LVL>0
+			                else{
+						std::cout << "No change\n";
+					}
+#endif
+
+#if DEBUG_LVL>1
+					std::cout << "Arc in = (" << _source[in_arc] << ", " << _target[in_arc] << ")\n";
+#endif
+
+				} else {
+					// Stream the counter directly: iter_number is a size_t, so the former sprintf "%d" was a format/argument mismatch
+					std::cerr << "RESULT MIGHT BE INACURATE\nMax number of iteration reached, currently " << iter_number
+					          << ". Sometimes iterations go on in cycle even though the solution has been reached, to check if it's the case here have a look at the minimal reduced cost. If it is very close to machine precision, you might actually have the correct solution, if not try setting the maximum number of iterations a bit higher\n";
+					retVal =  MAX_ITER_REACHED;
+					break;
+				}
+
+			}
+
+#if DEBUG_LVL>0
+                Cost curCost=totalCost();
+                Value sumFlow=0;
+                Cost a;
+                a= (fabs(_pi[_source[in_arc]])>=fabs(_pi[_target[in_arc]])) ? fabs(_pi[_source[in_arc]]) : fabs(_pi[_target[in_arc]]);
+                a=a>=fabs(_cost[in_arc])?a:fabs(_cost[in_arc]);
+                for (int i=0; i<_flow.size(); i++) {
+                    sumFlow+=_state[i]*_flow[i];
+                }
+                // NOTE(review): niter is not declared in this function (the loop counter is iter_number) - confirm it exists at class scope before enabling DEBUG_LVL>0
+                std::cout << "Sum of the flow " << std::setprecision(20) << sumFlow << "\n" << niter << " iterations, current cost=" << curCost << "\nReduced cost=" << _state[in_arc] * (_cost[in_arc] + _pi[_source[in_arc]] -_pi[_target[in_arc]]) << "\nPrecision = "<< -EPSILON*(a) << "\n";
+
+                std::cout << "Arc in = (" << _node_id(_source[in_arc]) << ", " << _node_id(_target[in_arc]) <<")\n";
+                std::cout << "Supplies = (" << _supply[_source[in_arc]] << ", " << _supply[_target[in_arc]] << ")\n";
+
+#endif
+
+#if DEBUG_LVL>1
+			sumFlow=0;
+			for (int i=0; i<_flow.size(); i++) {
+				sumFlow+=_state[i]*_flow[i];
+				if (_state[i]==STATE_TREE) {
+					std::cout << "Non zero value at (" << _node_num+1-_source[i] << ", " << _node_num+1-_target[i] << ")\n";
+				}
+			}
+			std::cout << "Sum of the flow " << sumFlow << "\n"<< niter <<" iterations, current cost=" << totalCost() << "\n";
+#endif
+
+
+			//Check feasibility
+			if(retVal == OPTIMAL){
+				for (ArcsType e = _search_arc_num; e != _all_arc_num; ++e) {
+					if (_flow[e] != 0){
+						if (fabs(_flow[e]) > _EPSILON) // change of the original code following issue #126
+							return INFEASIBLE;
+						else
+							_flow[e]=0;
+					}
+				}
+			}
+
+			// Shift potentials to meet the requirements of the GEQ/LEQ type
+			// optimality conditions
+			if (_sum_supply == 0) {
+				if (_stype == GEQ) {
+					Cost max_pot = -std::numeric_limits<Cost>::max();
+					for (ArcsType i = 0; i != _node_num; ++i) {
+						if (_pi[i] > max_pot) max_pot = _pi[i];
+					}
+					if (max_pot > 0) {
+						for (ArcsType i = 0; i != _node_num; ++i)
+							_pi[i] -= max_pot;
+					}
+				} else {
+					Cost min_pot = std::numeric_limits<Cost>::max();
+					for (ArcsType i = 0; i != _node_num; ++i) {
+						if (_pi[i] < min_pot) min_pot = _pi[i];
+					}
+					if (min_pot < 0) {
+						for (ArcsType i = 0; i != _node_num; ++i)
+							_pi[i] -= min_pot;
+					}
+				}
+			}
+
+			return retVal;
+		}
+
+	}; //class NetworkSimplexSimple
+
+	   ///@}
+
+} //namespace lemon_omp
diff --git a/ot/lp/solver_1d.py b/ot/lp/solver_1d.py
new file mode 100644
index 0000000..8b4d0c3
--- /dev/null
+++ b/ot/lp/solver_1d.py
@@ -0,0 +1,367 @@
+# -*- coding: utf-8 -*-
+"""
+Exact solvers for the 1D Wasserstein distance
+"""
+
+# Author: Remi Flamary <remi.flamary@unice.fr>
+# Author: Nicolas Courty <ncourty@irisa.fr>
+#
+# License: MIT License
+
+import numpy as np
+import warnings
+
+from .emd_wrap import emd_1d_sorted
+from ..backend import get_backend
+from ..utils import list_to_array
+
+
+def quantile_function(qs, cws, xs):
+    r""" Computes the quantile function of an empirical distribution
+
+    Parameters
+    ----------
+    qs: array-like, shape (n,)
+        Quantiles at which the quantile function is evaluated
+    cws: array-like, shape (m, ...)
+        cumulative weights of the 1D empirical distribution, if batched, must be similar to xs
+    xs: array-like, shape (n, ...)
+        locations of the 1D empirical distribution, batched against the `xs.ndim - 1` first dimensions
+
+    Returns
+    -------
+    q: array-like, shape (..., n)
+        The quantiles of the distribution
+    """
+    nx = get_backend(qs, cws)
+    n = xs.shape[0]  # size of the first axis of xs, used as the index bound below
+    if nx.__name__ == 'torch':
+        # this is to ensure the best performance for torch searchsorted
+        # and avoid a warning related to non-contiguous arrays
+        cws = cws.T.contiguous()
+        qs = qs.T.contiguous()
+    else:
+        cws = cws.T
+        qs = qs.T
+    idx = nx.searchsorted(cws, qs).T  # insertion positions of qs within cws, transposed back
+    return nx.take_along_axis(xs, nx.clip(idx, 0, n - 1), axis=0)  # keep indices in [0, n - 1] before gathering
+
+
+def wasserstein_1d(u_values, v_values, u_weights=None, v_weights=None, p=1, require_sort=True):
+    r"""
+    Computes the 1 dimensional OT loss [15] between two (batched) empirical
+    distributions
+
+    .. math::
+        OT_{loss} = \int_0^1 |cdf_u^{-1}(q) - cdf_v^{-1}(q)|^p dq
+
+    It is formally the p-Wasserstein distance raised to the power p.
+    We do so in a vectorized way by first building the individual quantile functions then integrating them.
+
+    This function should be preferred to `emd_1d` whenever the backend is
+    different to numpy, and when gradients over
+    either sample positions or weights are required.
+
+    Parameters
+    ----------
+    u_values: array-like, shape (n, ...)
+        locations of the first empirical distribution
+    v_values: array-like, shape (m, ...)
+        locations of the second empirical distribution
+    u_weights: array-like, shape (n, ...), optional
+        weights of the first empirical distribution, if None then uniform weights are used
+    v_weights: array-like, shape (m, ...), optional
+        weights of the second empirical distribution, if None then uniform weights are used
+    p: int, optional
+        order of the ground metric used, should be at least 1 (see [2, Chap. 2]), default is 1
+    require_sort: bool, optional
+        sort the distributions' atom locations, if False we will consider they have been sorted prior to being passed to
+        the function, default is True
+
+    Returns
+    -------
+    cost: float/array-like, shape (...)
+        the batched EMD
+
+    References
+    ----------
+    .. [15] Peyré, G., & Cuturi, M. (2018). Computational Optimal Transport.
+
+    """
+
+    assert p >= 1, "The OT loss is only valid for p>=1, {p} was given".format(p=p)
+
+    if u_weights is not None and v_weights is not None:
+        nx = get_backend(u_values, v_values, u_weights, v_weights)
+    else:
+        nx = get_backend(u_values, v_values)
+
+    n = u_values.shape[0]
+    m = v_values.shape[0]
+
+    if u_weights is None:
+        u_weights = nx.full(u_values.shape, 1. / n)
+    elif u_weights.ndim != u_values.ndim:
+        u_weights = nx.repeat(u_weights[..., None], u_values.shape[-1], -1)  # add a trailing axis, repeat it to the last dim of u_values
+    if v_weights is None:
+        v_weights = nx.full(v_values.shape, 1. / m)
+    elif v_weights.ndim != v_values.ndim:
+        v_weights = nx.repeat(v_weights[..., None], v_values.shape[-1], -1)  # same broadcasting for the second distribution
+
+    if require_sort:
+        u_sorter = nx.argsort(u_values, 0)
+        u_values = nx.take_along_axis(u_values, u_sorter, 0)
+
+        v_sorter = nx.argsort(v_values, 0)
+        v_values = nx.take_along_axis(v_values, v_sorter, 0)
+
+        u_weights = nx.take_along_axis(u_weights, u_sorter, 0)
+        v_weights = nx.take_along_axis(v_weights, v_sorter, 0)
+
+    u_cumweights = nx.cumsum(u_weights, 0)
+    v_cumweights = nx.cumsum(v_weights, 0)
+
+    qs = nx.sort(nx.concatenate((u_cumweights, v_cumweights), 0), 0)  # merged, sorted cumulative weights of both inputs
+    u_quantiles = quantile_function(qs, u_cumweights, u_values)
+    v_quantiles = quantile_function(qs, v_cumweights, v_values)
+    qs = nx.zero_pad(qs, pad_width=[(1, 0)] + (qs.ndim - 1) * [(0, 0)])  # pad one entry at the start of axis 0 only
+    delta = qs[1:, ...] - qs[:-1, ...]  # differences between consecutive entries of the padded qs
+    diff_quantiles = nx.abs(u_quantiles - v_quantiles)
+
+    if p == 1:
+        return nx.sum(delta * nx.abs(diff_quantiles), axis=0)  # diff_quantiles is already an absolute value
+    return nx.sum(delta * nx.power(diff_quantiles, p), axis=0)
+
+
+def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+           log=False):
+    r"""Solves the Earth Movers distance problem between 1d measures and returns
+    the OT matrix
+
+
+    .. math::
+        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+    where :
+
+    - d is the metric
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (nt, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    metric: str, optional (default='sqeuclidean')
+        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+        Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'cityblock'`,  or `'euclidean'` metrics are used.
+    p: float, optional (default=1.0)
+         The p-norm to apply if metric='minkowski'
+    dense: boolean, optional (default=True)
+        If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+        Otherwise returns a sparse representation using scipy's `coo_matrix`
+        format. Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
+        are used.
+    log: boolean, optional (default=False)
+        If True, also returns a dictionary containing the cost.
+        Otherwise returns only the optimal transportation matrix.
+
+    Returns
+    -------
+    gamma: (ns, nt) ndarray
+        Optimal transportation matrix for the given parameters
+    log: dict
+        If input log is True, a dictionary containing the cost
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function emd_1d accepts lists and
+    performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.emd_1d(x_a, x_b, a, b)
+    array([[0. , 0.5],
+           [0.5, 0. ]])
+    >>> ot.emd_1d(x_a, x_b)
+    array([[0. , 0.5],
+           [0.5, 0. ]])
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd : EMD for multidimensional distributions
+    ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the
+        transportation matrix)
+    """
+    a, b, x_a, x_b = list_to_array(a, b, x_a, x_b)
+    nx = get_backend(x_a, x_b)
+
+    assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \
+        "emd_1d should only be used with monodimensional data"
+    assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \
+        "emd_1d should only be used with monodimensional data"
+
+    # if weights are missing or empty then use uniform distributions
+    if a is None or a.ndim == 0 or len(a) == 0:
+        a = nx.ones((x_a.shape[0],), type_as=x_a) / x_a.shape[0]
+    if b is None or b.ndim == 0 or len(b) == 0:
+        b = nx.ones((x_b.shape[0],), type_as=x_b) / x_b.shape[0]
+
+    # ensure that a and b carry the same total mass
+    np.testing.assert_almost_equal(
+        nx.to_numpy(nx.sum(a, axis=0)),
+        nx.to_numpy(nx.sum(b, axis=0)),
+        err_msg='a and b vector must have the same sum'
+    )
+    b = b * nx.sum(a) / nx.sum(b)  # rescale b so that its sum matches the sum of a
+
+    x_a_1d = nx.reshape(x_a, (-1,))
+    x_b_1d = nx.reshape(x_b, (-1,))
+    perm_a = nx.argsort(x_a_1d)
+    perm_b = nx.argsort(x_b_1d)
+
+    G_sorted, indices, cost = emd_1d_sorted(
+        nx.to_numpy(a[perm_a]).astype(np.float64),
+        nx.to_numpy(b[perm_b]).astype(np.float64),
+        nx.to_numpy(x_a_1d[perm_a]).astype(np.float64),
+        nx.to_numpy(x_b_1d[perm_b]).astype(np.float64),
+        metric=metric, p=p
+    )
+
+    G = nx.coo_matrix(
+        G_sorted,
+        perm_a[indices[:, 0]],
+        perm_b[indices[:, 1]],
+        shape=(a.shape[0], b.shape[0]),
+        type_as=x_a
+    )
+    if dense:
+        G = nx.todense(G)
+    elif str(nx) == "jax":
+        warnings.warn("JAX does not support sparse matrices, converting to dense")
+    if log:
+        log = {'cost': nx.from_numpy(cost, type_as=x_a)}
+        return G, log
+    return G
+
+
+def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+            log=False):
+    r"""Solves the Earth Movers distance problem between 1d measures and returns
+    the loss
+
+
+    .. math::
+        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+    where :
+
+    - d is the metric
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (nt, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    metric: str, optional (default='sqeuclidean')
+        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+        Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
+        are used.
+    p: float, optional (default=1.0)
+         The p-norm to apply if metric='minkowski'
+    dense: boolean, optional (default=True)
+        If True, returns :math:`\gamma` as a dense ndarray of shape (ns, nt).
+        Otherwise returns a sparse representation using scipy's `coo_matrix`
+        format. Only used if log is set to True. Due to implementation details,
+        this function runs faster when dense is set to False.
+    log: boolean, optional (default=False)
+        If True, also returns a dictionary containing the transportation matrix.
+        Otherwise returns only the loss.
+
+    Returns
+    -------
+    loss: float
+        Cost associated to the optimal transportation
+    log: dict
+        If input log is True, a dictionary containing the Optimal transportation
+        matrix for the given parameters
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function emd2_1d accepts lists and
+    performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.emd2_1d(x_a, x_b, a, b)
+    0.5
+    >>> ot.emd2_1d(x_a, x_b)
+    0.5
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd2 : EMD for multidimensional distributions
+    ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix
+        instead of the cost)
+    """
+    # If we do not return G (log==False), then we should not cast it to dense
+    # (useless overhead)
+    G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p,
+                        dense=dense and log, log=True)
+    cost = log_emd['cost']
+    if log:
+        log_emd = {'G': G}
+        return cost, log_emd
+    return cost
author	Gard Spreemann <gspr@nonempty.org>	2021-11-09 17:05:13 +0100
committer	Gard Spreemann <gspr@nonempty.org>	2021-11-09 17:05:13 +0100
commit	a9fdc844907decddf54bed3ebeea8d8b2cf0fc5c (patch)
tree	449a03fce8fafb78b6badd12b6e633f1e5d73a64 /ot/lp
parent	a16b9471d7114ec08977479b7249efe747702b97 (diff)
parent	f1628794d521a8dfa00af383b5e06cd6d34af619 (diff)