1 files changed, 324 insertions, 27 deletions
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 02cbd8c..17f1731 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 """
 Solvers for the original linear program OT problem
+
+
+
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
@@ -10,20 +13,22 @@ Solvers for the original linear program OT problem
 import multiprocessing
 
 import numpy as np
+from scipy.sparse import coo_matrix
 
 from .import cvx
 
 # import compiled emd
-from .emd_wrap import emd_c, check_result
+from .emd_wrap import emd_c, check_result, emd_1d_sorted
 from ..utils import parmap
 from .cvx import barycenter
 from ..utils import dist
 
-__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx']
+__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
+         'emd_1d', 'emd2_1d', 'wasserstein_1d']
 
 
 def emd(a, b, M, numItermax=100000, log=False):
-    """Solves the Earth Movers distance problem and returns the OT matrix
+    r"""Solves the Earth Movers distance problem and returns the OT matrix
 
 
     .. math::
@@ -37,26 +42,30 @@ def emd(a, b, M, numItermax=100000, log=False):
     - M is the metric cost matrix
     - a and b are the sample weights
 
+    .. warning::
+        Note that the M matrix needs to be a C-order numpy.array in float64 
+        format.
+
     Uses the algorithm proposed in [1]_
 
     Parameters
     ----------
-    a : (ns,) ndarray, float64
-        Source histogram (uniform weigth if empty list)
-    b : (nt,) ndarray, float64
-        Target histogram (uniform weigth if empty list)
-    M : (ns,nt) ndarray, float64
-        loss matrix
+    a : (ns,) numpy.ndarray, float64
+        Source histogram (uniform weight if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target histogram (uniform weight if empty list)
+    M : (ns,nt) numpy.ndarray, float64
+        Loss matrix (c-order array with type float64)
     numItermax : int, optional (default=100000)
         The maximum number of iterations before stopping the optimization
         algorithm if it has not converged.
-    log: boolean, optional (default=False)
+    log: bool, optional (default=False)
         If True, returns a dictionary containing the cost and dual
         variables. Otherwise returns only the optimal transportation matrix.
 
     Returns
     -------
-    gamma: (ns x nt) ndarray
+    gamma: (ns x nt) numpy.ndarray
         Optimal transportation matrix for the given parameters
     log: dict
         If input log is true, a dictionary containing the cost and dual
@@ -74,8 +83,8 @@ def emd(a, b, M, numItermax=100000, log=False):
     >>> b=[.5,.5]
     >>> M=[[0.,1.],[1.,0.]]
     >>> ot.emd(a,b,M)
-    array([[ 0.5,  0. ],
-           [ 0. ,  0.5]])
+    array([[0.5, 0. ],
+           [0. , 0.5]])
 
     References
     ----------
@@ -94,7 +103,7 @@ def emd(a, b, M, numItermax=100000, log=False):
     b = np.asarray(b, dtype=np.float64)
     M = np.asarray(M, dtype=np.float64)
 
-    # if empty array given then use unifor distributions
+    # if empty array given then use uniform distributions
     if len(a) == 0:
         a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
     if len(b) == 0:
@@ -115,7 +124,7 @@ def emd(a, b, M, numItermax=100000, log=False):
 
 def emd2(a, b, M, processes=multiprocessing.cpu_count(),
          numItermax=100000, log=False, return_matrix=False):
-    """Solves the Earth Movers distance problem and returns the loss
+    r"""Solves the Earth Movers distance problem and returns the loss
 
     .. math::
         \gamma = arg\min_\gamma <\gamma,M>_F
@@ -128,16 +137,20 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     - M is the metric cost matrix
     - a and b are the sample weights
 
+    .. warning::
+        Note that the M matrix needs to be a C-order numpy.array in float64 
+        format.
+
     Uses the algorithm proposed in [1]_
 
     Parameters
     ----------
-    a : (ns,) ndarray, float64
-        Source histogram (uniform weigth if empty list)
-    b : (nt,) ndarray, float64
-        Target histogram (uniform weigth if empty list)
-    M : (ns,nt) ndarray, float64
-        loss matrix
+    a : (ns,) numpy.ndarray, float64
+        Source histogram (uniform weight if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target histogram (uniform weight if empty list)
+    M : (ns,nt) numpy.ndarray, float64
+        Loss matrix (c-order array with type float64)
     numItermax : int, optional (default=100000)
         The maximum number of iterations before stopping the optimization
         algorithm if it has not converged.
@@ -151,7 +164,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     -------
     gamma: (ns x nt) ndarray
         Optimal transportation matrix for the given parameters
-    log: dict
+    log: dictnp
         If input log is true, a dictionary containing the cost and dual
         variables and exit status
 
@@ -187,7 +200,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
     b = np.asarray(b, dtype=np.float64)
     M = np.asarray(M, dtype=np.float64)
 
-    # if empty array given then use unifor distributions
+    # if empty array given then use uniform distributions
     if len(a) == 0:
         a = np.ones((M.shape[0],), dtype=np.float64) / M.shape[0]
     if len(b) == 0:
@@ -231,9 +244,9 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
 
     Parameters
     ----------
-    measures_locations : list of (k_i,d) np.ndarray
+    measures_locations : list of (k_i,d) numpy.ndarray
         The discrete support of a measure supported on k_i locations of a d-dimensional space (k_i can be different for each element of the list)
-    measures_weights : list of (k_i,) np.ndarray
+    measures_weights : list of (k_i,) numpy.ndarray
         Numpy arrays where each numpy array has k_i non-negatives values summing to one representing the weights of each discrete input measure
 
     X_init : (k,d) np.ndarray
@@ -246,7 +259,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
     numItermax : int, optional
         Max number of iterations
     stopThr : float, optional
-        Stop threshol on error (>0)
+        Stop threshold on error (>0)
     verbose : bool, optional
         Print information along iterations
     log : bool, optional
@@ -308,4 +321,288 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
         log_dict['displacement_square_norms'] = displacement_square_norms
         return X, log_dict
     else:
-        return X
-\ No newline at end of file
+        return X
+
+
+def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+           log=False):
+    r"""Solves the Earth Movers distance problem between 1d measures and returns
+    the OT matrix
+
+
+    .. math::
+        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+    where :
+
+    - d is the metric
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (ns, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    metric: str, optional (default='sqeuclidean')
+        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+        Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'cityblock'`,  or `'euclidean'` metrics are used.
+    p: float, optional (default=1.0)
+         The p-norm to apply for if metric='minkowski'
+    dense: boolean, optional (default=True)
+        If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+        Otherwise returns a sparse representation using scipy's `coo_matrix`
+        format. Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
+        are used.
+    log: boolean, optional (default=False)
+        If True, returns a dictionary containing the cost.
+        Otherwise returns only the optimal transportation matrix.
+
+    Returns
+    -------
+    gamma: (ns, nt) ndarray
+        Optimal transportation matrix for the given parameters
+    log: dict
+        If input log is True, a dictionary containing the cost
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function emd_1d accepts lists and
+    performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.emd_1d(x_a, x_b, a, b)
+    array([[0. , 0.5],
+           [0.5, 0. ]])
+    >>> ot.emd_1d(x_a, x_b)
+    array([[0. , 0.5],
+           [0.5, 0. ]])
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd : EMD for multidimensional distributions
+    ot.lp.emd2_1d : EMD for 1d distributions (returns cost instead of the
+        transportation matrix)
+    """
+    a = np.asarray(a, dtype=np.float64)
+    b = np.asarray(b, dtype=np.float64)
+    x_a = np.asarray(x_a, dtype=np.float64)
+    x_b = np.asarray(x_b, dtype=np.float64)
+
+    assert (x_a.ndim == 1 or x_a.ndim == 2 and x_a.shape[1] == 1), \
+        "emd_1d should only be used with monodimensional data"
+    assert (x_b.ndim == 1 or x_b.ndim == 2 and x_b.shape[1] == 1), \
+        "emd_1d should only be used with monodimensional data"
+
+    # if empty array given then use uniform distributions
+    if a.ndim == 0 or len(a) == 0:
+        a = np.ones((x_a.shape[0],), dtype=np.float64) / x_a.shape[0]
+    if b.ndim == 0 or len(b) == 0:
+        b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]
+
+    x_a_1d = x_a.reshape((-1, ))
+    x_b_1d = x_b.reshape((-1, ))
+    perm_a = np.argsort(x_a_1d)
+    perm_b = np.argsort(x_b_1d)
+
+    G_sorted, indices, cost = emd_1d_sorted(a, b,
+                                            x_a_1d[perm_a], x_b_1d[perm_b],
+                                            metric=metric, p=p)
+    G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
+                   shape=(a.shape[0], b.shape[0]))
+    if dense:
+        G = G.toarray()
+    if log:
+        log = {'cost': cost}
+        return G, log
+    return G
+
+
+def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+            log=False):
+    r"""Solves the Earth Movers distance problem between 1d measures and returns
+    the loss
+
+
+    .. math::
+        \gamma = arg\min_\gamma \sum_i \sum_j \gamma_{ij} d(x_a[i], x_b[j])
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+    where :
+
+    - d is the metric
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (ns, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    metric: str, optional (default='sqeuclidean')
+        Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
+        Due to implementation details, this function runs faster when
+        `'sqeuclidean'`, `'minkowski'`, `'cityblock'`,  or `'euclidean'` metrics
+        are used.
+    p: float, optional (default=1.0)
+         The p-norm to apply for if metric='minkowski'
+    dense: boolean, optional (default=True)
+        If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+        Otherwise returns a sparse representation using scipy's `coo_matrix`
+        format. Only used if log is set to True. Due to implementation details,
+        this function runs faster when dense is set to False.
+    log: boolean, optional (default=False)
+        If True, returns a dictionary containing the transportation matrix.
+        Otherwise returns only the loss.
+
+    Returns
+    -------
+    loss: float
+        Cost associated to the optimal transportation
+    log: dict
+        If input log is True, a dictionary containing the Optimal transportation
+        matrix for the given parameters
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function emd2_1d accepts lists and
+    performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.emd2_1d(x_a, x_b, a, b)
+    0.5
+    >>> ot.emd2_1d(x_a, x_b)
+    0.5
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd2 : EMD for multidimensional distributions
+    ot.lp.emd_1d : EMD for 1d distributions (returns the transportation matrix
+        instead of the cost)
+    """
+    # If we do not return G (log==False), then we should not to cast it to dense
+    # (useless overhead)
+    G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p,
+                        dense=dense and log, log=True)
+    cost = log_emd['cost']
+    if log:
+        log_emd = {'G': G}
+        return cost, log_emd
+    return cost
+
+
+def wasserstein_1d(x_a, x_b, a=None, b=None, p=1.):
+    r"""Solves the p-Wasserstein distance problem between 1d measures and returns
+    the distance
+
+    .. math::
+        \min_\gamma \left( \sum_i \sum_j \gamma_{ij} \|x_a[i] - x_b[j]\|^p \right)^{1/p}
+
+        s.t. \gamma 1 = a,
+             \gamma^T 1= b,
+             \gamma\geq 0
+
+    where :
+
+    - x_a and x_b are the samples
+    - a and b are the sample weights
+
+    Uses the algorithm detailed in [1]_
+
+    Parameters
+    ----------
+    x_a : (ns,) or (ns, 1) ndarray, float64
+        Source dirac locations (on the real line)
+    x_b : (nt,) or (ns, 1) ndarray, float64
+        Target dirac locations (on the real line)
+    a : (ns,) ndarray, float64, optional
+        Source histogram (default is uniform weight)
+    b : (nt,) ndarray, float64, optional
+        Target histogram (default is uniform weight)
+    p: float, optional (default=1.0)
+         The order of the p-Wasserstein distance to be computed
+
+    Returns
+    -------
+    dist: float
+        p-Wasserstein distance
+
+
+    Examples
+    --------
+
+    Simple example with obvious solution. The function wasserstein_1d accepts
+    lists and performs automatic conversion to numpy arrays
+
+    >>> import ot
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> x_a = [2., 0.]
+    >>> x_b = [0., 3.]
+    >>> ot.wasserstein_1d(x_a, x_b, a, b)
+    0.5
+    >>> ot.wasserstein_1d(x_a, x_b)
+    0.5
+
+    References
+    ----------
+
+    .. [1]  Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+        Transport", 2018.
+
+    See Also
+    --------
+    ot.lp.emd_1d : EMD for 1d distributions
+    """
+    cost_emd = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+                       dense=False, log=False)
+    return np.power(cost_emd, 1. / p)