summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ot/lp/__init__.py203
-rw-r--r--ot/lp/emd_wrap.pyx10
2 files changed, 203 insertions, 10 deletions
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index bf218d3..719032b 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -21,7 +21,7 @@ from .cvx import barycenter
from ..utils import dist
__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
- 'emd_1d', 'emd2_1d']
+ 'emd_1d', 'emd2_1d', 'wasserstein_1d', 'wasserstein2_1d']
def emd(a, b, M, numItermax=100000, log=False):
@@ -313,7 +313,8 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None
return X
-def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False):
+def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+ log=False):
"""Solves the Earth Movers distance problem between 1d measures and returns
the OT matrix
@@ -330,6 +331,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False
- x_a and x_b are the samples
- a and b are the sample weights
+ When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
Uses the algorithm detailed in [1]_
Parameters
@@ -346,11 +349,14 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False
Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
Due to implementation details, this function runs faster when
`'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply for if metric='minkowski'
dense: boolean, optional (default=True)
If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
Otherwise returns a sparse representation using scipy's `coo_matrix`
format. Due to implementation details, this function runs faster when
- dense is set to False.
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
log: boolean, optional (default=False)
If True, returns a dictionary containing the cost.
Otherwise returns only the optimal transportation matrix.
@@ -416,7 +422,7 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False
G_sorted, indices, cost = emd_1d_sorted(a, b,
x_a_1d[perm_a], x_b_1d[perm_b],
- metric=metric)
+ metric=metric, p=p)
G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
shape=(a.shape[0], b.shape[0]))
if dense:
@@ -427,7 +433,8 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False
return G
-def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=False):
+def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
+ log=False):
"""Solves the Earth Movers distance problem between 1d measures and returns
the loss
@@ -444,6 +451,8 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals
- x_a and x_b are the samples
- a and b are the sample weights
+ When 'minkowski' is used as a metric, :math:`d(x, y) = |x - y|^p`.
+
Uses the algorithm detailed in [1]_
Parameters
@@ -459,7 +468,10 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals
metric: str, optional (default='sqeuclidean')
Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
Due to implementation details, this function runs faster when
- `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used.
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply for if metric='minkowski'
dense: boolean, optional (default=True)
If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
Otherwise returns a sparse representation using scipy's `coo_matrix`
@@ -508,10 +520,185 @@ def emd2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', dense=True, log=Fals
"""
# If we do not return G (log==False), then we should not to cast it to dense
# (useless overhead)
- G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric,
+ G, log_emd = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric=metric, p=p,
dense=dense and log, log=True)
cost = log_emd['cost']
if log:
log_emd = {'G': G}
return cost, log_emd
- return cost \ No newline at end of file
+ return cost
+
+
+def wasserstein_1d(x_a, x_b, a=None, b=None, p=1., dense=True, log=False):
+ """Solves the Wasserstein distance problem between 1d measures and returns
+ the OT matrix
+
+
+ .. math::
+ \gamma = arg\min_\gamma \left(\sum_i \sum_j \gamma_{ij}
+ |x_a[i] - x_b[j]|^p \right)^{1/p}
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+ where :
+
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (ns, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ p: float, optional (default=1.0)
+ The order of the p-Wasserstein distance to be computed
+ dense: boolean, optional (default=True)
+ If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format. Due to implementation details, this function runs faster when
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ log: boolean, optional (default=False)
+ If True, returns a dictionary containing the cost.
+ Otherwise returns only the optimal transportation matrix.
+
+ Returns
+ -------
+ gamma: (ns, nt) ndarray
+ Optimal transportation matrix for the given parameters
+ log: dict
+ If input log is True, a dictionary containing the cost
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function wasserstein_1d accepts
+ lists and performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.wasserstein_1d(x_a, x_b, a, b)
+ array([[0. , 0.5],
+ [0.5, 0. ]])
+ >>> ot.wasserstein_1d(x_a, x_b)
+ array([[0. , 0.5],
+ [0.5, 0. ]])
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+ Transport", 2018.
+
+ See Also
+ --------
+ ot.lp.emd_1d : EMD for 1d distributions
+ ot.lp.wasserstein2_1d : Wasserstein for 1d distributions (returns the cost
+ instead of the transportation matrix)
+ """
+ if log:
+ G, log = emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+ dense=dense, log=log)
+ log['cost'] = np.power(log['cost'], 1. / p)
+ return G, log
+ return emd_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+ dense=dense, log=log)
+
+
+def wasserstein2_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1.,
+ dense=True, log=False):
+ """Solves the Wasserstein distance problem between 1d measures and returns
+ the loss
+
+
+ .. math::
+ \gamma = arg\min_\gamma \left( \sum_i \sum_j \gamma_{ij}
+ |x_a[i] - x_b[j]|^p \right)^{1/p}
+
+ s.t. \gamma 1 = a,
+ \gamma^T 1= b,
+ \gamma\geq 0
+ where :
+
+ - x_a and x_b are the samples
+ - a and b are the sample weights
+
+ Uses the algorithm detailed in [1]_
+
+ Parameters
+ ----------
+ x_a : (ns,) or (ns, 1) ndarray, float64
+ Source dirac locations (on the real line)
+ x_b : (nt,) or (ns, 1) ndarray, float64
+ Target dirac locations (on the real line)
+ a : (ns,) ndarray, float64, optional
+ Source histogram (default is uniform weight)
+ b : (nt,) ndarray, float64, optional
+ Target histogram (default is uniform weight)
+ p: float, optional (default=1.0)
+ The order of the p-Wasserstein distance to be computed
+ dense: boolean, optional (default=True)
+ If True, returns math:`\gamma` as a dense ndarray of shape (ns, nt).
+ Otherwise returns a sparse representation using scipy's `coo_matrix`
+ format. Only used if log is set to True. Due to implementation details,
+ this function runs faster when dense is set to False.
+ log: boolean, optional (default=False)
+ If True, returns a dictionary containing the transportation matrix.
+ Otherwise returns only the loss.
+
+ Returns
+ -------
+ loss: float
+ Cost associated to the optimal transportation
+ log: dict
+ If input log is True, a dictionary containing the Optimal transportation
+ matrix for the given parameters
+
+
+ Examples
+ --------
+
+ Simple example with obvious solution. The function wasserstein2_1d accepts
+ lists and performs automatic conversion to numpy arrays
+
+ >>> import ot
+ >>> a=[.5, .5]
+ >>> b=[.5, .5]
+ >>> x_a = [2., 0.]
+ >>> x_b = [0., 3.]
+ >>> ot.wasserstein2_1d(x_a, x_b, a, b)
+ 0.5
+ >>> ot.wasserstein2_1d(x_a, x_b)
+ 0.5
+
+ References
+ ----------
+
+ .. [1] Peyré, G., & Cuturi, M. (2017). "Computational Optimal
+ Transport", 2018.
+
+ See Also
+ --------
+ ot.lp.emd2_1d : EMD for 1d distributions
+ ot.lp.wasserstein_1d : Wasserstein for 1d distributions (returns the
+ transportation matrix instead of the cost)
+ """
+ if log:
+ cost, log = emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+ dense=dense, log=log)
+ cost = np.power(cost, 1. / p)
+ return cost, log
+ return np.power(emd2_1d(x_a=x_a, x_b=x_b, a=a, b=b, metric='minkowski', p=p,
+ dense=dense, log=log), 1. / p) \ No newline at end of file
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 2825ba2..7134136 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -103,7 +103,8 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
np.ndarray[double, ndim=1, mode="c"] v_weights,
np.ndarray[double, ndim=1, mode="c"] u,
np.ndarray[double, ndim=1, mode="c"] v,
- str metric='sqeuclidean'):
+ str metric='sqeuclidean',
+ double p=1.):
r"""
Solves the Earth Movers distance problem between sorted 1d measures and
returns the OT matrix and the associated cost
@@ -121,7 +122,10 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
metric: str, optional (default='sqeuclidean')
Metric to be used. Only strings listed in :func:`ot.dist` are accepted.
Due to implementation details, this function runs faster when
- `'sqeuclidean'`, `'cityblock'`, or `'euclidean'` metrics are used.
+ `'sqeuclidean'`, `'minkowski'`, `'cityblock'`, or `'euclidean'` metrics
+ are used.
+ p: float, optional (default=1.0)
+ The p-norm to apply for if metric='minkowski'
Returns
-------
@@ -154,6 +158,8 @@ def emd_1d_sorted(np.ndarray[double, ndim=1, mode="c"] u_weights,
m_ij = (u[i] - v[j]) ** 2
elif metric == 'cityblock' or metric == 'euclidean':
m_ij = abs(u[i] - v[j])
+ elif metric == 'minkowski':
+ m_ij = abs(u[i] - v[j]) ** p
else:
m_ij = dist(u[i].reshape((1, 1)), v[j].reshape((1, 1)),
metric=metric)[0, 0]