Diffstat (limited to 'ot')
-rw-r--r--   ot/__init__.py                    32
-rw-r--r--   ot/bregman.py                    570
-rw-r--r--   ot/da.py                         674
-rw-r--r--   ot/datasets.py                    23
-rw-r--r--   ot/dr.py                           4
-rw-r--r--   ot/gpu/__init__.py                 5
-rw-r--r--   ot/gromov.py                     206
-rw-r--r--   ot/lp/EMD.h                        2
-rw-r--r--   ot/lp/EMD_wrapper.cpp              9
-rw-r--r--   ot/lp/__init__.py                247
-rw-r--r--   ot/lp/emd_wrap.pyx                27
-rw-r--r--   ot/lp/network_simplex_simple.h     7
-rw-r--r--   ot/optim.py                        8
-rwxr-xr-x   ot/partial.py                   1062
-rw-r--r--   ot/plot.py                         3
-rw-r--r--   ot/smooth.py                       4
-rw-r--r--   ot/unbalanced.py                 107
-rw-r--r--   ot/utils.py                       28
18 files changed, 2712 insertions, 306 deletions
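The headline changes in this patch are the new screened Sinkhorn solver (screenkhorn) and the JCPOT barycenter (jcpot_barycenter) in ot/bregman.py, Laplacian-regularized OT (emd_laplace, EMDLaplaceTransport) and JCPOTTransport in ot/da.py, and the new ot/partial.py module. A minimal usage sketch of the screenkhorn entry point added below; the toy data, budgets, and regularization value are illustrative assumptions, not part of the patch:

    import numpy as np
    import ot

    # toy empirical distributions and a squared-Euclidean cost matrix
    rng = np.random.RandomState(0)
    xs = rng.randn(50, 2)            # 50 source samples
    xt = rng.randn(40, 2) + 1        # 40 target samples
    a, b = ot.unif(50), ot.unif(40)  # uniform weights
    M = ot.dist(xs, xt)              # 'sqeuclidean' by default

    # screen the problem down to a budget of 25 source and 20 target points
    G = ot.bregman.screenkhorn(a, b, M, reg=1e-1, ns_budget=25, nt_budget=20)
    assert G.shape == (50, 40)       # a full-size, renormalized coupling is returned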
diff --git a/ot/__init__.py b/ot/__init__.py index 89c7936..0e6e2e2 100644 --- a/ot/__init__.py +++ b/ot/__init__.py @@ -1,34 +1,5 @@ """ -This is the main module of the POT toolbox. It provides easy access to -a number of sub-modules and functions described below. - -.. note:: - - - Here is a list of the submodules and short description of what they contain. - - - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. - - :any:`ot.bregman` contains OT solvers for the entropic OT problems using - Bregman projections. - - :any:`ot.lp` contains OT solvers for the exact (Linear Program) OT problems. - - :any:`ot.smooth` contains OT solvers for the regularized (l2 and kl) smooth OT - problems. - - :any:`ot.gromov` contains solvers for Gromov-Wasserstein and Fused Gromov - Wasserstein problems. - - :any:`ot.optim` contains generic solvers OT based optimization problems - - :any:`ot.da` contains classes and function related to Monge mapping - estimation and Domain Adaptation (DA). - - :any:`ot.gpu` contains GPU (cupy) implementation of some OT solvers - - :any:`ot.dr` contains Dimension Reduction (DR) methods such as Wasserstein - Discriminant Analysis. - - :any:`ot.utils` contains utility functions such as distance computation and - timing. - - :any:`ot.datasets` contains toy dataset generation functions. - - :any:`ot.plot` contains visualization functions - - :any:`ot.stochastic` contains stochastic solvers for regularized OT. - - :any:`ot.unbalanced` contains solvers for regularized unbalanced OT. - .. warning:: The list of automatically imported sub-modules is as follows: :py:mod:`ot.lp`, :py:mod:`ot.bregman`, :py:mod:`ot.optim` @@ -61,6 +32,7 @@ from . import gromov from . import smooth from . import stochastic from . import unbalanced +from . import partial # OT functions from .lp import emd, emd2, emd_1d, emd2_1d, wasserstein_1d @@ -71,7 +43,7 @@ from .da import sinkhorn_lpl1_mm # utils functions from .utils import dist, unif, tic, toc, toq -__version__ = "0.6.0" +__version__ = "0.7.0" __all__ = ['emd', 'emd2', 'emd_1d', 'sinkhorn', 'sinkhorn2', 'utils', 'datasets', 'bregman', 'lp', 'tic', 'toc', 'toq', 'gromov', diff --git a/ot/bregman.py b/ot/bregman.py index 2cd832b..f1f8437 100644 --- a/ot/bregman.py +++ b/ot/bregman.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Bregman projections for regularized OT +Bregman projections solvers for entropic regularized OT """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -8,12 +8,16 @@ Bregman projections for regularized OT # Kilian Fatras <kilian.fatras@irisa.fr> # Titouan Vayer <titouan.vayer@irisa.fr> # Hicham Janati <hicham.janati@inria.fr> +# Mokhtar Z. 
Alaya <mokhtarzahdi.alaya@gmail.com> +# Alexander Tong <alexander.tong@yale.edu> +# Ievgen Redko <ievgen.redko@univ-st-etienne.fr> # # License: MIT License import numpy as np import warnings from .utils import unif, dist +from scipy.optimize import fmin_l_bfgs_b def sinkhorn(a, b, M, reg, method='sinkhorn', numItermax=1000, @@ -536,12 +540,12 @@ def greenkhorn(a, b, M, reg, numItermax=10000, stopThr=1e-9, verbose=False, old_v = v[i_2] v[i_2] = b[i_2] / (K[:, i_2].T.dot(u)) G[:, i_2] = u * K[:, i_2] * v[i_2] - #aviol = (G@one_m - a) - #aviol_2 = (G.T@one_n - b) + # aviol = (G@one_m - a) + # aviol_2 = (G.T@one_n - b) viol += (-old_v + v[i_2]) * K[:, i_2] * u viol_2[i_2] = v[i_2] * K[:, i_2].dot(u) - b[i_2] - #print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2))) + # print('b',np.max(abs(aviol -viol)),np.max(abs(aviol_2 - viol_2))) if stopThr_val <= stopThr: break @@ -905,11 +909,6 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, else: alpha, beta = warmstart - def get_K(alpha, beta): - """log space computation""" - return np.exp(-(M - alpha.reshape((dim_a, 1)) - - beta.reshape((1, dim_b))) / reg) - # print(np.min(K)) def get_reg(n): # exponential decreasing return (epsilon0 - reg) * np.exp(-n) + reg @@ -937,7 +936,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, # the 10th iterations transp = G err = np.linalg.norm( - (np.sum(transp, axis=0) - b))**2 + np.linalg.norm((np.sum(transp, axis=1) - a))**2 + (np.sum(transp, axis=0) - b)) ** 2 + np.linalg.norm((np.sum(transp, axis=1) - a)) ** 2 if log: log['err'].append(err) @@ -963,7 +962,7 @@ def sinkhorn_epsilon_scaling(a, b, M, reg, numItermax=100, epsilon0=1e4, def geometricBar(weights, alldistribT): """return the weighted geometric mean of distributions""" - assert(len(weights) == alldistribT.shape[1]) + assert (len(weights) == alldistribT.shape[1]) return np.exp(np.dot(np.log(alldistribT), weights.T)) @@ -1037,11 +1036,13 @@ def barycenter(A, M, reg, weights=None, method="sinkhorn", numItermax=10000, """ if method.lower() == 'sinkhorn': - return barycenter_sinkhorn(A, M, reg, numItermax=numItermax, + return barycenter_sinkhorn(A, M, reg, weights=weights, + numItermax=numItermax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) elif method.lower() == 'sinkhorn_stabilized': - return barycenter_stabilized(A, M, reg, numItermax=numItermax, + return barycenter_stabilized(A, M, reg, weights=weights, + numItermax=numItermax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) else: @@ -1103,7 +1104,7 @@ def barycenter_sinkhorn(A, M, reg, weights=None, numItermax=1000, if weights is None: weights = np.ones(A.shape[1]) / A.shape[1] else: - assert(len(weights) == A.shape[1]) + assert (len(weights) == A.shape[1]) if log: log = {'err': []} @@ -1201,7 +1202,7 @@ def barycenter_stabilized(A, M, reg, tau=1e10, weights=None, numItermax=1000, if weights is None: weights = np.ones(n_hists) / n_hists else: - assert(len(weights) == A.shape[1]) + assert (len(weights) == A.shape[1]) if log: log = {'err': []} @@ -1329,7 +1330,7 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, if weights is None: weights = np.ones(A.shape[0]) / A.shape[0] else: - assert(len(weights) == A.shape[0]) + assert (len(weights) == A.shape[0]) if log: log = {'err': []} @@ -1342,12 +1343,17 @@ def convolutional_barycenter2d(A, reg, weights=None, numItermax=10000, err = 1 # build the convolution operator + # this is equivalent to blurring on horizontal then vertical directions t = np.linspace(0, 
1, A.shape[1]) [Y, X] = np.meshgrid(t, t) - xi1 = np.exp(-(X - Y)**2 / reg) + xi1 = np.exp(-(X - Y) ** 2 / reg) + + t = np.linspace(0, 1, A.shape[2]) + [Y, X] = np.meshgrid(t, t) + xi2 = np.exp(-(X - Y) ** 2 / reg) def K(x): - return np.dot(np.dot(xi1, x), xi1) + return np.dot(np.dot(xi1, x), xi2) while (err > stopThr and cpt < numItermax): @@ -1492,6 +1498,164 @@ def unmix(a, D, M, M0, h0, reg, reg0, alpha, numItermax=1000, return np.sum(K0, axis=1) +def jcpot_barycenter(Xs, Ys, Xt, reg, metric='sqeuclidean', numItermax=100, + stopThr=1e-6, verbose=False, log=False, **kwargs): + r'''Joint OT and proportion estimation for multi-source target shift as proposed in [27] + + The function solves the following optimization problem: + + .. math:: + + \mathbf{h} = arg\min_{\mathbf{h}}\quad \sum_{k=1}^{K} \lambda_k + W_{reg}((\mathbf{D}_2^{(k)} \mathbf{h})^T, \mathbf{a}) + + s.t. \ \forall k, \mathbf{D}_1^{(k)} \gamma_k \mathbf{1}_n= \mathbf{h} + + where : + + - :math:`\lambda_k` is the weight of k-th source domain + - :math:`W_{reg}(\cdot,\cdot)` is the entropic regularized Wasserstein distance (see ot.bregman.sinkhorn) + - :math:`\mathbf{D}_2^{(k)}` is a matrix of weights related to k-th source domain defined as in [p. 5, 27], its expected shape is `(n_k, C)` where `n_k` is the number of elements in the k-th source domain and `C` is the number of classes + - :math:`\mathbf{h}` is a vector of estimated proportions in the target domain of size C + - :math:`\mathbf{a}` is a uniform vector of weights in the target domain of size `n` + - :math:`\mathbf{D}_1^{(k)}` is a matrix of class assignments defined as in [p. 5, 27], its expected shape is `(n_k, C)` + + The problem consist in solving a Wasserstein barycenter problem to estimate the proportions :math:`\mathbf{h}` in the target domain. + + The algorithm used for solving the problem is the Iterative Bregman projections algorithm + with two sets of marginal constraints related to the unknown vector :math:`\mathbf{h}` and uniform target distribution. + + Parameters + ---------- + Xs : list of K np.ndarray(nsk,d) + features of all source domains' samples + Ys : list of K np.ndarray(nsk,) + labels of all source domains' samples + Xt : np.ndarray (nt,d) + samples in the target domain + reg : float + Regularization term > 0 + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshold on relative change in the barycenter (>0) + log : bool, optional + record log if True + verbose : bool, optional (default=False) + Controls the verbosity of the optimization algorithm + + Returns + ------- + h : (C,) ndarray + proportion estimation in the target domain + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. 
+ + ''' + nbclasses = len(np.unique(Ys[0])) + nbdomains = len(Xs) + + # log dictionary + if log: + log = {'niter': 0, 'err': [], 'M': [], 'D1': [], 'D2': [], 'gamma': []} + + K = [] + M = [] + D1 = [] + D2 = [] + + # For each source domain, build cost matrices M, Gibbs kernels K and corresponding matrices D_1 and D_2 + for d in range(nbdomains): + dom = {} + nsk = Xs[d].shape[0] # get number of elements for this domain + dom['nbelem'] = nsk + classes = np.unique(Ys[d]) # get number of classes for this domain + + # format classes to start from 0 for convenience + if np.min(classes) != 0: + Ys[d] = Ys[d] - np.min(classes) + classes = np.unique(Ys[d]) + + # build the corresponding D_1 and D_2 matrices + Dtmp1 = np.zeros((nbclasses, nsk)) + Dtmp2 = np.zeros((nbclasses, nsk)) + + for c in classes: + nbelemperclass = np.sum(Ys[d] == c) + if nbelemperclass != 0: + Dtmp1[int(c), Ys[d] == c] = 1. + Dtmp2[int(c), Ys[d] == c] = 1. / (nbelemperclass) + D1.append(Dtmp1) + D2.append(Dtmp2) + + # build the cost matrix and the Gibbs kernel + Mtmp = dist(Xs[d], Xt, metric=metric) + M.append(Mtmp) + + Ktmp = np.empty(Mtmp.shape, dtype=Mtmp.dtype) + np.divide(Mtmp, -reg, out=Ktmp) + np.exp(Ktmp, out=Ktmp) + K.append(Ktmp) + + # uniform target distribution + a = unif(np.shape(Xt)[0]) + + cpt = 0 # iterations count + err = 1 + old_bary = np.ones((nbclasses)) + + while (err > stopThr and cpt < numItermax): + + bary = np.zeros((nbclasses)) + + # update coupling matrices for marginal constraints w.r.t. uniform target distribution + for d in range(nbdomains): + K[d] = projC(K[d], a) + other = np.sum(K[d], axis=1) + bary = bary + np.log(np.dot(D1[d], other)) / nbdomains + + bary = np.exp(bary) + + # update coupling matrices for marginal constraints w.r.t. unknown proportions based on [Prop 4., 27] + for d in range(nbdomains): + new = np.dot(D2[d].T, bary) + K[d] = projR(K[d], new) + + err = np.linalg.norm(bary - old_bary) + cpt = cpt + 1 + old_bary = bary + + if log: + log['err'].append(err) + + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(cpt, err)) + + bary = bary / np.sum(bary) + + if log: + log['niter'] = cpt + log['M'] = M + log['D1'] = D1 + log['D2'] = D2 + log['gamma'] = K + return bary, log + else: + return bary + + def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): @@ -1583,7 +1747,8 @@ def empirical_sinkhorn(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', return pi -def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): +def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, + verbose=False, log=False, **kwargs): r''' Solve the entropic regularization optimal transport problem from empirical data and return the OT loss @@ -1665,14 +1830,17 @@ def empirical_sinkhorn2(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', num M = dist(X_s, X_t, metric=metric) if log: - sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + sinkhorn_loss, log = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, + **kwargs) return sinkhorn_loss, log else: - sinkhorn_loss = sinkhorn2(a, b, M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) + sinkhorn_loss = sinkhorn2(a, b, 
M, reg, numItermax=numIterMax, stopThr=stopThr, verbose=verbose, log=log, + **kwargs) return sinkhorn_loss -def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, verbose=False, log=False, **kwargs): +def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeuclidean', numIterMax=10000, stopThr=1e-9, + verbose=False, log=False, **kwargs): r''' Compute the sinkhorn divergence loss from empirical data @@ -1758,11 +1926,14 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli .. [23] Aude Genevay, Gabriel Peyré, Marco Cuturi, Learning Generative Models with Sinkhorn Divergences, Proceedings of the Twenty-First International Conference on Artficial Intelligence and Statistics, (AISTATS) 21, 2018 ''' if log: - sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_ab, log_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) - sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_a, log_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) - sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_b, log_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, + stopThr=1e-9, verbose=verbose, log=log, **kwargs) sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) @@ -1777,11 +1948,354 @@ def empirical_sinkhorn_divergence(X_s, X_t, reg, a=None, b=None, metric='sqeucli return max(0, sinkhorn_div), log else: - sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_ab = empirical_sinkhorn2(X_s, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) - sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_a = empirical_sinkhorn2(X_s, X_s, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) - sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, verbose=verbose, log=log, **kwargs) + sinkhorn_loss_b = empirical_sinkhorn2(X_t, X_t, reg, a, b, metric=metric, numIterMax=numIterMax, stopThr=1e-9, + verbose=verbose, log=log, **kwargs) sinkhorn_div = sinkhorn_loss_ab - 1 / 2 * (sinkhorn_loss_a + sinkhorn_loss_b) return max(0, sinkhorn_div) + + +def screenkhorn(a, b, M, reg, ns_budget=None, nt_budget=None, uniform=False, restricted=True, + maxiter=10000, maxfun=10000, pgtol=1e-09, verbose=False, log=False): + r"""" + Screening Sinkhorn Algorithm for Regularized Optimal Transport + + The function solves an approximate dual of Sinkhorn divergence [2] which is written as the following optimization problem: + + ..math:: + (u, v) = \argmin_{u, v} 1_{ns}^T B(u,v) 1_{nt} - <\kappa u, a> - <v/\kappa, b> + + where B(u,v) = \diag(e^u) K \diag(e^v), with K = e^{-M/reg} and + + s.t. 
e^{u_i} \geq \epsilon / \kappa, for all i \in {1, ..., ns} + + e^{v_j} \geq \epsilon \kappa, for all j \in {1, ..., nt} + + The parameters \kappa and \epsilon are determined w.r.t. the given number budget of points (ns_budget, nt_budget), see Equation (5) in [26] + + + Parameters + ---------- + a : `numpy.ndarray`, shape=(ns,) + samples weights in the source domain + + b : `numpy.ndarray`, shape=(nt,) + samples weights in the target domain + + M : `numpy.ndarray`, shape=(ns, nt) + Cost matrix + + reg : `float` + Level of the entropy regularisation + + ns_budget : `int`, default=None + Number budget of points to be kept in the source domain. + If None, 50% of the source sample points will be kept + + nt_budget : `int`, default=None + Number budget of points to be kept in the target domain. + If None, 50% of the target sample points will be kept + + uniform : `bool`, default=False + If `True`, the source and target distributions are assumed to be uniform, i.e., a_i = 1 / ns and b_j = 1 / nt + + restricted : `bool`, default=True + If `True`, a warm-start initialization is used for the L-BFGS-B solver, + computed with a restricted Sinkhorn algorithm of at most 5 iterations + + maxiter : `int`, default=10000 + Maximum number of iterations in the L-BFGS-B solver + + maxfun : `int`, default=10000 + Maximum number of function evaluations in the L-BFGS-B solver + + pgtol : `float`, default=1e-09 + Final objective function accuracy in the L-BFGS-B solver + + verbose : `bool`, default=False + If `True`, display information about the cardinalities of the active sets and the parameters kappa + and epsilon + + Dependency + ---------- + To gain more efficiency, screenkhorn needs to call the "Bottleneck" package (https://pypi.org/project/Bottleneck/) + in the screening pre-processing step. If Bottleneck isn't installed, the following warning appears: + "Bottleneck module is not installed. Install it from https://pypi.org/project/Bottleneck/ for better performance." + + + Returns + ------- + gamma : `numpy.ndarray`, shape=(ns, nt) + Screened optimal transportation matrix for the given parameters + + log : `dict`, default=False + Log dictionary returned only if log==True in parameters + + + References + ---------- + .. [26] Alaya M. Z., Bérar M., Gasso G., Rakotomamonjy A. (2019). Screening Sinkhorn Algorithm for Regularized Optimal Transport (NIPS) 33, 2019 + + """ + # check if the bottleneck module exists + try: + import bottleneck + except ImportError: + warnings.warn( + "Bottleneck module is not installed.
Install it from https://pypi.org/project/Bottleneck/ for better performance.") + bottleneck = np + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + ns, nt = M.shape + + # by default, we keep only 50% of the sample data points + if ns_budget is None: + ns_budget = int(np.floor(0.5 * ns)) + if nt_budget is None: + nt_budget = int(np.floor(0.5 * nt)) + + # calculate the Gibbs kernel + K = np.empty_like(M) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + def projection(u, epsilon): + u[u <= epsilon] = epsilon + return u + + # ----------------------------------------------------------------------------------------------------------------# + # Step 1: Screening pre-processing # + # ----------------------------------------------------------------------------------------------------------------# + + if ns_budget == ns and nt_budget == nt: + # full number of budget points (ns, nt) = (ns_budget, nt_budget) + Isel = np.ones(ns, dtype=bool) + Jsel = np.ones(nt, dtype=bool) + epsilon = 0.0 + kappa = 1.0 + + cst_u = 0. + cst_v = 0. + + bounds_u = [(0.0, np.inf)] * ns + bounds_v = [(0.0, np.inf)] * nt + + a_I = a + b_J = b + K_IJ = K + K_IJc = [] + K_IcJ = [] + + vec_eps_IJc = np.zeros(nt) + vec_eps_IcJ = np.zeros(ns) + + else: + # sum of rows and columns of K + K_sum_cols = K.sum(axis=1) + K_sum_rows = K.sum(axis=0) + + if uniform: + if ns / ns_budget < 4: + aK_sort = np.sort(K_sum_cols) + epsilon_u_square = a[0] / aK_sort[ns_budget - 1] + else: + aK_sort = bottleneck.partition(K_sum_cols, ns_budget - 1)[ns_budget - 1] + epsilon_u_square = a[0] / aK_sort + + if nt / nt_budget < 4: + bK_sort = np.sort(K_sum_rows) + epsilon_v_square = b[0] / bK_sort[nt_budget - 1] + else: + bK_sort = bottleneck.partition(K_sum_rows, nt_budget - 1)[nt_budget - 1] + epsilon_v_square = b[0] / bK_sort + else: + aK = a / K_sum_cols + bK = b / K_sum_rows + + aK_sort = np.sort(aK)[::-1] + epsilon_u_square = aK_sort[ns_budget - 1] + + bK_sort = np.sort(bK)[::-1] + epsilon_v_square = bK_sort[nt_budget - 1] + + # active sets I and J (see Lemma 1 in [26]) + Isel = a >= epsilon_u_square * K_sum_cols + Jsel = b >= epsilon_v_square * K_sum_rows + + if sum(Isel) != ns_budget: + if uniform: + aK = a / K_sum_cols + aK_sort = np.sort(aK)[::-1] + epsilon_u_square = aK_sort[ns_budget - 1:ns_budget + 1].mean() + Isel = a >= epsilon_u_square * K_sum_cols + ns_budget = sum(Isel) + + if sum(Jsel) != nt_budget: + if uniform: + bK = b / K_sum_rows + bK_sort = np.sort(bK)[::-1] + epsilon_v_square = bK_sort[nt_budget - 1:nt_budget + 1].mean() + Jsel = b >= epsilon_v_square * K_sum_rows + nt_budget = sum(Jsel) + + epsilon = (epsilon_u_square * epsilon_v_square) ** (1 / 4) + kappa = (epsilon_v_square / epsilon_u_square) ** (1 / 2) + + if verbose: + print("epsilon = %s\n" % epsilon) + print("kappa = %s\n" % kappa) + print('Cardinality of selected points: |Isel| = %s \t |Jsel| = %s \n' % (sum(Isel), sum(Jsel))) + + # Ic, Jc: complementary of the active sets I and J + Ic = ~Isel + Jc = ~Jsel + + K_IJ = K[np.ix_(Isel, Jsel)] + K_IcJ = K[np.ix_(Ic, Jsel)] + K_IJc = K[np.ix_(Isel, Jc)] + + K_min = K_IJ.min() + if K_min == 0: + K_min = np.finfo(float).tiny + + # a_I, b_J, a_Ic, b_Jc + a_I = a[Isel] + b_J = b[Jsel] + if not uniform: + a_I_min = a_I.min() + a_I_max = a_I.max() + b_J_max = b_J.max() + b_J_min = b_J.min() + else: + a_I_min = a_I[0] + a_I_max = a_I[0] + b_J_max = b_J[0] + b_J_min = b_J[0] + + # box constraints in L-BFGS-B (see Proposition 1 in [26]) + bounds_u = [(max(a_I_min / 
((nt - nt_budget) * epsilon + nt_budget * (b_J_max / ( + ns * epsilon * kappa * K_min))), epsilon / kappa), a_I_max / (nt * epsilon * K_min))] * ns_budget + + bounds_v = [( + max(b_J_min / ((ns - ns_budget) * epsilon + ns_budget * (kappa * a_I_max / (nt * epsilon * K_min))), + epsilon * kappa), b_J_max / (ns * epsilon * K_min))] * nt_budget + + # pre-calculated constants for the objective + vec_eps_IJc = epsilon * kappa * (K_IJc * np.ones(nt - nt_budget).reshape((1, -1))).sum(axis=1) + vec_eps_IcJ = (epsilon / kappa) * (np.ones(ns - ns_budget).reshape((-1, 1)) * K_IcJ).sum(axis=0) + + # initialisation + u0 = np.full(ns_budget, (1. / ns_budget) + epsilon / kappa) + v0 = np.full(nt_budget, (1. / nt_budget) + epsilon * kappa) + + # pre-calculed constants for Restricted Sinkhorn (see Algorithm 1 in supplementary of [26]) + if restricted: + if ns_budget != ns or nt_budget != nt: + cst_u = kappa * epsilon * K_IJc.sum(axis=1) + cst_v = epsilon * K_IcJ.sum(axis=0) / kappa + + cpt = 1 + while cpt < 5: # 5 iterations + K_IJ_v = np.dot(K_IJ.T, u0) + cst_v + v0 = b_J / (kappa * K_IJ_v) + KIJ_u = np.dot(K_IJ, v0) + cst_u + u0 = (kappa * a_I) / KIJ_u + cpt += 1 + + u0 = projection(u0, epsilon / kappa) + v0 = projection(v0, epsilon * kappa) + + else: + u0 = u0 + v0 = v0 + + def restricted_sinkhorn(usc, vsc, max_iter=5): + """ + Restricted Sinkhorn Algorithm as a warm-start initialized point for L-BFGS-B (see Algorithm 1 in supplementary of [26]) + """ + cpt = 1 + while cpt < max_iter: + K_IJ_v = np.dot(K_IJ.T, usc) + cst_v + vsc = b_J / (kappa * K_IJ_v) + KIJ_u = np.dot(K_IJ, vsc) + cst_u + usc = (kappa * a_I) / KIJ_u + cpt += 1 + + usc = projection(usc, epsilon / kappa) + vsc = projection(vsc, epsilon * kappa) + + return usc, vsc + + def screened_obj(usc, vsc): + part_IJ = np.dot(np.dot(usc, K_IJ), vsc) - kappa * np.dot(a_I, np.log(usc)) - (1. / kappa) * np.dot(b_J, + np.log(vsc)) + part_IJc = np.dot(usc, vec_eps_IJc) + part_IcJ = np.dot(vec_eps_IcJ, vsc) + psi_epsilon = part_IJ + part_IJc + part_IcJ + return psi_epsilon + + def screened_grad(usc, vsc): + # gradients of Psi_(kappa,epsilon) w.r.t u and v + grad_u = np.dot(K_IJ, vsc) + vec_eps_IJc - kappa * a_I / usc + grad_v = np.dot(K_IJ.T, usc) + vec_eps_IcJ - (1. 
/ kappa) * b_J / vsc + return grad_u, grad_v + + def bfgspost(theta): + u = theta[:ns_budget] + v = theta[ns_budget:] + # objective + f = screened_obj(u, v) + # gradient + g_u, g_v = screened_grad(u, v) + g = np.hstack([g_u, g_v]) + return f, g + + # ----------------------------------------------------------------------------------------------------------------# + # Step 2: L-BFGS-B solver # + # ----------------------------------------------------------------------------------------------------------------# + + u0, v0 = restricted_sinkhorn(u0, v0) + theta0 = np.hstack([u0, v0]) + + bounds = bounds_u + bounds_v # constraint bounds + + def obj(theta): + return bfgspost(theta) + + theta, _, _ = fmin_l_bfgs_b(func=obj, + x0=theta0, + bounds=bounds, + maxfun=maxfun, + pgtol=pgtol, + maxiter=maxiter) + + usc = theta[:ns_budget] + vsc = theta[ns_budget:] + + usc_full = np.full(ns, epsilon / kappa) + vsc_full = np.full(nt, epsilon * kappa) + usc_full[Isel] = usc + vsc_full[Jsel] = vsc + + if log: + log = {} + log['u'] = usc_full + log['v'] = vsc_full + log['Isel'] = Isel + log['Jsel'] = Jsel + + gamma = usc_full[:, None] * K * vsc_full[None, :] + gamma = gamma / gamma.sum() + + if log: + return gamma, log + else: + return gamma @@ -7,15 +7,16 @@ Domain adaptation with optimal transport # Nicolas Courty <ncourty@irisa.fr> # Michael Perrot <michael.perrot@univ-st-etienne.fr> # Nathalie Gayraud <nat.gayraud@gmail.com> +# Ievgen Redko <ievgen.redko@univ-st-etienne.fr> # # License: MIT License import numpy as np import scipy.linalg as linalg -from .bregman import sinkhorn +from .bregman import sinkhorn, jcpot_barycenter from .lp import emd -from .utils import unif, dist, kernel, cost_normalization +from .utils import unif, dist, kernel, cost_normalization, label_normalization, laplacian, dots from .utils import check_params, BaseEstimator from .unbalanced import sinkhorn_unbalanced from .optim import cg @@ -127,7 +128,7 @@ def sinkhorn_lpl1_mm(a, labels_a, b, M, reg, eta=0.1, numItermax=10, W = np.ones(M.shape) for (i, c) in enumerate(classes): majs = np.sum(transp[indices_labels[i]], axis=0) - majs = p * ((majs + epsilon)**(p - 1)) + majs = p * ((majs + epsilon) ** (p - 1)) W[indices_labels[i]] = majs return transp @@ -359,8 +360,8 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, def loss(L, G): """Compute full loss""" - return np.sum((xs1.dot(L) - ns * G.dot(xt))**2) + mu * \ - np.sum(G * M) + eta * np.sum(sel(L - I0)**2) + return np.sum((xs1.dot(L) - ns * G.dot(xt)) ** 2) + mu * \ + np.sum(G * M) + eta * np.sum(sel(L - I0) ** 2) def solve_L(G): """ solve L problem with fixed G (least square)""" @@ -372,10 +373,11 @@ def joint_OT_mapping_linear(xs, xt, mu=1, eta=0.001, bias=False, verbose=False, xsi = xs1.dot(L) def f(G): - return np.sum((xsi - ns * G.dot(xt))**2) + return np.sum((xsi - ns * G.dot(xt)) ** 2) def df(G): return -2 * ns * (xsi - ns * G.dot(xt)).dot(xt.T) + G = cg(a, b, M, 1.0 / mu, f, df, G0=G0, numItermax=numInnerItermax, stopThr=stopInnerThr) return G @@ -562,7 +564,7 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', def loss(L, G): """Compute full loss""" - return np.sum((K1.dot(L) - ns * G.dot(xt))**2) + mu * \ + return np.sum((K1.dot(L) - ns * G.dot(xt)) ** 2) + mu * \ np.sum(G * M) + eta * np.trace(L.T.dot(Kreg).dot(L)) def solve_L_nobias(G): @@ -580,10 +582,11 @@ def joint_OT_mapping_kernel(xs, xt, mu=1, eta=0.001, kerneltype='gaussian', xsi = K1.dot(L) def f(G): - return np.sum((xsi - ns * G.dot(xt))**2) + return 
np.sum((xsi - ns * G.dot(xt)) ** 2) def df(G): return -2 * ns * (xsi - ns * G.dot(xt)).dot(xt.T) + G = cg(a, b, M, 1.0 / mu, f, df, G0=G0, numItermax=numInnerItermax, stopThr=stopInnerThr) return G @@ -745,6 +748,139 @@ def OT_mapping_linear(xs, xt, reg=1e-6, ws=None, return A, b +def emd_laplace(a, b, xs, xt, M, sim='knn', sim_param=None, reg='pos', eta=1, alpha=.5, + numItermax=100, stopThr=1e-9, numInnerItermax=100000, + stopInnerThr=1e-9, log=False, verbose=False): + r"""Solve the optimal transport problem (OT) with Laplacian regularization + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + eta\Omega_\alpha(\gamma) + + s.t.\ \gamma 1 = a + + \gamma^T 1= b + + \gamma\geq 0 + + where: + + - a and b are source and target weights (sum to 1) + - xs and xt are source and target samples + - M is the (ns,nt) metric cost matrix + - :math:`\Omega_\alpha` is the Laplacian regularization term + :math:`\Omega_\alpha = (1-\alpha)/n_s^2\sum_{i,j}S^s_{i,j}\|T(\mathbf{x}^s_i)-T(\mathbf{x}^s_j)\|^2+\alpha/n_t^2\sum_{i,j}S^t_{i,j}^'\|T(\mathbf{x}^t_i)-T(\mathbf{x}^t_j)\|^2` + with :math:`S^s_{i,j}, S^t_{i,j}` denoting source and target similarity matrices and :math:`T(\cdot)` being a barycentric mapping + + The algorithm used for solving the problem is the conditional gradient algorithm as proposed in [5]. + + Parameters + ---------- + a : np.ndarray (ns,) + samples weights in the source domain + b : np.ndarray (nt,) + samples weights in the target domain + xs : np.ndarray (ns,d) + samples in the source domain + xt : np.ndarray (nt,d) + samples in the target domain + M : np.ndarray (ns,nt) + loss matrix + sim : string, optional + Type of similarity ('knn' or 'gauss') used to construct the Laplacian. + sim_param : int or float, optional + Parameter (number of the nearest neighbors for sim='knn' + or bandwidth for sim='gauss') used to compute the Laplacian. + reg : string + Type of Laplacian regularization + eta : float + Regularization term for Laplacian regularization + alpha : float + Regularization term for source domain's importance in regularization + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshold on error (inner emd solver) (>0) + numInnerItermax : int, optional + Max number of iterations (inner CG solver) + stopInnerThr : float, optional + Stop threshold on error (inner CG solver) (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + Returns + ------- + gamma : (ns x nt) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary return only if log==True in parameters + + + References + ---------- + + .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, + "Optimal Transport for Domain Adaptation," in IEEE + Transactions on Pattern Analysis and Machine Intelligence , + vol.PP, no.99, pp.1-1 + .. [30] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, + "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + + See Also + -------- + ot.lp.emd : Unregularized OT + ot.optim.cg : General regularized OT + + """ + if not isinstance(sim_param, (int, float, type(None))): + raise ValueError( + 'Similarity parameter should be an int or a float. 
Got {type} instead.'.format(type=type(sim_param).__name__)) + + if sim == 'gauss': + if sim_param is None: + sim_param = 1 / (2 * (np.mean(dist(xs, xs, 'sqeuclidean')) ** 2)) + sS = kernel(xs, xs, method=sim, sigma=sim_param) + sT = kernel(xt, xt, method=sim, sigma=sim_param) + + elif sim == 'knn': + if sim_param is None: + sim_param = 3 + + from sklearn.neighbors import kneighbors_graph + + sS = kneighbors_graph(X=xs, n_neighbors=int(sim_param)).toarray() + sS = (sS + sS.T) / 2 + sT = kneighbors_graph(xt, n_neighbors=int(sim_param)).toarray() + sT = (sT + sT.T) / 2 + else: + raise ValueError('Unknown similarity type {sim}. Currently supported similarity types are "knn" and "gauss".'.format(sim=sim)) + + lS = laplacian(sS) + lT = laplacian(sT) + + def f(G): + return alpha * np.trace(np.dot(xt.T, np.dot(G.T, np.dot(lS, np.dot(G, xt))))) \ + + (1 - alpha) * np.trace(np.dot(xs.T, np.dot(G, np.dot(lT, np.dot(G.T, xs))))) + + ls2 = lS + lS.T + lt2 = lT + lT.T + xt2 = np.dot(xt, xt.T) + + if reg == 'disp': + Cs = -eta * alpha / xs.shape[0] * dots(ls2, xs, xt.T) + Ct = -eta * (1 - alpha) / xt.shape[0] * dots(xs, xt.T, lt2) + M = M + Cs + Ct + + def df(G): + return alpha * np.dot(ls2, np.dot(G, xt2))\ + + (1 - alpha) * np.dot(xs, np.dot(xs.T, np.dot(G, lt2))) + + return cg(a, b, M, reg=eta, f=f, df=df, G0=None, numItermax=numItermax, numItermaxEmd=numInnerItermax, + stopThr=stopThr, stopThr2=stopInnerThr, verbose=verbose, log=log) + + def distribution_estimation_uniform(X): """estimates a uniform distribution from an array of samples X @@ -772,7 +908,8 @@ class BaseTransport(BaseEstimator): at the class level in their ``__init__`` as explicit keyword arguments (no ``*args`` or ``**kwargs``). - fit method should: + the fit method should: + - estimate a cost matrix and store it in a `cost_` attribute - estimate a coupling matrix and store it in a `coupling_` attribute @@ -783,6 +920,9 @@ class BaseTransport(BaseEstimator): transform method should always get as input a Xs parameter inverse_transform method should always get as input a Xt parameter + + transform_labels method should always get as input a ys parameter + inverse_transform_labels method should always get as input a yt parameter """ def fit(self, Xs=None, ys=None, Xt=None, yt=None): @@ -794,7 +934,7 @@ class BaseTransport(BaseEstimator): Xs : array-like, shape (n_source_samples, n_features) The training input samples. ys : array-like, shape (n_source_samples,) - The class labels + The training class labels Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) @@ -855,7 +995,7 @@ class BaseTransport(BaseEstimator): Xs : array-like, shape (n_source_samples, n_features) The training input samples. ys : array-like, shape (n_source_samples,) - The class labels + The class labels for training samples Xt : array-like, shape (n_target_samples, n_features) The training input samples. yt : array-like, shape (n_target_samples,) @@ -879,13 +1019,13 @@ class BaseTransport(BaseEstimator): Parameters ---------- Xs : array-like, shape (n_source_samples, n_features) - The training input samples. + The source input samples. ys : array-like, shape (n_source_samples,) - The class labels + The class labels for source samples Xt : array-like, shape (n_target_samples, n_features) - The training input samples. + The target input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the + The class labels for target. 
If some target samples are unlabeled, fill the yt's elements with -1. Warning: Note that, due to this convention -1 cannot be used as a @@ -921,7 +1061,6 @@ class BaseTransport(BaseEstimator): transp_Xs = [] for bi in batch_ind: - # get the nearest neighbor in the source domain D0 = dist(Xs[bi], self.xs_) idx = np.argmin(D0, axis=1) @@ -941,20 +1080,64 @@ class BaseTransport(BaseEstimator): return transp_Xs + def transform_labels(self, ys=None): + """Propagate source labels ys to obtain estimated target labels as in [27] + + Parameters + ---------- + ys : array-like, shape (n_source_samples,) + The source class labels + + Returns + ------- + transp_ys : array-like, shape (n_target_samples, nb_classes) + Estimated soft target labels. + + References + ---------- + + .. [27] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), 2019. + + """ + + # check the necessary inputs parameters are here + if check_params(ys=ys): + + ysTemp = label_normalization(np.copy(ys)) + classes = np.unique(ysTemp) + n = len(classes) + D1 = np.zeros((n, len(ysTemp))) + + # perform label propagation + transp = self.coupling_ / np.sum(self.coupling_, 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + for c in classes: + D1[int(c), ysTemp == c] = 1 + + # compute propagated labels + transp_ys = np.dot(D1, transp) + + return transp_ys.T + def inverse_transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): - """Transports target samples Xt onto target samples Xs + """Transports target samples Xt onto source samples Xs Parameters ---------- Xs : array-like, shape (n_source_samples, n_features) - The training input samples. + The source input samples. ys : array-like, shape (n_source_samples,) - The class labels + The source class labels Xt : array-like, shape (n_target_samples, n_features) - The training input samples. + The target input samples. yt : array-like, shape (n_target_samples,) - The class labels. If some target samples are unlabeled, fill the + The target class labels. If some target samples are unlabeled, fill the yt's elements with -1. Warning: Note that, due to this convention -1 cannot be used as a @@ -990,7 +1173,6 @@ class BaseTransport(BaseEstimator): transp_Xt = [] for bi in batch_ind: - D0 = dist(Xt[bi], self.xt_) idx = np.argmin(D0, axis=1) @@ -1009,6 +1191,41 @@ class BaseTransport(BaseEstimator): return transp_Xt + def inverse_transform_labels(self, yt=None): + """Propagate target labels yt to obtain estimated source labels ys + + Parameters + ---------- + yt : array-like, shape (n_target_samples,) + + Returns + ------- + transp_ys : array-like, shape (n_source_samples, nb_classes) + Estimated soft source labels. 
+ """ + + # check the necessary inputs parameters are here + if check_params(yt=yt): + + ytTemp = label_normalization(np.copy(yt)) + classes = np.unique(ytTemp) + n = len(classes) + D1 = np.zeros((n, len(ytTemp))) + + # perform label propagation + transp = self.coupling_ / np.sum(self.coupling_, 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + for c in classes: + D1[int(c), ytTemp == c] = 1 + + # compute propagated samples + transp_ys = np.dot(D1, transp.T) + + return transp_ys.T + class LinearTransport(BaseTransport): """ OT linear operator between empirical distributions @@ -1055,7 +1272,6 @@ class LinearTransport(BaseTransport): def __init__(self, reg=1e-8, bias=True, log=False, distribution_estimation=distribution_estimation_uniform): - self.bias = bias self.log = log self.reg = reg @@ -1136,7 +1352,6 @@ class LinearTransport(BaseTransport): # check the necessary inputs parameters are here if check_params(Xs=Xs): - transp_Xs = Xs.dot(self.A_) + self.B_ return transp_Xs @@ -1170,7 +1385,6 @@ class LinearTransport(BaseTransport): # check the necessary inputs parameters are here if check_params(Xt=Xt): - transp_Xt = Xt.dot(self.A1_) + self.B1_ return transp_Xt @@ -1224,6 +1438,9 @@ class SinkhornTransport(BaseTransport): .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013 + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, reg_e=1., max_iter=1000, @@ -1231,7 +1448,6 @@ class SinkhornTransport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=np.infty): - self.reg_e = reg_e self.max_iter = max_iter self.tol = tol @@ -1323,13 +1539,15 @@ class EMDTransport(BaseTransport): .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal Transport for Domain Adaptation," in IEEE Transactions on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, metric="sqeuclidean", norm=None, log=False, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=10, max_iter=100000): - self.metric = metric self.norm = norm self.log = log @@ -1431,7 +1649,9 @@ class SinkhornLpl1Transport(BaseTransport): .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. - + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. 
""" def __init__(self, reg_e=1., reg_cl=0.1, @@ -1440,7 +1660,6 @@ class SinkhornLpl1Transport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=np.infty): - self.reg_e = reg_e self.reg_cl = reg_cl self.max_iter = max_iter @@ -1481,7 +1700,6 @@ class SinkhornLpl1Transport(BaseTransport): # check the necessary inputs parameters are here if check_params(Xs=Xs, Xt=Xt, ys=ys): - super(SinkhornLpl1Transport, self).fit(Xs, ys, Xt, yt) returned_ = sinkhorn_lpl1_mm( @@ -1499,6 +1717,127 @@ class SinkhornLpl1Transport(BaseTransport): return self +class EMDLaplaceTransport(BaseTransport): + + """Domain Adapatation OT method based on Earth Mover's Distance with Laplacian regularization + + Parameters + ---------- + reg_type : string optional (default='pos') + Type of the regularization term: 'pos' and 'disp' for + regularization term defined in [2] and [6], respectively. + reg_lap : float, optional (default=1) + Laplacian regularization parameter + reg_src : float, optional (default=0.5) + Source relative importance in regularization + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + similarity : string, optional (default="knn") + The similarity to use either knn or gaussian + similarity_param : int or float, optional (default=None) + Parameter for the similarity: number of nearest neighbors or bandwidth + if similarity="knn" or "gaussian", respectively. If None is provided, + it is set to 3 or the average pairwise squared Euclidean distance, respectively. + max_iter : int, optional (default=100) + Max number of BCD iterations + tol : float, optional (default=1e-5) + Stop threshold on relative loss decrease (>0) + max_inner_iter : int, optional (default=10) + Max number of iterations (inner CG solver) + inner_tol : float, optional (default=1e-6) + Stop threshold on error (inner CG solver) (>0) + log : int, optional (default=False) + Controls the logs of the optimization algorithm + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. + + Attributes + ---------- + coupling_ : array-like, shape (n_source_samples, n_target_samples) + The optimal coupling + + References + ---------- + .. [1] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, + "Optimal Transport for Domain Adaptation," in IEEE Transactions + on Pattern Analysis and Machine Intelligence , vol.PP, no.99, pp.1-1 + .. [2] R. Flamary, N. Courty, D. Tuia, A. Rakotomamonjy, + "Optimal transport with Laplacian regularization: Applications to domain adaptation and shape matching," + in NIPS Workshop on Optimal Transport and Machine Learning OTML, 2014. + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. 
+ """ + + def __init__(self, reg_type='pos', reg_lap=1., reg_src=1., metric="sqeuclidean", + norm=None, similarity="knn", similarity_param=None, max_iter=100, tol=1e-9, + max_inner_iter=100000, inner_tol=1e-9, log=False, verbose=False, + distribution_estimation=distribution_estimation_uniform, + out_of_sample_map='ferradans'): + self.reg = reg_type + self.reg_lap = reg_lap + self.reg_src = reg_src + self.metric = metric + self.norm = norm + self.similarity = similarity + self.sim_param = similarity_param + self.max_iter = max_iter + self.tol = tol + self.max_inner_iter = max_inner_iter + self.inner_tol = inner_tol + self.log = log + self.verbose = verbose + self.distribution_estimation = distribution_estimation + self.out_of_sample_map = out_of_sample_map + + def fit(self, Xs, ys=None, Xt=None, yt=None): + """Build a coupling matrix from source and target sets of samples + (Xs, ys) and (Xt, yt) + + Parameters + ---------- + Xs : array-like, shape (n_source_samples, n_features) + The training input samples. + ys : array-like, shape (n_source_samples,) + The class labels + Xt : array-like, shape (n_target_samples, n_features) + The training input samples. + yt : array-like, shape (n_target_samples,) + The class labels. If some target samples are unlabeled, fill the + yt's elements with -1. + + Warning: Note that, due to this convention -1 cannot be used as a + class label + + Returns + ------- + self : object + Returns self. + """ + + super(EMDLaplaceTransport, self).fit(Xs, ys, Xt, yt) + + returned_ = emd_laplace(a=self.mu_s, b=self.mu_t, xs=self.xs_, + xt=self.xt_, M=self.cost_, sim=self.similarity, sim_param=self.sim_param, reg=self.reg, eta=self.reg_lap, + alpha=self.reg_src, numItermax=self.max_iter, stopThr=self.tol, numInnerItermax=self.max_inner_iter, + stopInnerThr=self.inner_tol, log=self.log, verbose=self.verbose) + + # coupling estimation + if self.log: + self.coupling_, self.log_ = returned_ + else: + self.coupling_ = returned_ + self.log_ = dict() + return self + + class SinkhornL1l2Transport(BaseTransport): """Domain Adapatation OT method based on sinkhorn algorithm + @@ -1554,7 +1893,9 @@ class SinkhornL1l2Transport(BaseTransport): .. [2] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized conditional gradient: analysis of convergence and applications. arXiv preprint arXiv:1510.06567. - + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. """ def __init__(self, reg_e=1., reg_cl=0.1, @@ -1563,7 +1904,6 @@ class SinkhornL1l2Transport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=10): - self.reg_e = reg_e self.reg_cl = reg_cl self.max_iter = max_iter @@ -1685,7 +2025,6 @@ class MappingTransport(BaseEstimator): norm=None, kernel="linear", sigma=1, max_iter=100, tol=1e-5, max_inner_iter=10, inner_tol=1e-6, log=False, verbose=False, verbose2=False): - self.metric = metric self.norm = norm self.mu = mu @@ -1848,7 +2187,9 @@ class UnbalancedSinkhornTransport(BaseTransport): .. [1] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016). Scaling algorithms for unbalanced transport problems. arXiv preprint arXiv:1607.05816. - + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. 
""" def __init__(self, reg_e=1., reg_m=0.1, method='sinkhorn', @@ -1856,7 +2197,6 @@ class UnbalancedSinkhornTransport(BaseTransport): metric="sqeuclidean", norm=None, distribution_estimation=distribution_estimation_uniform, out_of_sample_map='ferradans', limit_max=10): - self.reg_e = reg_e self.reg_m = reg_m self.method = method @@ -1914,3 +2254,267 @@ class UnbalancedSinkhornTransport(BaseTransport): self.log_ = dict() return self + + +class JCPOTTransport(BaseTransport): + + """Domain Adapatation OT method for multi-source target shift based on Wasserstein barycenter algorithm. + + Parameters + ---------- + reg_e : float, optional (default=1) + Entropic regularization parameter + max_iter : int, float, optional (default=10) + The minimum number of iteration before stopping the optimization + algorithm if no it has not converged + tol : float, optional (default=10e-9) + Stop threshold on error (inner sinkhorn solver) (>0) + verbose : bool, optional (default=False) + Controls the verbosity of the optimization algorithm + log : bool, optional (default=False) + Controls the logs of the optimization algorithm + metric : string, optional (default="sqeuclidean") + The ground metric for the Wasserstein problem + norm : string, optional (default=None) + If given, normalize the ground metric to avoid numerical errors that + can occur with large metric values. + distribution_estimation : callable, optional (defaults to the uniform) + The kind of distribution estimation to employ + out_of_sample_map : string, optional (default="ferradans") + The kind of out of sample mapping to apply to transport samples + from a domain into another one. Currently the only possible option is + "ferradans" which uses the method proposed in [6]. + + Attributes + ---------- + coupling_ : list of array-like objects, shape K x (n_source_samples, n_target_samples) + A set of optimal couplings between each source domain and the target domain + proportions_ : array-like, shape (n_classes,) + Estimated class proportions in the target domain + log_ : dictionary + The dictionary of log, empty dic if parameter log is not True + + References + ---------- + + .. [1] Ievgen Redko, Nicolas Courty, Rémi Flamary, Devis Tuia + "Optimal transport for multi-source domain adaptation under target shift", + International Conference on Artificial Intelligence and Statistics (AISTATS), + vol. 89, p.849-858, 2019. + + .. [6] Ferradans, S., Papadakis, N., Peyré, G., & Aujol, J. F. (2014). + Regularized discrete optimal transport. SIAM Journal on Imaging + Sciences, 7(3), 1853-1882. + + + """ + + def __init__(self, reg_e=.1, max_iter=10, + tol=10e-9, verbose=False, log=False, + metric="sqeuclidean", + out_of_sample_map='ferradans'): + self.reg_e = reg_e + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.log = log + self.metric = metric + self.out_of_sample_map = out_of_sample_map + + def fit(self, Xs, ys=None, Xt=None, yt=None): + """Building coupling matrices from a list of source and target sets of samples + (Xs, ys) and (Xt, yt) + + Parameters + ---------- + Xs : list of K array-like objects, shape K x (nk_source_samples, n_features) + A list of the training input samples. + ys : list of K array-like objects, shape K x (nk_source_samples,) + A list of the class labels + Xt : array-like, shape (n_target_samples, n_features) + The training input samples. + yt : array-like, shape (n_target_samples,) + The class labels. If some target samples are unlabeled, fill the + yt's elements with -1. 
+ + Warning: Note that, due to this convention -1 cannot be used as a + class label + + Returns + ------- + self : object + Returns self. + """ + + # check the necessary inputs parameters are here + if check_params(Xs=Xs, Xt=Xt, ys=ys): + + self.xs_ = Xs + self.xt_ = Xt + + returned_ = jcpot_barycenter(Xs=Xs, Ys=ys, Xt=Xt, reg=self.reg_e, + metric=self.metric, distrinumItermax=self.max_iter, stopThr=self.tol, + verbose=self.verbose, log=True) + + self.coupling_ = returned_[1]['gamma'] + + # deal with the value of log + if self.log: + self.proportions_, self.log_ = returned_ + else: + self.proportions_ = returned_ + self.log_ = dict() + + return self + + def transform(self, Xs=None, ys=None, Xt=None, yt=None, batch_size=128): + """Transports source samples Xs onto target ones Xt + + Parameters + ---------- + Xs : list of K array-like objects, shape K x (nk_source_samples, n_features) + A list of the training input samples. + ys : list of K array-like objects, shape K x (nk_source_samples,) + A list of the class labels + Xt : array-like, shape (n_target_samples, n_features) + The training input samples. + yt : array-like, shape (n_target_samples,) + The class labels. If some target samples are unlabeled, fill the + yt's elements with -1. + + Warning: Note that, due to this convention -1 cannot be used as a + class label + batch_size : int, optional (default=128) + The batch size for out of sample inverse transform + """ + + transp_Xs = [] + + # check the necessary inputs parameters are here + if check_params(Xs=Xs): + + if all([np.allclose(x, y) for x, y in zip(self.xs_, Xs)]): + + # perform standard barycentric mapping for each source domain + + for coupling in self.coupling_: + transp = coupling / np.sum(coupling, 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + # compute transported samples + transp_Xs.append(np.dot(transp, self.xt_)) + else: + + # perform out of sample mapping + indices = np.arange(Xs.shape[0]) + batch_ind = [ + indices[i:i + batch_size] + for i in range(0, len(indices), batch_size)] + + transp_Xs = [] + + for bi in batch_ind: + transp_Xs_ = [] + + # get the nearest neighbor in the sources domains + xs = np.concatenate(self.xs_, axis=0) + idx = np.argmin(dist(Xs[bi], xs), axis=1) + + # transport the source samples + for coupling in self.coupling_: + transp = coupling / np.sum( + coupling, 1)[:, None] + transp[~ np.isfinite(transp)] = 0 + transp_Xs_.append(np.dot(transp, self.xt_)) + + transp_Xs_ = np.concatenate(transp_Xs_, axis=0) + + # define the transported points + transp_Xs_ = transp_Xs_[idx, :] + Xs[bi] - xs[idx, :] + transp_Xs.append(transp_Xs_) + + transp_Xs = np.concatenate(transp_Xs, axis=0) + + return transp_Xs + + def transform_labels(self, ys=None): + """Propagate source labels ys to obtain target labels as in [27] + + Parameters + ---------- + ys : list of K array-like objects, shape K x (nk_source_samples,) + A list of the class labels + + Returns + ------- + yt : array-like, shape (n_target_samples, nb_classes) + Estimated soft target labels. 
+ """ + + # check the necessary inputs parameters are here + if check_params(ys=ys): + yt = np.zeros((len(np.unique(np.concatenate(ys))), self.xt_.shape[0])) + for i in range(len(ys)): + ysTemp = label_normalization(np.copy(ys[i])) + classes = np.unique(ysTemp) + n = len(classes) + ns = len(ysTemp) + + # perform label propagation + transp = self.coupling_[i] / np.sum(self.coupling_[i], 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + if self.log: + D1 = self.log_['D1'][i] + else: + D1 = np.zeros((n, ns)) + + for c in classes: + D1[int(c), ysTemp == c] = 1 + + # compute propagated labels + yt = yt + np.dot(D1, transp) / len(ys) + + return yt.T + + def inverse_transform_labels(self, yt=None): + """Propagate source labels ys to obtain target labels + + Parameters + ---------- + yt : array-like, shape (n_source_samples,) + The target class labels + + Returns + ------- + transp_ys : list of K array-like objects, shape K x (nk_source_samples, nb_classes) + A list of estimated soft source labels + """ + + # check the necessary inputs parameters are here + if check_params(yt=yt): + transp_ys = [] + ytTemp = label_normalization(np.copy(yt)) + classes = np.unique(ytTemp) + n = len(classes) + D1 = np.zeros((n, len(ytTemp))) + + for c in classes: + D1[int(c), ytTemp == c] = 1 + + for i in range(len(self.xs_)): + + # perform label propagation + transp = self.coupling_[i] / np.sum(self.coupling_[i], 1)[:, None] + + # set nans to 0 + transp[~ np.isfinite(transp)] = 0 + + # compute propagated labels + transp_ys.append(np.dot(D1, transp.T).T) + + return transp_ys diff --git a/ot/datasets.py b/ot/datasets.py index ba0cfd9..b86ef3b 100644 --- a/ot/datasets.py +++ b/ot/datasets.py @@ -1,5 +1,5 @@ """ -Simple example datasets for OT +Simple example datasets """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -30,7 +30,7 @@ def make_1D_gauss(n, m, s): 1D histogram for a gaussian distribution """ x = np.arange(n, dtype=np.float64) - h = np.exp(-(x - m)**2 / (2 * s**2)) + h = np.exp(-(x - m) ** 2 / (2 * s ** 2)) return h / h.sum() @@ -80,7 +80,7 @@ def get_2D_samples_gauss(n, m, sigma, random_state=None): return make_2D_samples_gauss(n, m, sigma, random_state=None) -def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): +def make_data_classif(dataset, n, nz=.5, theta=0, p=.5, random_state=None, **kwargs): """Dataset generation for classification problems Parameters @@ -91,6 +91,8 @@ def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): number of training samples nz : float noise level (>0) + p : float + proportion of one class in the binary setting random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -145,11 +147,22 @@ def make_data_classif(dataset, n, nz=.5, theta=0, random_state=None, **kwargs): n2 = np.sum(y == 2) x = np.zeros((n, 2)) - x[y == 1, :] = get_2D_samples_gauss(n1, m1, nz, random_state=generator) - x[y == 2, :] = get_2D_samples_gauss(n2, m2, nz, random_state=generator) + x[y == 1, :] = make_2D_samples_gauss(n1, m1, nz, random_state=generator) + x[y == 2, :] = make_2D_samples_gauss(n2, m2, nz, random_state=generator) x = x.dot(rot) + elif dataset.lower() == '2gauss_prop': + + y = np.concatenate((np.ones(int(p * n)), np.zeros(int((1 - p) * n)))) + x = np.hstack((0 * y[:, None] - 0, 1 - 2 * y[:, None])) + nz * np.random.randn(len(y), 2) + + if ('bias' not in kwargs) 
and ('b' not in kwargs): + kwargs['bias'] = np.array([0, 2]) + + x[:, 0] += kwargs['bias'][0] + x[:, 1] += kwargs['bias'][1] + else: x = np.array(0) y = np.array(0) @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- """ -Dimension reduction with optimal transport +Dimension reduction with OT .. warning:: - Note that by default the module is not import in :mod:`ot`. In order to + Note that by default the module is not imported in :mod:`ot`. In order to use it you need to explicitely import :mod:`ot.dr` """ diff --git a/ot/gpu/__init__.py b/ot/gpu/__init__.py index 1ab95bb..7478fb9 100644 --- a/ot/gpu/__init__.py +++ b/ot/gpu/__init__.py @@ -1,8 +1,9 @@ # -*- coding: utf-8 -*- """ +GPU implementation for several OT solvers and utility +functions. -This module provides GPU implementation for several OT solvers and utility -functions. The GPU backend in handled by `cupy +The GPU backend in handled by `cupy <https://cupy.chainer.org/>`_. .. warning:: diff --git a/ot/gromov.py b/ot/gromov.py index 699ae4c..4427a96 100644 --- a/ot/gromov.py +++ b/ot/gromov.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*-
"""
-Gromov-Wasserstein transport method
+Gromov-Wasserstein and Fused-Gromov-Wasserstein solvers
"""
# Author: Erwan Vautier <erwan.vautier@gmail.com>
@@ -276,7 +276,6 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs - p : distribution in the source space
- q : distribution in the target space
- L : loss function to account for the misfit between the similarity matrices
- - H : entropy
Parameters
----------
@@ -343,6 +342,83 @@ def gromov_wasserstein(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs return cg(p, q, 0, 1, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
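For readers of this diff, a minimal usage sketch of the coupling-returning solver touched in the hunk above (not part of the commit; toy data and uniform weights, all names taken from this module):

    import numpy as np
    import ot

    rng = np.random.RandomState(42)
    xs = rng.randn(10, 2)          # source samples
    xt = rng.randn(10, 3)          # target samples, possibly in another space

    C1 = ot.dist(xs)               # intra-source distance matrix
    C2 = ot.dist(xt)               # intra-target distance matrix
    C1 /= C1.max()
    C2 /= C2.max()
    p = ot.unif(10)
    q = ot.unif(10)

    # coupling between the two metric-measure spaces
    T = ot.gromov.gromov_wasserstein(C1, C2, p, q, 'square_loss')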
+def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
+ """
+    Returns the Gromov-Wasserstein discrepancy between (C1,p) and (C2,q)
+
+ The function solves the following optimization problem:
+
+ .. math::
+        GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot T_{i,j}\cdot T_{k,l}
+
+ Where :
+ - C1 : Metric cost matrix in the source space
+ - C2 : Metric cost matrix in the target space
+ - p : distribution in the source space
+ - q : distribution in the target space
+ - L : loss function to account for the misfit between the similarity matrices
+
+ Parameters
+ ----------
+ C1 : ndarray, shape (ns, ns)
+ Metric cost matrix in the source space
+ C2 : ndarray, shape (nt, nt)
+ Metric cost matrix in the target space
+ p : ndarray, shape (ns,)
+ Distribution in the source space.
+ q : ndarray, shape (nt,)
+ Distribution in the target space.
+ loss_fun : str
+        loss function used for the solver, either 'square_loss' or 'kl_loss'
+ max_iter : int, optional
+ Max number of iterations
+ tol : float, optional
+ Stop threshold on error (>0)
+ verbose : bool, optional
+ Print information along iterations
+ log : bool, optional
+ record log if True
+ armijo : bool, optional
+        If True, the step of the line-search is found via an Armijo line search. Otherwise a closed form is used.
+        If there are convergence issues, use False.
+
+ Returns
+ -------
+ gw_dist : float
+ Gromov-Wasserstein distance
+ log : dict
+        convergence information and coupling matrix
+
+ References
+ ----------
+ .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+ "Gromov-Wasserstein averaging of kernel and distance matrices."
+ International Conference on Machine Learning (ICML). 2016.
+
+ .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the
+ metric approach to object matching. Foundations of computational
+ mathematics 11.4 (2011): 417-487.
+
+ """
+
+ constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
+
+ G0 = p[:, None] * q[None, :]
+
+ def f(G):
+ return gwloss(constC, hC1, hC2, G)
+
+ def df(G):
+ return gwggrad(constC, hC1, hC2, G)
+ res, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+ log_gw['gw_dist'] = gwloss(constC, hC1, hC2, res)
+ log_gw['T'] = res
+ if log:
+ return log_gw['gw_dist'], log_gw
+ else:
+ return log_gw['gw_dist']
+
+
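The value-returning variant added above can be exercised as follows (illustrative sketch only, not part of the commit; toy matrices):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    C1 = ot.dist(rng.randn(8, 2))
    C2 = ot.dist(rng.randn(8, 2))
    p, q = ot.unif(8), ot.unif(8)

    # scalar GW discrepancy; with log=True the coupling is available as log['T']
    gw_dist, log = ot.gromov.gromov_wasserstein2(C1, C2, p, q, 'square_loss', log=True)
    print(gw_dist, log['T'].shape)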
def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
"""
Computes the FGW transport between two graphs see [24]
@@ -357,8 +433,7 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, where :
- M is the (ns,nt) metric cost matrix
- - :math:`f` is the regularization term ( and df is its gradient)
- - a and b are source and target weights (sum to 1)
+ - p and q are source and target weights (sum to 1)
- L is a loss function to account for the misfit between the similarity matrices
The algorithm used for solving the problem is conditional gradient as discussed in [24]_
@@ -377,17 +452,13 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, Distribution in the target space
loss_fun : str, optional
Loss function used for the solver
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- record log if True
+ alpha : float, optional
+ Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
        If True, the step of the line-search is found via an Armijo line search. Otherwise a closed form is used.
        If there are convergence issues, use False.
+ log : bool, optional
+ record log if True
**kwargs : dict
parameters can be directly passed to the ot.optim.cg solver
@@ -417,11 +488,11 @@ def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, return gwggrad(constC, hC1, hC2, G)
if log:
- res, log = cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+ res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
log['fgw_dist'] = log['loss'][::-1][0]
return res, log
else:
- return cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
+ return cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
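The two hunks above change the linear term passed to the conditional-gradient solver from M to (1 - alpha) * M, so the feature/structure trade-off is applied exactly once, inside the solver. A hedged usage sketch (toy data; names are illustrative):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    feat_s, feat_t = rng.randn(10, 5), rng.randn(10, 5)   # node features
    xs, xt = rng.randn(10, 2), rng.randn(10, 2)           # node positions

    M = ot.dist(feat_s, feat_t)        # feature (linear) cost, weighted by (1 - alpha)
    C1, C2 = ot.dist(xs), ot.dist(xt)  # structure costs, weighted by alpha
    p, q = ot.unif(10), ot.unif(10)

    # alpha -> 0: mostly Wasserstein on features; alpha -> 1: mostly GW on structures
    T = ot.gromov.fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.3)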
def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, log=False, **kwargs):
@@ -439,8 +510,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 where :
- M is the (ns,nt) metric cost matrix
- - :math:`f` is the regularization term ( and df is its gradient)
- - a and b are source and target weights (sum to 1)
+ - p and q are source and target weights (sum to 1)
- L is a loss function to account for the misfit between the similarity matrices
The algorithm used for solving the problem is conditional gradient as discussed in [1]_
@@ -458,17 +528,13 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 Distribution in the target space.
loss_fun : str, optional
Loss function used for the solver.
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- Record log if True.
+ alpha : float, optional
+ Trade-off parameter (0 < alpha < 1)
armijo : bool, optional
        If True, the step of the line-search is found via an Armijo line search.
        Otherwise a closed form is used. If there are convergence issues, use False.
+ log : bool, optional
+ Record log if True.
**kwargs : dict
        Parameters can be directly passed to the ot.optim.cg solver.
@@ -497,7 +563,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 def df(G):
return gwggrad(constC, hC1, hC2, G)
- res, log = cg(p, q, M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
+ res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
if log:
log['fgw_dist'] = log['loss'][::-1][0]
log['T'] = res
@@ -506,84 +572,6 @@ def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5 return log['fgw_dist']
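As with the GW pair of functions, the *2 variant returns the scalar FGW value, and the hunk above also stores the coupling in the log under 'T'. A minimal sketch (illustrative only):

    import numpy as np
    import ot

    rng = np.random.RandomState(1)
    M = ot.dist(rng.randn(8, 4), rng.randn(8, 4))
    C1, C2 = ot.dist(rng.randn(8, 2)), ot.dist(rng.randn(8, 2))
    p, q = ot.unif(8), ot.unif(8)

    fgw_dist, log = ot.gromov.fused_gromov_wasserstein2(M, C1, C2, p, q, alpha=0.5, log=True)
    print(fgw_dist, log['T'].sum())   # the coupling mass sums to 1 here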
-def gromov_wasserstein2(C1, C2, p, q, loss_fun, log=False, armijo=False, **kwargs):
- """
- Returns the gromov-wasserstein discrepancy between (C1,p) and (C2,q)
-
- The function solves the following optimization problem:
-
- .. math::
- GW = \min_T \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})*T_{i,j}*T_{k,l}
-
- Where :
- - C1 : Metric cost matrix in the source space
- - C2 : Metric cost matrix in the target space
- - p : distribution in the source space
- - q : distribution in the target space
- - L : loss function to account for the misfit between the similarity matrices
- - H : entropy
-
- Parameters
- ----------
- C1 : ndarray, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : ndarray, shape (nt, nt)
- Metric cost matrix in the target space
- p : ndarray, shape (ns,)
- Distribution in the source space.
- q : ndarray, shape (nt,)
- Distribution in the target space.
- loss_fun : str
- loss function used for the solver either 'square_loss' or 'kl_loss'
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- record log if True
- armijo : bool, optional
- If True the steps of the line-search is found via an armijo research. Else closed form is used.
- If there is convergence issues use False.
-
- Returns
- -------
- gw_dist : float
- Gromov-Wasserstein distance
- log : dict
- convergence information and Coupling marix
-
- References
- ----------
- .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
- .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the
- metric approach to object matching. Foundations of computational
- mathematics 11.4 (2011): 417-487.
-
- """
-
- constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
-
- G0 = p[:, None] * q[None, :]
-
- def f(G):
- return gwloss(constC, hC1, hC2, G)
-
- def df(G):
- return gwggrad(constC, hC1, hC2, G)
- res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
- log['gw_dist'] = gwloss(constC, hC1, hC2, res)
- log['T'] = res
- if log:
- return log['gw_dist'], log
- else:
- return log['gw_dist']
-
-
def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
max_iter=1000, tol=1e-9, verbose=False, log=False):
"""
@@ -996,6 +984,16 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ Whether to fix the structure of the barycenter during the updates
fixed_features : bool
Whether to fix the feature of the barycenter during the updates
+ loss_fun : str
+        Loss function used for the solver, either 'square_loss' or 'kl_loss'
+ max_iter : int, optional
+ Max number of iterations
+ tol : float, optional
+        Stop threshold on error (>0).
+ verbose : bool, optional
+ Print information along iterations.
+ log : bool, optional
+ Record log if True.
init_C : ndarray, shape (N,N), optional
Initialization for the barycenters' structure matrix. If not set
a random init is used.
@@ -1084,7 +1082,7 @@ def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_ T_temp = [t.T for t in T]
C = update_sructure_matrix(p, lambdas, T_temp, Cs)
- T = [fused_gromov_wasserstein((1 - alpha) * Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
+ T = [fused_gromov_wasserstein(Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
# T is N,ns
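The hunk above removes a double scaling: Ms[s] was previously multiplied by (1 - alpha) before the call, and fused_gromov_wasserstein now applies that factor itself. A hedged sketch of the barycenter call, assuming the signature documented earlier in this diff (toy graphs, illustrative sizes):

    import numpy as np
    import ot

    rng = np.random.RandomState(0)
    n = 6
    Ys = [rng.randn(n, 3) for _ in range(2)]            # node features of the two inputs
    Cs = [ot.dist(rng.randn(n, 2)) for _ in range(2)]   # structure (distance) matrices
    ps = [ot.unif(n), ot.unif(n)]

    # barycenter supported on 5 points, equal weights on both inputs
    X, C = ot.gromov.fgw_barycenters(5, Ys, Cs, ps, lambdas=[0.5, 0.5], alpha=0.5)
    print(X.shape, C.shape)   # (5, 3) and (5, 5)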
diff --git a/ot/lp/EMD.h b/ot/lp/EMD.h
index f42e222..c0fe7a3 100644
--- a/ot/lp/EMD.h
+++ b/ot/lp/EMD.h
@@ -32,4 +32,6 @@ enum ProblemType {
 int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G,
     double* alpha, double* beta, double *cost, int maxIter);
+
+
 #endif
diff --git a/ot/lp/EMD_wrapper.cpp b/ot/lp/EMD_wrapper.cpp
index fc7ca63..bc873ed 100644
--- a/ot/lp/EMD_wrapper.cpp
+++ b/ot/lp/EMD_wrapper.cpp
@@ -17,13 +17,13 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
                 double* alpha, double* beta, double *cost, int maxIter) {
-// beware M and C anre strored in row major C style!!!
-    int n, m, i, cur;
+    // beware M and C are stored in row major C style!!!
+    int n, m, i, cur;
     typedef FullBipartiteDigraph Digraph;
-  DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
+    DIGRAPH_TYPEDEFS(FullBipartiteDigraph);
 
-    // Get the number of non zero coordinates for r and c
+    // Get the number of non zero coordinates for r and c
     n=0;
     for (int i=0; i<n1; i++) {
         double val=*(X+i);
@@ -105,3 +105,4 @@ int EMD_wrap(int n1, int n2, double *X, double *Y, double *D, double *G,
     return ret;
 }
+
diff --git a/ot/lp/__init__.py b/ot/lp/__init__.py
index 0c92810..514a607 100644
--- a/ot/lp/__init__.py
+++ b/ot/lp/__init__.py
@@ -2,8 +2,6 @@
 """
 Solvers for the original linear program OT problem
-
-
 """
 
 # Author: Remi Flamary <remi.flamary@unice.fr>
@@ -12,22 +10,169 @@ Solvers for the original linear program OT problem
 
 import multiprocessing
 import sys
+
 import numpy as np
 from scipy.sparse import coo_matrix
 
-from .import cvx
-
+from . import cvx
+from .cvx import barycenter
 # import compiled emd
 from .emd_wrap import emd_c, check_result, emd_1d_sorted
-from ..utils import parmap
-from .cvx import barycenter
 from ..utils import dist
+from ..utils import parmap
+
+__all__ = ['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
+           'emd_1d', 'emd2_1d', 'wasserstein_1d']
+
+
+def center_ot_dual(alpha0, beta0, a=None, b=None):
+    r"""Center dual OT potentials w.r.t. their weights
+
+    The goal of this function is to find unique dual potentials
+    that ensure some kind of centering/fairness. The main idea is to find
+    dual potentials that lead to the same final objective value for both
+    source and targets (see below for more details). This helps stability
+    when the OT solver is called several times with small changes in the
+    inputs.
+
+    Basically we add another constraint to the potentials that will not
+    change the objective value but will ensure uniqueness. The constraint
+    is the following:
+
+    .. math::
+        \alpha^T a= \beta^T b
+
+    in addition to the OT problem constraints.
+
+    since :math:`\sum_i a_i=\sum_j b_j` this can be solved by adding/removing
+    a constant from both :math:`\alpha_0` and :math:`\beta_0`.
+
+    .. math::
+        c=\frac{\beta_0^T b-\alpha_0^T a}{1^Tb+1^Ta}
+
+        \alpha=\alpha_0+c
+
+        \beta=\beta_0-c
+
+    Parameters
+    ----------
+    alpha0 : (ns,) numpy.ndarray, float64
+        Source dual potential
+    beta0 : (nt,) numpy.ndarray, float64
+        Target dual potential
+    a : (ns,) numpy.ndarray, float64
+        Source histogram (uniform weight if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target histogram (uniform weight if empty list)
+
+    Returns
+    -------
+    alpha : (ns,) numpy.ndarray, float64
+        Source centered dual potential
+    beta : (nt,) numpy.ndarray, float64
+        Target centered dual potential
+
+    """
+    # if no weights are provided, use uniform
+    if a is None:
+        a = np.ones(alpha0.shape[0]) / alpha0.shape[0]
+    if b is None:
+        b = np.ones(beta0.shape[0]) / beta0.shape[0]
+
+    # compute constant that balances the weighted sums of the duals
+    c = (b.dot(beta0) - a.dot(alpha0)) / (a.sum() + b.sum())
+
+    # update duals
+    alpha = alpha0 + c
+    beta = beta0 - c
+
+    return alpha, beta
+
+
+def estimate_dual_null_weights(alpha0, beta0, a, b, M):
+    r"""Estimate feasible values for 0-weighted dual potentials
+
+    The feasible values are computed efficiently but rather coarsely.
+
+    .. warning::
+        This function is necessary because the C++ solver in emd_c
+        discards all samples in the distributions with
+        zero weights. This means that while the primal variable (transport
+        matrix) is exact, the solver only returns feasible dual potentials
+        on the samples with weights different from zero.
+
+    First we compute the constraints violations:
+
+    .. math::
+        V=\alpha+\beta^T-M
+
+    Next we compute the max amount of violation per row (alpha) and
+    columns (beta)
-__all__=['emd', 'emd2', 'barycenter', 'free_support_barycenter', 'cvx',
-         'emd_1d', 'emd2_1d', 'wasserstein_1d']
+
+    .. math::
+        v^a_i=\max_j V_{i,j}
+
+        v^b_j=\max_i V_{i,j}
+
+    Finally we update the dual potential with 0 weights if a
+    constraint is violated
+
+    .. math::
+        \alpha_i = \alpha_i -v^a_i \quad \text{ if } a_i=0 \text{ and } v^a_i>0
+
+        \beta_j = \beta_j -v^b_j \quad \text{ if } b_j=0 \text{ and } v^b_j>0
+
+    In the end the dual potentials are centered using function
+    :ref:`center_ot_dual`.
+
+    Note that all those updates do not change the objective value of the
+    solution but provide dual potentials that do not violate the constraints.
+
+    Parameters
+    ----------
+    alpha0 : (ns,) numpy.ndarray, float64
+        Source dual potential
+    beta0 : (nt,) numpy.ndarray, float64
+        Target dual potential
+    a : (ns,) numpy.ndarray, float64
+        Source distribution (uniform weights if empty list)
+    b : (nt,) numpy.ndarray, float64
+        Target distribution (uniform weights if empty list)
+    M : (ns,nt) numpy.ndarray, float64
+        Loss matrix (c-order array with type float64)
+
+    Returns
+    -------
+    alpha : (ns,) numpy.ndarray, float64
+        Source corrected dual potential
+    beta : (nt,) numpy.ndarray, float64
+        Target corrected dual potential
+
+    """
+
+    # binary indexing of non-zero weights
+    asel = a != 0
+    bsel = b != 0
+
+    # compute dual constraints violation
+    constraint_violation = alpha0[:, None] + beta0[None, :] - M
+
+    # compute the largest violation per line and per column
+    aviol = np.max(constraint_violation, 1)
+    bviol = np.max(constraint_violation, 0)
+
+    # updates that correct the violations (zero on samples with nonzero weight)
+    alpha_up = -1 * ~asel * np.maximum(aviol, 0)
+    beta_up = -1 * ~bsel * np.maximum(bviol, 0)
 
-def emd(a, b, M, numItermax=100000, log=False):
+    alpha = alpha0 + alpha_up
+    beta = beta0 + beta_up
+
+    return center_ot_dual(alpha, beta, a, b)
+
+
+def emd(a, b, M, numItermax=100000, log=False, center_dual=True):
    r"""Solves the Earth Movers distance problem and returns the OT matrix
@@ -35,7 +180,9 @@ def emd(a, b, M, numItermax=100000, log=False):
        \gamma = arg\min_\gamma <\gamma,M>_F

        s.t. \gamma 1 = a
+
             \gamma^T 1= b
+
             \gamma\geq 0
    where :
@@ -43,7 +190,7 @@ def emd(a, b, M, numItermax=100000, log=False):
    - a and b are the sample weights

    .. warning::
-        Note that the M matrix needs to be a C-order numpy.array in float64
+        Note that the M matrix needs to be a C-order numpy.array in float64
        format.

    Uses the algorithm proposed in [1]_
@@ -62,6 +209,9 @@ def emd(a, b, M, numItermax=100000, log=False):
    log: bool, optional (default=False)
        If True, returns a dictionary containing the cost and dual
        variables. Otherwise returns only the optimal transportation matrix.
+    center_dual: boolean, optional (default=True)
+        If True, centers the dual potential using function
+        :ref:`center_ot_dual`.

    Returns
    -------
@@ -109,7 +259,20 @@ def emd(a, b, M, numItermax=100000, log=False):
    if len(b) == 0:
        b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1]

+    assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \
+        "Dimension mismatch, check dimensions of M with a and b"
+
+    asel = a != 0
+    bsel = b != 0
+
    G, cost, u, v, result_code = emd_c(a, b, M, numItermax)
+
+    if center_dual:
+        u, v = center_ot_dual(u, v, a, b)
+
+    if np.any(~asel) or np.any(~bsel):
+        u, v = estimate_dual_null_weights(u, v, a, b, M)
+
    result_code_string = check_result(result_code)
    if log:
        log = {}
@@ -123,14 +286,17 @@ def emd(a, b, M, numItermax=100000, log=False):

 def emd2(a, b, M, processes=multiprocessing.cpu_count(),
-         numItermax=100000, log=False, return_matrix=False):
+         numItermax=100000, log=False, return_matrix=False,
+         center_dual=True):
    r"""Solves the Earth Movers distance problem and returns the loss

    .. math::
-        \gamma = arg\min_\gamma <\gamma,M>_F
+        \min_\gamma <\gamma,M>_F

        s.t. \gamma 1 = a
+
             \gamma^T 1= b
+
             \gamma\geq 0
    where :
@@ -138,7 +304,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(),
    - a and b are the sample weights

    ..
warning:: - Note that the M matrix needs to be a C-order numpy.array in float64 + Note that the M matrix needs to be a C-order numpy.array in float64 format. Uses the algorithm proposed in [1]_ @@ -161,6 +327,9 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), variables. Otherwise returns only the optimal transportation cost. return_matrix: boolean, optional (default=False) If True, returns the optimal transportation matrix in the log. + center_dual: boolean, optional (default=True) + If True, centers the dual potential using function + :ref:`center_ot_dual`. Returns ------- @@ -204,7 +373,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), # problem with pikling Forks if sys.platform.endswith('win32'): - processes=1 + processes = 1 # if empty array given then use uniform distributions if len(a) == 0: @@ -212,21 +381,43 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), if len(b) == 0: b = np.ones((M.shape[1],), dtype=np.float64) / M.shape[1] + assert (a.shape[0] == M.shape[0] and b.shape[0] == M.shape[1]), \ + "Dimension mismatch, check dimensions of M with a and b" + + asel = a != 0 + if log or return_matrix: def f(b): - G, cost, u, v, resultCode = emd_c(a, b, M, numItermax) - result_code_string = check_result(resultCode) + bsel = b != 0 + + G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + + result_code_string = check_result(result_code) log = {} if return_matrix: log['G'] = G log['u'] = u log['v'] = v log['warning'] = result_code_string - log['result_code'] = resultCode + log['result_code'] = result_code return [cost, log] else: def f(b): + bsel = b != 0 G, cost, u, v, result_code = emd_c(a, b, M, numItermax) + + if center_dual: + u, v = center_ot_dual(u, v, a, b) + + if np.any(~asel) or np.any(~bsel): + u, v = estimate_dual_null_weights(u, v, a, b, M) + check_result(result_code) return cost @@ -234,7 +425,7 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return f(b) nb = b.shape[1] - if processes>1: + if processes > 1: res = parmap(f, [b[:, i] for i in range(nb)], processes) else: res = list(map(f, [b[:, i].copy() for i in range(nb)])) @@ -242,8 +433,8 @@ def emd2(a, b, M, processes=multiprocessing.cpu_count(), return res - -def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, stopThr=1e-7, verbose=False, log=None): +def free_support_barycenter(measures_locations, measures_weights, X_init, b=None, weights=None, numItermax=100, + stopThr=1e-7, verbose=False, log=None): """ Solves the free support (locations of the barycenters are optimized, not the weights) Wasserstein barycenter problem (i.e. the weighted Frechet mean for the 2-Wasserstein distance) @@ -295,7 +486,7 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None k = X_init.shape[0] d = X_init.shape[1] if b is None: - b = np.ones((k,))/k + b = np.ones((k,)) / k if weights is None: weights = np.ones((N,)) / N @@ -306,17 +497,17 @@ def free_support_barycenter(measures_locations, measures_weights, X_init, b=None displacement_square_norm = stopThr + 1. 
-    while ( displacement_square_norm > stopThr and iter_count < numItermax ):
+    while (displacement_square_norm > stopThr and iter_count < numItermax):

        T_sum = np.zeros((k, d))

-        for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights, weights.tolist()):
-
+        for (measure_locations_i, measure_weights_i, weight_i) in zip(measures_locations, measures_weights,
+                                                                      weights.tolist()):
            M_i = dist(X, measure_locations_i)
            T_i = emd(b, measure_weights_i, M_i)
            T_sum = T_sum + weight_i * np.reshape(1. / b, (-1, 1)) * np.matmul(T_i, measure_locations_i)

-        displacement_square_norm = np.sum(np.square(T_sum-X))
+        displacement_square_norm = np.sum(np.square(T_sum - X))
        if log:
            displacement_square_norms.append(displacement_square_norm)
@@ -436,12 +627,12 @@ def emd_1d(x_a, x_b, a=None, b=None, metric='sqeuclidean', p=1., dense=True,
    if b.ndim == 0 or len(b) == 0:
        b = np.ones((x_b.shape[0],), dtype=np.float64) / x_b.shape[0]

-    x_a_1d = x_a.reshape((-1, ))
-    x_b_1d = x_b.reshape((-1, ))
+    x_a_1d = x_a.reshape((-1,))
+    x_b_1d = x_b.reshape((-1,))
    perm_a = np.argsort(x_a_1d)
    perm_b = np.argsort(x_b_1d)

-    G_sorted, indices, cost = emd_1d_sorted(a, b,
+    G_sorted, indices, cost = emd_1d_sorted(a[perm_a], b[perm_b],
                                            x_a_1d[perm_a], x_b_1d[perm_b],
                                            metric=metric, p=p)
    G = coo_matrix((G_sorted, (perm_a[indices[:, 0]], perm_b[indices[:, 1]])),
diff --git a/ot/lp/emd_wrap.pyx b/ot/lp/emd_wrap.pyx
index 2b6c495..c167964 100644
--- a/ot/lp/emd_wrap.pyx
+++ b/ot/lp/emd_wrap.pyx
@@ -19,7 +19,7 @@ import warnings

 cdef extern from "EMD.h":
-    int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter)
+    int EMD_wrap(int n1,int n2, double *X, double *Y,double *D, double *G, double* alpha, double* beta, double *cost, int maxIter) nogil
    cdef enum ProblemType: INFEASIBLE, OPTIMAL, UNBOUNDED, MAX_ITER_REACHED
@@ -35,8 +35,7 @@ def check_result(result_code):
        message = "numItermax reached before optimality. Try to increase numItermax."
        warnings.warn(message)
    return message
-
-
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mode="c"] b, np.ndarray[double, ndim=2, mode="c"] M, int max_iter):
@@ -61,6 +60,12 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
    .. warning::
        Note that the M matrix needs to be a C-order :py.cls:`numpy.array`

+    .. warning::
+        The C++ solver discards all samples in the distributions with
+        zero weights. This means that while the primal variable (transport
+        matrix) is exact, the solver only returns feasible dual potentials
+        on the samples with weights different from zero.
+
    Parameters
    ----------
    a : (ns,) numpy.ndarray, float64
@@ -73,7 +78,6 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod
        The maximum number of iterations before stopping the optimization
        algorithm if it has not converged.
- Returns ------- gamma: (ns x nt) numpy.ndarray @@ -82,12 +86,19 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod """ cdef int n1= M.shape[0] cdef int n2= M.shape[1] + cdef int nmax=n1+n2-1 + cdef int result_code = 0 + cdef int nG=0 cdef double cost=0 - cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([n1, n2]) cdef np.ndarray[double, ndim=1, mode="c"] alpha=np.zeros(n1) cdef np.ndarray[double, ndim=1, mode="c"] beta=np.zeros(n2) + cdef np.ndarray[double, ndim=2, mode="c"] G=np.zeros([0, 0]) + + cdef np.ndarray[double, ndim=1, mode="c"] Gv=np.zeros(0) + cdef np.ndarray[long, ndim=1, mode="c"] iG=np.zeros(0,dtype=np.int) + cdef np.ndarray[long, ndim=1, mode="c"] jG=np.zeros(0,dtype=np.int) if not len(a): a=np.ones((n1,))/n1 @@ -95,8 +106,12 @@ def emd_c(np.ndarray[double, ndim=1, mode="c"] a, np.ndarray[double, ndim=1, mod if not len(b): b=np.ones((n2,))/n2 + # init OT matrix + G=np.zeros([n1, n2]) + # calling the function - cdef int result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter) + with nogil: + result_code = EMD_wrap(n1, n2, <double*> a.data, <double*> b.data, <double*> M.data, <double*> G.data, <double*> alpha.data, <double*> beta.data, <double*> &cost, max_iter) return G, cost, alpha, beta, result_code diff --git a/ot/lp/network_simplex_simple.h b/ot/lp/network_simplex_simple.h index 7c6a4ce..5d93040 100644 --- a/ot/lp/network_simplex_simple.h +++ b/ot/lp/network_simplex_simple.h @@ -686,7 +686,7 @@ namespace lemon { /// \see resetParams(), reset() ProblemType run() { #if DEBUG_LVL>0 - std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED\n"; + std::cout << "OPTIMAL = " << OPTIMAL << "\nINFEASIBLE = " << INFEASIBLE << "\nUNBOUNDED = " << UNBOUNDED << "\nMAX_ITER_REACHED" << MAX_ITER_REACHED << "\n" ; #endif if (!init()) return INFEASIBLE; @@ -875,7 +875,7 @@ namespace lemon { c += Number(it->second) * Number(_cost[it->first]); return c;*/ - for (int i=0; i<_flow.size(); i++) + for (unsigned long i=0; i<_flow.size(); i++) c += _flow[i] * Number(_cost[i]); return c; @@ -1257,7 +1257,7 @@ namespace lemon { u = w; } _pred[u_in] = in_arc; - _forward[u_in] = (u_in == _source[in_arc]); + _forward[u_in] = ((unsigned int)u_in == _source[in_arc]); _succ_num[u_in] = old_succ_num; // Set limits for updating _last_succ form v_in and v_out @@ -1418,7 +1418,6 @@ namespace lemon { template <typename PivotRuleImpl> ProblemType start() { PivotRuleImpl pivot(*this); - double prevCost=-1; ProblemType retVal = OPTIMAL; // Perform heuristic initial pivots diff --git a/ot/optim.py b/ot/optim.py index 0abd9e9..b9ca891 100644 --- a/ot/optim.py +++ b/ot/optim.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Optimization algorithms for OT +Generic solvers for regularized OT """ # Author: Remi Flamary <remi.flamary@unice.fr> @@ -134,7 +134,7 @@ def solve_linesearch(cost, G, deltaG, Mi, f_val, return alpha, fc, f_val -def cg(a, b, M, reg, f, df, G0=None, numItermax=200, +def cg(a, b, M, reg, f, df, G0=None, numItermax=200, numItermaxEmd=100000, stopThr=1e-9, stopThr2=1e-9, verbose=False, log=False, **kwargs): """ Solve the general regularized OT problem with conditional gradient @@ -172,6 +172,8 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, initial guess (default is indep joint density) numItermax : int, optional Max number of iterations + 
numItermaxEmd : int, optional + Max number of iterations for emd stopThr : float, optional Stop threshol on the relative variation (>0) stopThr2 : float, optional @@ -238,7 +240,7 @@ def cg(a, b, M, reg, f, df, G0=None, numItermax=200, Mi += Mi.min() # solve linear program - Gc = emd(a, b, Mi) + Gc = emd(a, b, Mi, numItermax=numItermaxEmd) deltaG = Gc - G diff --git a/ot/partial.py b/ot/partial.py new file mode 100755 index 0000000..eb707d8 --- /dev/null +++ b/ot/partial.py @@ -0,0 +1,1062 @@ +# -*- coding: utf-8 -*- +""" +Partial OT solvers +""" + +# Author: Laetitia Chapel <laetitia.chapel@irisa.fr> +# License: MIT License + +import numpy as np + +from .lp import emd + + +def partial_wasserstein_lagrange(a, b, M, reg_m=None, nb_dummies=1, log=False, + **kwargs): + r""" + Solves the partial optimal transport problem for the quadratic cost + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = \arg\min_\gamma <\gamma,(M-\lambda)>_F + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + + or equivalently (see Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. + (2018). An interpolating distance between optimal transport and Fisher–Rao + metrics. Foundations of Computational Mathematics, 18(1), 1-44.) + + .. math:: + \gamma = \arg\min_\gamma <\gamma,M>_F + \sqrt(\lambda/2) + (\|\gamma 1 - a\|_1 + \|\gamma^T 1 - b\|_1) + + s.t. + \gamma\geq 0 \\ + + + where : + + - M is the metric cost matrix + - a and b are source and target unbalanced distributions + - :math:`\lambda` is the lagragian cost. Tuning its value allows attaining + a given mass to be transported m + + The formulation of the problem has been proposed in [28]_ + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix for the quadratic cost + reg_m : float, optional + Lagragian cost + nb_dummies : int, optional, default:1 + number of reservoir points to be added (to avoid numerical + instabilities, increase its value if an error is raised) + log : bool, optional + record log if True + **kwargs : dict + parameters can be directly passed to the emd solver + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). + + Returns + ------- + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + + >>> import ot + >>> a = [.1, .2] + >>> b = [.1, .1] + >>> M = [[0., 1.], [2., 3.]] + >>> np.round(partial_wasserstein_lagrange(a,b,M), 2) + array([[0.1, 0. ], + [0. , 0.1]]) + >>> np.round(partial_wasserstein_lagrange(a,b,M,reg_m=2), 2) + array([[0.1, 0. ], + [0. , 0. ]]) + + References + ---------- + + .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in + optimal transport and Monge-Ampere obstacle problems. Annals of + mathematics, 673-730. + + See Also + -------- + ot.partial.partial_wasserstein : Partial Wasserstein with fixed mass + """ + + if np.sum(a) > 1 or np.sum(b) > 1: + raise ValueError("Problem infeasible. 
Check that a and b are in the " + "simplex") + + if reg_m is None: + reg_m = np.max(M) + 1 + if reg_m < -np.max(M): + return np.zeros((len(a), len(b))) + + eps = 1e-20 + M = np.asarray(M, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + a = np.asarray(a, dtype=np.float64) + + M_star = M - reg_m # modified cost matrix + + # trick to fasten the computation: select only the subset of columns/lines + # that can have marginals greater than 0 (that is to say M < 0) + idx_x = np.where(np.min(M_star, axis=1) < eps)[0] + idx_y = np.where(np.min(M_star, axis=0) < eps)[0] + + # extend a, b, M with "reservoir" or "dummy" points + M_extended = np.zeros((len(idx_x) + nb_dummies, len(idx_y) + nb_dummies)) + M_extended[:len(idx_x), :len(idx_y)] = M_star[np.ix_(idx_x, idx_y)] + + a_extended = np.append(a[idx_x], [(np.sum(a) - np.sum(a[idx_x]) + + np.sum(b)) / nb_dummies] * nb_dummies) + b_extended = np.append(b[idx_y], [(np.sum(b) - np.sum(b[idx_y]) + + np.sum(a)) / nb_dummies] * nb_dummies) + + gamma_extended, log_emd = emd(a_extended, b_extended, M_extended, log=True, + **kwargs) + gamma = np.zeros((len(a), len(b))) + gamma[np.ix_(idx_x, idx_y)] = gamma_extended[:-nb_dummies, :-nb_dummies] + + if log_emd['warning'] is not None: + raise ValueError("Error in the EMD resolution: try to increase the" + " number of dummy points") + log_emd['cost'] = np.sum(gamma * M) + if log: + return gamma, log_emd + else: + return gamma + + +def partial_wasserstein(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): + r""" + Solves the partial optimal transport problem for the quadratic cost + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = \arg\min_\gamma <\gamma,M>_F + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + + where : + + - M is the metric cost matrix + - a and b are source and target unbalanced distributions + - m is the amount of mass to be transported + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix for the quadratic cost + m : float, optional + amount of mass to be transported + nb_dummies : int, optional, default:1 + number of reservoir points to be added (to avoid numerical + instabilities, increase its value if an error is raised) + log : bool, optional + record log if True + **kwargs : dict + parameters can be directly passed to the emd solver + + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). + + + Returns + ------- + :math:`gamma` : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + + >>> import ot + >>> a = [.1, .2] + >>> b = [.1, .1] + >>> M = [[0., 1.], [2., 3.]] + >>> np.round(partial_wasserstein(a,b,M), 2) + array([[0.1, 0. ], + [0. , 0.1]]) + >>> np.round(partial_wasserstein(a,b,M,m=0.1), 2) + array([[0.1, 0. ], + [0. , 0. ]]) + + References + ---------- + .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in + optimal transport and Monge-Ampere obstacle problems. Annals of + mathematics, 673-730. + .. 
[29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + See Also + -------- + ot.partial.partial_wasserstein_lagrange: Partial Wasserstein with + regularization on the marginals + ot.partial.entropic_partial_wasserstein: Partial Wasserstein with a + entropic regularization parameter + """ + + if m is None: + return partial_wasserstein_lagrange(a, b, M, log=log, **kwargs) + elif m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + elif m > np.min((np.sum(a), np.sum(b))): + raise ValueError("Problem infeasible. Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + b_extended = np.append(b, [(np.sum(a) - m) / nb_dummies] * nb_dummies) + a_extended = np.append(a, [(np.sum(b) - m) / nb_dummies] * nb_dummies) + M_extended = np.zeros((len(a_extended), len(b_extended))) + M_extended[-1, -1] = np.max(M) * 1e5 + M_extended[:len(a), :len(b)] = M + + gamma, log_emd = emd(a_extended, b_extended, M_extended, log=True, + **kwargs) + if log_emd['warning'] is not None: + raise ValueError("Error in the EMD resolution: try to increase the" + " number of dummy points") + log_emd['partial_w_dist'] = np.sum(M * gamma[:len(a), :len(b)]) + + if log: + return gamma[:len(a), :len(b)], log_emd + else: + return gamma[:len(a), :len(b)] + + +def partial_wasserstein2(a, b, M, m=None, nb_dummies=1, log=False, **kwargs): + r""" + Solves the partial optimal transport problem for the quadratic cost + and returns the partial GW discrepancy + + The function considers the following problem: + + .. math:: + \gamma = \arg\min_\gamma <\gamma,M>_F + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + + where : + + - M is the metric cost matrix + - a and b are source and target unbalanced distributions + - m is the amount of mass to be transported + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix for the quadratic cost + m : float, optional + amount of mass to be transported + nb_dummies : int, optional, default:1 + number of reservoir points to be added (to avoid numerical + instabilities, increase its value if an error is raised) + log : bool, optional + record log if True + **kwargs : dict + parameters can be directly passed to the emd solver + + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). + + + Returns + ------- + :math:`gamma` : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + + >>> import ot + >>> a=[.1, .2] + >>> b=[.1, .1] + >>> M=[[0., 1.], [2., 3.]] + >>> np.round(partial_wasserstein2(a, b, M), 1) + 0.3 + >>> np.round(partial_wasserstein2(a,b,M,m=0.1), 1) + 0.0 + + References + ---------- + .. [28] Caffarelli, L. A., & McCann, R. J. (2010) Free boundaries in + optimal transport and Monge-Ampere obstacle problems. Annals of + mathematics, 673-730. + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). 
"Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + """ + + partial_gw, log_w = partial_wasserstein(a, b, M, m, nb_dummies, log=True, + **kwargs) + + log_w['T'] = partial_gw + + if log: + return np.sum(partial_gw * M), log_w + else: + return np.sum(partial_gw * M) + + +def gwgrad_partial(C1, C2, T): + """Compute the GW gradient. Note: we can not use the trick in [12]_ as + the marginals may not sum to 1. + + Parameters + ---------- + C1: array of shape (n_p,n_p) + intra-source (P) cost matrix + + C2: array of shape (n_u,n_u) + intra-target (U) cost matrix + + T : array of shape(n_p+nb_dummies, n_u) (default: None) + Transport matrix + + Returns + ------- + numpy.array of shape (n_p+nb_dummies, n_u) + gradient + + References + ---------- + .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, + "Gromov-Wasserstein averaging of kernel and distance matrices." + International Conference on Machine Learning (ICML). 2016. + """ + cC1 = np.dot(C1 ** 2 / 2, np.dot(T, np.ones(C2.shape[0]).reshape(-1, 1))) + cC2 = np.dot(np.dot(np.ones(C1.shape[0]).reshape(1, -1), T), C2 ** 2 / 2) + constC = cC1 + cC2 + A = -np.dot(C1, T).dot(C2.T) + tens = constC + A + return tens * 2 + + +def gwloss_partial(C1, C2, T): + """Compute the GW loss. + + Parameters + ---------- + C1: array of shape (n_p,n_p) + intra-source (P) cost matrix + + C2: array of shape (n_u,n_u) + intra-target (U) cost matrix + + T : array of shape(n_p+nb_dummies, n_u) (default: None) + Transport matrix + + Returns + ------- + GW loss + """ + g = gwgrad_partial(C1, C2, T) * 0.5 + return np.sum(g * T) + + +def partial_gromov_wasserstein(C1, C2, p, q, m=None, nb_dummies=1, G0=None, + thres=1, numItermax=1000, tol=1e-7, + log=False, verbose=False, **kwargs): + r""" + Solves the partial optimal transport problem + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + + s.t. 
\gamma 1 \leq a \\ + \gamma^T 1 \leq b \\ + \gamma\geq 0 \\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + + where : + + - M is the metric cost matrix + - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma) + =\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are the sample weights + - m is the amount of mass to be transported + + The formulation of the problem has been proposed in [29]_ + + + Parameters + ---------- + C1 : ndarray, shape (ns, ns) + Metric cost matrix in the source space + C2 : ndarray, shape (nt, nt) + Metric costfr matrix in the target space + p : ndarray, shape (ns,) + Distribution in the source space + q : ndarray, shape (nt,) + Distribution in the target space + m : float, optional + Amount of mass to be transported (default: min (|p|_1, |q|_1)) + nb_dummies : int, optional + Number of dummy points to add (avoid instabilities in the EMD solver) + G0 : ndarray, shape (ns, nt), optional + Initialisation of the transportation matrix + thres : float, optional + quantile of the gradient matrix to populate the cost matrix when 0 + (default: 1) + numItermax : int, optional + Max number of iterations + tol : float, optional + tolerance for stopping iterations + log : bool, optional + return log if True + verbose : bool, optional + Print information along iterations + **kwargs : dict + parameters can be directly passed to the emd solver + + + Returns + ------- + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + >>> import ot + >>> import scipy as sp + >>> a = np.array([0.25] * 4) + >>> b = np.array([0.25] * 4) + >>> x = np.array([1,2,100,200]).reshape((-1,1)) + >>> y = np.array([3,2,98,199]).reshape((-1,1)) + >>> C1 = sp.spatial.distance.cdist(x, x) + >>> C2 = sp.spatial.distance.cdist(y, y) + >>> np.round(partial_gromov_wasserstein(C1, C2, a, b),2) + array([[0. , 0.25, 0. , 0. ], + [0.25, 0. , 0. , 0. ], + [0. , 0. , 0.25, 0. ], + [0. , 0. , 0. , 0.25]]) + >>> np.round(partial_gromov_wasserstein(C1, C2, a, b, m=0.25),2) + array([[0. , 0. , 0. , 0. ], + [0. , 0. , 0. , 0. ], + [0. , 0. , 0. , 0. ], + [0. , 0. , 0. , 0.25]]) + + References + ---------- + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + """ + + if m is None: + m = np.min((np.sum(p), np.sum(q))) + elif m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + elif m > np.min((np.sum(p), np.sum(q))): + raise ValueError("Problem infeasible. 
Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + if G0 is None: + G0 = np.outer(p, q) + + dim_G_extended = (len(p) + nb_dummies, len(q) + nb_dummies) + q_extended = np.append(q, [(np.sum(p) - m) / nb_dummies] * nb_dummies) + p_extended = np.append(p, [(np.sum(q) - m) / nb_dummies] * nb_dummies) + + cpt = 0 + err = 1 + eps = 1e-20 + if log: + log = {'err': []} + + while (err > tol and cpt < numItermax): + + Gprev = G0 + + M = gwgrad_partial(C1, C2, G0) + M[M < eps] = np.quantile(M, thres) + + M_emd = np.zeros(dim_G_extended) + M_emd[:len(p), :len(q)] = M + M_emd[-nb_dummies:, -nb_dummies:] = np.max(M) * 1e5 + M_emd = np.asarray(M_emd, dtype=np.float64) + + Gc, logemd = emd(p_extended, q_extended, M_emd, log=True, **kwargs) + + if logemd['warning'] is not None: + raise ValueError("Error in the EMD resolution: try to increase the" + " number of dummy points") + + G0 = Gc[:len(p), :len(q)] + + if cpt % 10 == 0: # to speed up the computations + err = np.linalg.norm(G0 - Gprev) + if log: + log['err'].append(err) + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}|{:12s}'.format( + 'It.', 'Err', 'Loss') + '\n' + '-' * 31) + print('{:5d}|{:8e}|{:8e}'.format(cpt, err, + gwloss_partial(C1, C2, G0))) + + cpt += 1 + + if log: + log['partial_gw_dist'] = gwloss_partial(C1, C2, G0) + return G0[:len(p), :len(q)], log + else: + return G0[:len(p), :len(q)] + + +def partial_gromov_wasserstein2(C1, C2, p, q, m=None, nb_dummies=1, G0=None, + thres=1, numItermax=1000, tol=1e-7, + log=False, verbose=False, **kwargs): + r""" + Solves the partial optimal transport problem + and returns the partial Gromov-Wasserstein discrepancy + + The function considers the following problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + + s.t. \gamma 1 \leq a \\ + \gamma^T 1 \leq b \\ + \gamma\geq 0 \\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + + where : + + - M is the metric cost matrix + - :math:`\Omega` is the entropic regularization term + :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are the sample weights + - m is the amount of mass to be transported + + The formulation of the problem has been proposed in [29]_ + + + Parameters + ---------- + C1 : ndarray, shape (ns, ns) + Metric cost matrix in the source space + C2 : ndarray, shape (nt, nt) + Metric costfr matrix in the target space + p : ndarray, shape (ns,) + Distribution in the source space + q : ndarray, shape (nt,) + Distribution in the target space + m : float, optional + Amount of mass to be transported (default: min (|p|_1, |q|_1)) + nb_dummies : int, optional + Number of dummy points to add (avoid instabilities in the EMD solver) + G0 : ndarray, shape (ns, nt), optional + Initialisation of the transportation matrix + thres : float, optional + quantile of the gradient matrix to populate the cost matrix when 0 + (default: 1) + numItermax : int, optional + Max number of iterations + tol : float, optional + tolerance for stopping iterations + log : bool, optional + return log if True + verbose : bool, optional + Print information along iterations + **kwargs : dict + parameters can be directly passed to the emd solver + + + .. warning:: + When dealing with a large number of points, the EMD solver may face + some instabilities, especially when the mass associated to the dummy + point is large. To avoid them, increase the number of dummy points + (allows a smoother repartition of the mass over the points). 
+ + + Returns + ------- + partial_gw_dist : (dim_a x dim_b) ndarray + partial GW discrepancy + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + >>> import ot + >>> import scipy as sp + >>> a = np.array([0.25] * 4) + >>> b = np.array([0.25] * 4) + >>> x = np.array([1,2,100,200]).reshape((-1,1)) + >>> y = np.array([3,2,98,199]).reshape((-1,1)) + >>> C1 = sp.spatial.distance.cdist(x, x) + >>> C2 = sp.spatial.distance.cdist(y, y) + >>> np.round(partial_gromov_wasserstein2(C1, C2, a, b),2) + 1.69 + >>> np.round(partial_gromov_wasserstein2(C1, C2, a, b, m=0.25),2) + 0.0 + + References + ---------- + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + """ + + partial_gw, log_gw = partial_gromov_wasserstein(C1, C2, p, q, m, + nb_dummies, G0, thres, + numItermax, tol, True, + verbose, **kwargs) + + log_gw['T'] = partial_gw + + if log: + return log_gw['partial_gw_dist'], log_gw + else: + return log_gw['partial_gw_dist'] + + +def entropic_partial_wasserstein(a, b, M, reg, m=None, numItermax=1000, + stopThr=1e-100, verbose=False, log=False): + r""" + Solves the partial optimal transport problem + and returns the OT plan + + The function considers the following problem: + + .. math:: + \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma) + + s.t. \gamma 1 \leq a \\ + \gamma^T 1 \leq b \\ + \gamma\geq 0 \\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} \\ + + where : + + - M is the metric cost matrix + - :math:`\Omega` is the entropic regularization term + :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - a and b are the sample weights + - m is the amount of mass to be transported + + The formulation of the problem has been proposed in [3]_ (prop. 5) + + + Parameters + ---------- + a : np.ndarray (dim_a,) + Unnormalized histogram of dimension dim_a + b : np.ndarray (dim_b,) + Unnormalized histograms of dimension dim_b + M : np.ndarray (dim_a, dim_b) + cost matrix + reg : float + Regularization term > 0 + m : float, optional + Amount of mass to be transported + numItermax : int, optional + Max number of iterations + stopThr : float, optional + Stop threshold on error (>0) + verbose : bool, optional + Print information along iterations + log : bool, optional + record log if True + + + Returns + ------- + gamma : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + + Examples + -------- + >>> import ot + >>> a = [.1, .2] + >>> b = [.1, .1] + >>> M = [[0., 1.], [2., 3.]] + >>> np.round(entropic_partial_wasserstein(a, b, M, 1, 0.1), 2) + array([[0.06, 0.02], + [0.01, 0. ]]) + + References + ---------- + .. [3] Benamou, J. D., Carlier, G., Cuturi, M., Nenna, L., & Peyré, G. + (2015). Iterative Bregman projections for regularized transportation + problems. SIAM Journal on Scientific Computing, 37(2), A1111-A1138. 
+ + See Also + -------- + ot.partial.partial_wasserstein: exact Partial Wasserstein + """ + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + + dim_a, dim_b = M.shape + dx = np.ones(dim_a, dtype=np.float64) + dy = np.ones(dim_b, dtype=np.float64) + + if len(a) == 0: + a = np.ones(dim_a, dtype=np.float64) / dim_a + if len(b) == 0: + b = np.ones(dim_b, dtype=np.float64) / dim_b + + if m is None: + m = np.min((np.sum(a), np.sum(b))) * 1.0 + if m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + if m > np.min((np.sum(a), np.sum(b))): + raise ValueError("Problem infeasible. Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + log_e = {'err': []} + + # Next 3 lines equivalent to K=np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + np.multiply(K, m / np.sum(K), out=K) + + err, cpt = 1, 0 + + while (err > stopThr and cpt < numItermax): + Kprev = K + K1 = np.dot(np.diag(np.minimum(a / np.sum(K, axis=1), dx)), K) + K2 = np.dot(K1, np.diag(np.minimum(b / np.sum(K1, axis=0), dy))) + K = K2 * (m / np.sum(K2)) + + if np.any(np.isnan(K)) or np.any(np.isinf(K)): + print('Warning: numerical errors at iteration', cpt) + break + if cpt % 10 == 0: + err = np.linalg.norm(Kprev - K) + if log: + log_e['err'].append(err) + if verbose: + if cpt % 200 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 11) + print('{:5d}|{:8e}|'.format(cpt, err)) + + cpt = cpt + 1 + log_e['partial_w_dist'] = np.sum(M * K) + if log: + return K, log_e + else: + return K + + +def entropic_partial_gromov_wasserstein(C1, C2, p, q, reg, m=None, G0=None, + numItermax=1000, tol=1e-7, log=False, + verbose=False): + r""" + Returns the partial Gromov-Wasserstein transport between (C1,p) and (C2,q) + + The function solves the following optimization problem: + + .. math:: + GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + + s.t. + \gamma\geq 0 \\ + \gamma 1 \leq a\\ + \gamma^T 1 \leq b\\ + 1^T \gamma^T 1 = m \leq \min\{\|a\|_1, \|b\|_1\} + + where : + + - C1 is the metric cost matrix in the source space + - C2 is the metric cost matrix in the target space + - p and q are the sample weights + - L : quadratic loss function + - :math:`\Omega` is the entropic regularization term + :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})` + - m is the amount of mass to be transported + + The formulation of the GW problem has been proposed in [12]_ and the + partial GW in [29]_. 
+ + Parameters + ---------- + C1 : ndarray, shape (ns, ns) + Metric cost matrix in the source space + C2 : ndarray, shape (nt, nt) + Metric costfr matrix in the target space + p : ndarray, shape (ns,) + Distribution in the source space + q : ndarray, shape (nt,) + Distribution in the target space + reg: float + entropic regularization parameter + m : float, optional + Amount of mass to be transported (default: min (|p|_1, |q|_1)) + G0 : ndarray, shape (ns, nt), optional + Initialisation of the transportation matrix + numItermax : int, optional + Max number of iterations + tol : float, optional + Stop threshold on error (>0) + log : bool, optional + return log if True + verbose : bool, optional + Print information along iterations + + Examples + -------- + >>> import ot + >>> import scipy as sp + >>> a = np.array([0.25] * 4) + >>> b = np.array([0.25] * 4) + >>> x = np.array([1,2,100,200]).reshape((-1,1)) + >>> y = np.array([3,2,98,199]).reshape((-1,1)) + >>> C1 = sp.spatial.distance.cdist(x, x) + >>> C2 = sp.spatial.distance.cdist(y, y) + >>> np.round(entropic_partial_gromov_wasserstein(C1, C2, a, b,50), 2) + array([[0.12, 0.13, 0. , 0. ], + [0.13, 0.12, 0. , 0. ], + [0. , 0. , 0.25, 0. ], + [0. , 0. , 0. , 0.25]]) + >>> np.round(entropic_partial_gromov_wasserstein(C1, C2, a, b, 50, m=0.25), 2) + array([[0.02, 0.03, 0. , 0.03], + [0.03, 0.03, 0. , 0.03], + [0. , 0. , 0.03, 0. ], + [0.02, 0.02, 0. , 0.03]]) + + Returns + ------- + :math: `gamma` : (dim_a x dim_b) ndarray + Optimal transportation matrix for the given parameters + log : dict + log dictionary returned only if `log` is `True` + + References + ---------- + .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon, + "Gromov-Wasserstein averaging of kernel and distance matrices." + International Conference on Machine Learning (ICML). 2016. + .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov- + Wasserstein with Applications on Positive-Unlabeled Learning". + arXiv preprint arXiv:2002.08276. + + See Also + -------- + ot.partial.partial_gromov_wasserstein: exact Partial Gromov-Wasserstein + """ + + if G0 is None: + G0 = np.outer(p, q) + + if m is None: + m = np.min((np.sum(p), np.sum(q))) + elif m < 0: + raise ValueError("Problem infeasible. Parameter m should be greater" + " than 0.") + elif m > np.min((np.sum(p), np.sum(q))): + raise ValueError("Problem infeasible. Parameter m should lower or" + " equal than min(|a|_1, |b|_1).") + + cpt = 0 + err = 1 + + loge = {'err': []} + + while (err > tol and cpt < numItermax): + Gprev = G0 + M_entr = gwgrad_partial(C1, C2, G0) + G0 = entropic_partial_wasserstein(p, q, M_entr, reg, m) + if cpt % 10 == 0: # to speed up the computations + err = np.linalg.norm(G0 - Gprev) + if log: + loge['err'].append(err) + if verbose: + if cpt % 200 == 0: + print('{:5s}|{:12s}|{:12s}'.format( + 'It.', 'Err', 'Loss') + '\n' + '-' * 31) + print('{:5d}|{:8e}|{:8e}'.format(cpt, err, + gwloss_partial(C1, C2, G0))) + + cpt += 1 + + if log: + loge['partial_gw_dist'] = gwloss_partial(C1, C2, G0) + return G0, loge + else: + return G0 + + +def entropic_partial_gromov_wasserstein2(C1, C2, p, q, reg, m=None, G0=None, + numItermax=1000, tol=1e-7, log=False, + verbose=False): + r""" + Returns the partial Gromov-Wasserstein discrepancy between (C1,p) and + (C2,q) + + The function solves the following optimization problem: + + .. math:: + GW = \arg\min_{\gamma} \sum_{i,j,k,l} L(C1_{i,k},C2_{j,l})\cdot + \gamma_{i,j}\cdot\gamma_{k,l} + reg\cdot\Omega(\gamma) + + s.t. 
+
+        s.t. \gamma \geq 0 \\
+             \gamma 1 \leq p \\
+             \gamma^T 1 \leq q \\
+             1^T \gamma^T 1 = m \leq \min\{\|p\|_1, \|q\|_1\}
+
+    where:
+
+    - C1 is the metric cost matrix in the source space
+    - C2 is the metric cost matrix in the target space
+    - p and q are the sample weights
+    - L is the quadratic loss function
+    - :math:`\Omega` is the entropic regularization term,
+      :math:`\Omega=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - m is the amount of mass to be transported
+
+    The formulation of the GW problem has been proposed in [12]_ and the
+    partial GW in [29]_.
+
+    Parameters
+    ----------
+    C1 : ndarray, shape (ns, ns)
+        Metric cost matrix in the source space
+    C2 : ndarray, shape (nt, nt)
+        Metric cost matrix in the target space
+    p : ndarray, shape (ns,)
+        Distribution in the source space
+    q : ndarray, shape (nt,)
+        Distribution in the target space
+    reg : float
+        Entropic regularization parameter
+    m : float, optional
+        Amount of mass to be transported (default: min(|p|_1, |q|_1))
+    G0 : ndarray, shape (ns, nt), optional
+        Initialisation of the transportation matrix
+    numItermax : int, optional
+        Max number of iterations
+    tol : float, optional
+        Stop threshold on error (>0)
+    log : bool, optional
+        return log if True
+    verbose : bool, optional
+        Print information along iterations
+
+    Returns
+    -------
+    partial_gw_dist : float
+        partial Gromov-Wasserstein distance
+    log : dict
+        log dictionary returned only if `log` is `True`
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import scipy as sp
+    >>> import ot
+    >>> a = np.array([0.25] * 4)
+    >>> b = np.array([0.25] * 4)
+    >>> x = np.array([1,2,100,200]).reshape((-1,1))
+    >>> y = np.array([3,2,98,199]).reshape((-1,1))
+    >>> C1 = sp.spatial.distance.cdist(x, x)
+    >>> C2 = sp.spatial.distance.cdist(y, y)
+    >>> np.round(entropic_partial_gromov_wasserstein2(C1, C2, a, b, 50), 2)
+    1.87
+
+    References
+    ----------
+    .. [12] Peyré, Gabriel, Marco Cuturi, and Justin Solomon,
+        "Gromov-Wasserstein averaging of kernel and distance matrices."
+        International Conference on Machine Learning (ICML). 2016.
+    .. [29] Chapel, L., Alaya, M., Gasso, G. (2019). "Partial Gromov-
+        Wasserstein with Applications on Positive-Unlabeled Learning".
+        arXiv preprint arXiv:2002.08276.
+    """
+
+    partial_gw, log_gw = entropic_partial_gromov_wasserstein(C1, C2, p, q, reg,
+                                                             m, G0, numItermax,
+                                                             tol, True,
+                                                             verbose)
+
+    log_gw['T'] = partial_gw
+
+    if log:
+        return log_gw['partial_gw_dist'], log_gw
+    else:
+        return log_gw['partial_gw_dist']
diff --git a/ot/plot.py b/ot/plot.py
--- a/ot/plot.py
+++ b/ot/plot.py
@@ -78,9 +78,10 @@ def plot2D_samples_mat(xs, xt, G, thr=1e-8, **kwargs):
     thr : float, optional
         threshold above which the line is drawn
     **kwargs : dict
-        paameters given to the plot functions (default color is black if
+        parameters given to the plot functions (default color is black if
         nothing given)
     """
+
     if ('color' not in kwargs) and ('c' not in kwargs):
         kwargs['color'] = 'k'
     mx = G.max()
diff --git a/ot/smooth.py b/ot/smooth.py
index 5a8e4b5..81f6a3e 100644
--- a/ot/smooth.py
+++ b/ot/smooth.py
@@ -26,7 +26,9 @@
 #          Remi Flamary <remi.flamary@unice.fr>
 
 """
-Implementation of
+Smooth and Sparse Optimal Transport solvers (KL and L2 reg.)
+
+Implementation of:
 Smooth and Sparse Optimal Transport.
 Mathieu Blondel, Vivien Seguy, Antoine Rolet.
 In Proc. of AISTATS 2018.
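
A minimal usage sketch for the new ot.partial entropic solvers introduced above. It is standalone Python, not part of the diff; it assumes the 0.7.0 signatures shown in this patch and reuses the toy data from the docstring examples.

    # Sketch only: assumes ot.partial as added in this patch (POT 0.7.0).
    import numpy as np
    import scipy.spatial.distance as ssd
    import ot

    p = np.array([0.25] * 4)
    q = np.array([0.25] * 4)
    x = np.array([1, 2, 100, 200]).reshape((-1, 1))
    y = np.array([3, 2, 98, 199]).reshape((-1, 1))
    C1 = ssd.cdist(x, x)  # pairwise costs within the source samples
    C2 = ssd.cdist(y, y)  # pairwise costs within the target samples

    # Entropic partial GW plan; m defaults to min(|p|_1, |q|_1) = 1 here
    G = ot.partial.entropic_partial_gromov_wasserstein(C1, C2, p, q, reg=50)

    # Transport only a quarter of the mass and read the loss from the log
    G025, log = ot.partial.entropic_partial_gromov_wasserstein(
        C1, C2, p, q, reg=50, m=0.25, log=True)
    print(log['partial_gw_dist'])

    # Scalar discrepancy directly
    gw_dist = ot.partial.entropic_partial_gromov_wasserstein2(C1, C2, p, q,
                                                              reg=50)

    # Entropic partial Wasserstein on a linear ground cost
    M = ssd.cdist(x, y)
    T = ot.partial.entropic_partial_wasserstein(p, q, M, reg=1, m=0.5)

Choosing m strictly below min(|p|_1, |q|_1) is what makes the problem "partial": rows and columns of the plan are then only bounded by, not equal to, the marginals.
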
diff --git a/ot/unbalanced.py b/ot/unbalanced.py
index d516dfc..e37f10c 100644
--- a/ot/unbalanced.py
+++ b/ot/unbalanced.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Regularized Unbalanced OT
+Regularized Unbalanced OT solvers
 """
 
 # Author: Hicham Janati <hicham.janati@inria.fr>
@@ -384,10 +384,9 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000,
 
     fi = reg_m / (reg_m + reg)
 
-    cpt = 0
     err = 1.
 
-    while (err > stopThr and cpt < numItermax):
+    for i in range(numItermax):
         uprev = u
         vprev = v
 
@@ -401,28 +400,27 @@ def sinkhorn_knopp_unbalanced(a, b, M, reg, reg_m, numItermax=1000,
                 or np.any(np.isinf(u)) or np.any(np.isinf(v))):
             # we have reached the machine precision
             # come back to previous solution and quit loop
-            warnings.warn('Numerical errors at iteration %s' % cpt)
+            warnings.warn('Numerical errors at iteration %s' % i)
             u = uprev
             v = vprev
             break
-        if cpt % 10 == 0:
-            # we can speed up the process by checking for the error only all
-            # the 10th iterations
-            err_u = abs(u - uprev).max() / max(abs(u).max(), abs(uprev).max(), 1.)
-            err_v = abs(v - vprev).max() / max(abs(v).max(), abs(vprev).max(), 1.)
-            err = 0.5 * (err_u + err_v)
-            if log:
-                log['err'].append(err)
+
+        err_u = abs(u - uprev).max() / max(abs(u).max(), abs(uprev).max(), 1.)
+        err_v = abs(v - vprev).max() / max(abs(v).max(), abs(vprev).max(), 1.)
+        err = 0.5 * (err_u + err_v)
+        if log:
+            log['err'].append(err)
         if verbose:
-            if cpt % 200 == 0:
+            if i % 50 == 0:
                 print(
                     '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
-            print('{:5d}|{:8e}|'.format(cpt, err))
-        cpt += 1
+            print('{:5d}|{:8e}|'.format(i, err))
+        if err < stopThr:
+            break
 
     if log:
-        log['logu'] = np.log(u + 1e-16)
-        log['logv'] = np.log(v + 1e-16)
+        log['logu'] = np.log(u + 1e-300)
+        log['logv'] = np.log(v + 1e-300)
 
     if n_hists:
         # return only loss
         res = np.einsum('ik,ij,jk,ij->k', u, K, v, M)
@@ -747,8 +745,8 @@ def barycenter_unbalanced_stabilized(A, M, reg, reg_m, weights=None, tau=1e3,
     alpha = np.zeros(dim)
     beta = np.zeros(dim)
     q = np.ones(dim) / dim
-    while (err > stopThr and cpt < numItermax):
-        qprev = q
+    for i in range(numItermax):
+        qprev = q.copy()
         Kv = K.dot(v)
         f_alpha = np.exp(- alpha / (reg + reg_m))
         f_beta = np.exp(- beta / (reg + reg_m))
@@ -777,7 +775,7 @@
-            warnings.warn('Numerical errors at iteration %s' % cpt)
+            warnings.warn('Numerical errors at iteration %s' % i)
             q = qprev
             break
-        if (cpt % 10 == 0 and not absorbing) or cpt == 0:
+        if (i % 10 == 0 and not absorbing) or i == 0:
             # we can speed up the process by checking for the error only all
             # the 10th iterations
             err = abs(q - qprev).max() / max(abs(q).max(),
@@ -785,20 +783,21 @@
             if log:
                 log['err'].append(err)
             if verbose:
-                if cpt % 50 == 0:
+                if i % 50 == 0:
                     print(
                         '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
-                print('{:5d}|{:8e}|'.format(cpt, err))
+                print('{:5d}|{:8e}|'.format(i, err))
+            if err < stopThr:
+                break
 
-        cpt += 1
     if err > stopThr:
         warnings.warn("Stabilized Unbalanced Sinkhorn did not converge." +
                       "Try a larger entropy `reg` or a lower mass `reg_m`." +
+ "Or a larger absorption threshold `tau`.") if log: - log['niter'] = cpt - log['logu'] = np.log(u + 1e-16) - log['logv'] = np.log(v + 1e-16) + log['niter'] = i + log['logu'] = np.log(u + 1e-300) + log['logv'] = np.log(v + 1e-300) return q, log else: return q @@ -882,15 +881,15 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, fi = reg_m / (reg_m + reg) - v = np.ones((dim, n_hists)) / dim - u = np.ones((dim, 1)) / dim - - cpt = 0 + v = np.ones((dim, n_hists)) + u = np.ones((dim, 1)) + q = np.ones(dim) err = 1. - while (err > stopThr and cpt < numItermax): - uprev = u - vprev = v + for i in range(numItermax): + uprev = u.copy() + vprev = v.copy() + qprev = q.copy() Kv = K.dot(v) u = (A / Kv) ** fi @@ -905,31 +904,30 @@ def barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, weights=None, or np.any(np.isinf(u)) or np.any(np.isinf(v))): # we have reached the machine precision # come back to previous solution and quit loop - warnings.warn('Numerical errors at iteration %s' % cpt) + warnings.warn('Numerical errors at iteration %s' % i) u = uprev v = vprev + q = qprev break - if cpt % 10 == 0: - # we can speed up the process by checking for the error only all - # the 10th iterations - err_u = abs(u - uprev).max() - err_u /= max(abs(u).max(), abs(uprev).max(), 1.) - err_v = abs(v - vprev).max() - err_v /= max(abs(v).max(), abs(vprev).max(), 1.) - err = 0.5 * (err_u + err_v) - if log: - log['err'].append(err) - if verbose: - if cpt % 50 == 0: - print( - '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) - print('{:5d}|{:8e}|'.format(cpt, err)) + # compute change in barycenter + err = abs(q - qprev).max() + err /= max(abs(q).max(), abs(qprev).max(), 1.) + if log: + log['err'].append(err) + # if barycenter did not change + at least 10 iterations - stop + if err < stopThr and i > 10: + break + + if verbose: + if i % 10 == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + print('{:5d}|{:8e}|'.format(i, err)) - cpt += 1 if log: - log['niter'] = cpt - log['logu'] = np.log(u + 1e-16) - log['logv'] = np.log(v + 1e-16) + log['niter'] = i + log['logu'] = np.log(u + 1e-300) + log['logv'] = np.log(v + 1e-300) return q, log else: return q @@ -1002,12 +1000,14 @@ def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, if method.lower() == 'sinkhorn': return barycenter_unbalanced_sinkhorn(A, M, reg, reg_m, + weights=weights, numItermax=numItermax, stopThr=stopThr, verbose=verbose, log=log, **kwargs) elif method.lower() == 'sinkhorn_stabilized': return barycenter_unbalanced_stabilized(A, M, reg, reg_m, + weights=weights, numItermax=numItermax, stopThr=stopThr, verbose=verbose, @@ -1015,6 +1015,7 @@ def barycenter_unbalanced(A, M, reg, reg_m, method="sinkhorn", weights=None, elif method.lower() in ['sinkhorn_reg_scaling']: warnings.warn('Method not implemented yet. 
         return barycenter_unbalanced(A, M, reg, reg_m,
+                                     weights=weights,
                                      numItermax=numItermax,
                                      stopThr=stopThr, verbose=verbose,
                                      log=log, **kwargs)
diff --git a/ot/utils.py b/ot/utils.py
index b71458b..f9911a1 100644
--- a/ot/utils.py
+++ b/ot/utils.py
@@ -49,6 +49,12 @@ def kernel(x1, x2, method='gaussian', sigma=1, **kwargs):
     return K
 
 
+def laplacian(x):
+    """Compute Laplacian matrix (degree matrix minus similarity matrix)"""
+    L = np.diag(np.sum(x, axis=0)) - x
+    return L
+
+
 def unif(n):
     """ return a uniform histogram of length n (simplex)
 
@@ -200,6 +206,28 @@ def dots(*args):
     return reduce(np.dot, args)
 
 
+def label_normalization(y, start=0):
+    """ Transform labels to start at a given value
+
+    Parameters
+    ----------
+    y : array-like, shape (n, )
+        The vector of labels to be normalized.
+    start : int
+        Desired value for the smallest label in y (default=0)
+
+    Returns
+    -------
+    y : array-like, shape (n, )
+        The input vector of labels normalized according to the given start
+        value.
+    """
+
+    diff = np.min(np.unique(y)) - start
+    if diff != 0:
+        y -= diff
+    return y
+
+
 def fun(f, q_in, q_out):
     """ Utility function for parmap with no serializing problems """
     while True:
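
The ot/unbalanced.py hunks above make barycenter_unbalanced forward its weights argument to the per-method solvers, so non-uniform barycenter weights now take effect. A standalone sketch (not part of the diff; the toy histograms and regularization values are hypothetical):

    # Sketch only: assumes ot.unbalanced as patched above (POT 0.7.0).
    import numpy as np
    import ot

    dim = 10
    grid = np.arange(dim, dtype=np.float64)
    a1 = np.exp(-(grid - 2.0) ** 2 / 2.0)
    a1 /= a1.sum()  # first input histogram
    a2 = np.exp(-(grid - 7.0) ** 2 / 2.0)
    a2 /= a2.sum()  # second input histogram
    A = np.vstack((a1, a2)).T  # histograms as columns, shape (dim, 2)
    M = (grid[:, None] - grid[None, :]) ** 2  # squared ground cost
    M /= M.max()

    # Uneven weights are now passed through to the sinkhorn solver
    bary = ot.unbalanced.barycenter_unbalanced(A, M, reg=1e-1, reg_m=1.0,
                                               weights=np.array([0.3, 0.7]),
                                               method='sinkhorn')
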
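A short sketch of the two new ot.utils helpers added in the last hunks (standalone Python, not part of the diff; the toy arrays are illustrative):

    # Sketch only: exercises ot.utils.label_normalization and
    # ot.utils.laplacian exactly as defined in the hunks above.
    import numpy as np
    import ot

    y = np.array([2, 3, 3, 5, 2])
    print(ot.utils.label_normalization(y.copy()))           # -> [0 1 1 3 0]
    print(ot.utils.label_normalization(y.copy(), start=1))  # -> [1 2 2 4 1]
    # note: the input array is shifted in place, hence the copies above

    S = np.array([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])  # symmetric similarity matrix
    L = ot.utils.laplacian(S)     # diag(column sums) minus S
    print(L)

Both helpers support the Laplacian-regularized domain adaptation code added to ot/da.py in this same release.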