path: root/ot/gromov.py
Diffstat (limited to 'ot/gromov.py')
-rw-r--r-- ot/gromov.py 2835
1 file changed, 0 insertions, 2835 deletions
diff --git a/ot/gromov.py b/ot/gromov.py
deleted file mode 100644
index 55ab0bd..0000000
--- a/ot/gromov.py
+++ /dev/null
@@ -1,2835 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Gromov-Wasserstein and Fused-Gromov-Wasserstein solvers
-"""
-
-# Author: Erwan Vautier <erwan.vautier@gmail.com>
-# Nicolas Courty <ncourty@irisa.fr>
-# Rémi Flamary <remi.flamary@unice.fr>
-# Titouan Vayer <titouan.vayer@irisa.fr>
-# Cédric Vincent-Cuaz <cedric.vincent-cuaz@inria.fr>
-#
-# License: MIT License
-
-import numpy as np
-
-
-from .bregman import sinkhorn
-from .utils import dist, UndefinedParameter, list_to_array
-from .optim import cg
-from .lp import emd_1d, emd
-from .utils import check_random_state, unif
-from .backend import get_backend
-
-
-def init_matrix(C1, C2, p, q, loss_fun='square_loss'):
- r"""Return loss matrices and tensors for Gromov-Wasserstein fast computation
-
- Returns the value of :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` with the
- selected loss function as the loss function of the Gromov-Wasserstein discrepancy.
-
- The matrices are computed as described in Proposition 1 in :ref:`[12] <references-init-matrix>`
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - :math:`\mathbf{T}`: A coupling between those two spaces
-
- The square-loss function :math:`L(a, b) = |a - b|^2` is read as:
-
- .. math::
-
- L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)
-
- \mathrm{with} \ f_1(a) &= a^2
-
- f_2(b) &= b^2
-
- h_1(a) &= a
-
- h_2(b) &= 2b
-
- The kl-loss function :math:`L(a, b) = a \log\left(\frac{a}{b}\right) - a + b` is read as:
-
- .. math::
-
- L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)
-
- \mathrm{with} \ f_1(a) &= a \log(a) - a
-
- f_2(b) &= b
-
- h_1(a) &= a
-
- h_2(b) &= \log(b)
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : str, optional
- Loss function used for the solver, either 'square_loss' or 'kl_loss'
-
- Returns
- -------
- constC : array-like, shape (ns, nt)
- Constant :math:`\mathbf{C}` matrix in Eq. (6)
- hC1 : array-like, shape (ns, ns)
- :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
- hC2 : array-like, shape (nt, nt)
- :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
-
-
- .. _references-init-matrix:
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
- """
- C1, C2, p, q = list_to_array(C1, C2, p, q)
- nx = get_backend(C1, C2, p, q)
-
- if loss_fun == 'square_loss':
- def f1(a):
- return (a**2)
-
- def f2(b):
- return (b**2)
-
- def h1(a):
- return a
-
- def h2(b):
- return 2 * b
- elif loss_fun == 'kl_loss':
- def f1(a):
- return a * nx.log(a + 1e-15) - a
-
- def f2(b):
- return b
-
- def h1(a):
- return a
-
- def h2(b):
- return nx.log(b + 1e-15)
-
- constC1 = nx.dot(
- nx.dot(f1(C1), nx.reshape(p, (-1, 1))),
- nx.ones((1, len(q)), type_as=q)
- )
- constC2 = nx.dot(
- nx.ones((len(p), 1), type_as=p),
- nx.dot(nx.reshape(q, (1, -1)), f2(C2).T)
- )
- constC = constC1 + constC2
- hC1 = h1(C1)
- hC2 = h2(C2)
-
- return constC, hC1, hC2
-
-
-def tensor_product(constC, hC1, hC2, T):
- r"""Return the tensor for Gromov-Wasserstein fast computation
-
- The tensor is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-tensor-product>`
-
- Parameters
- ----------
- constC : array-like, shape (ns, nt)
- Constant :math:`\mathbf{C}` matrix in Eq. (6)
- hC1 : array-like, shape (ns, ns)
- :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
- hC2 : array-like, shape (nt, nt)
- :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
- T : array-like, shape (ns, nt)
- Current value of transport matrix :math:`\mathbf{T}`
-
- Returns
- -------
- tens : array-like, shape (`ns`, `nt`)
- :math:`\mathcal{L}(\mathbf{C_1}, \mathbf{C_2}) \otimes \mathbf{T}` tensor-matrix multiplication result
-
-
- .. _references-tensor-product:
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
- """
- constC, hC1, hC2, T = list_to_array(constC, hC1, hC2, T)
- nx = get_backend(constC, hC1, hC2, T)
-
- A = - nx.dot(
- nx.dot(hC1, T), hC2.T
- )
- tens = constC + A
- # tens -= tens.min()
- return tens
-
-
-def gwloss(constC, hC1, hC2, T):
- r"""Return the Loss for Gromov-Wasserstein
-
- The loss is computed as described in Proposition 1 Eq. (6) in :ref:`[12] <references-gwloss>`
-
- Parameters
- ----------
- constC : array-like, shape (ns, nt)
- Constant :math:`\mathbf{C}` matrix in Eq. (6)
- hC1 : array-like, shape (ns, ns)
- :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
- hC2 : array-like, shape (nt, nt)
- :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
- T : array-like, shape (ns, nt)
- Current value of transport matrix :math:`\mathbf{T}`
-
- Returns
- -------
- loss : float
- Gromov-Wasserstein loss
-
-
- .. _references-gwloss:
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
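-
- Examples
- --------
- A minimal numpy sketch (illustrative values) checking the factored
- computation of Eq. (6) against the brute-force double sum:
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(0)
- >>> C1 = ot.dist(rng.randn(3, 2))
- >>> C2 = ot.dist(rng.randn(4, 2))
- >>> p, q = ot.unif(3), ot.unif(4)
- >>> T = np.outer(p, q)
- >>> constC, hC1, hC2 = ot.gromov.init_matrix(C1, C2, p, q, 'square_loss')
- >>> loss = ot.gromov.gwloss(constC, hC1, hC2, T)
- >>> brute = sum((C1[i, k] - C2[j, l]) ** 2 * T[i, j] * T[k, l] for i in range(3) for j in range(4) for k in range(3) for l in range(4))
- >>> bool(np.isclose(loss, brute))
- True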
-
- """
-
- tens = tensor_product(constC, hC1, hC2, T)
-
- tens, T = list_to_array(tens, T)
- nx = get_backend(tens, T)
-
- return nx.sum(tens * T)
-
-
-def gwggrad(constC, hC1, hC2, T):
- r"""Return the gradient for Gromov-Wasserstein
-
- The gradient is computed as described in Proposition 2 in :ref:`[12] <references-gwggrad>`
-
- Parameters
- ----------
- constC : array-like, shape (ns, nt)
- Constant :math:`\mathbf{C}` matrix in Eq. (6)
- hC1 : array-like, shape (ns, ns)
- :math:`\mathbf{h1}(\mathbf{C1})` matrix in Eq. (6)
- hC2 : array-like, shape (nt, nt)
- :math:`\mathbf{h2}(\mathbf{C2})` matrix in Eq. (6)
- T : array-like, shape (ns, nt)
- Current value of transport matrix :math:`\mathbf{T}`
-
- Returns
- -------
- grad : array-like, shape (`ns`, `nt`)
- Gromov-Wasserstein gradient
-
-
- .. _references-gwggrad:
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
- """
- return 2 * tensor_product(constC, hC1, hC2,
- T) # [12] Prop. 2 misses a 2 factor
-
-
-def update_square_loss(p, lambdas, T, Cs):
- r"""
- Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s`
- couplings calculated at each iteration
-
- Parameters
- ----------
- p : array-like, shape (N,)
- Masses in the targeted barycenter.
- lambdas : list of float
- List of the `S` spaces' weights.
- T : list of S array-like of shape (ns,N)
- The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
- Cs : list of S array-like, shape(ns,ns)
- Metric cost matrices.
-
- Returns
- -------
- C : array-like, shape (`N`, `N`)
- Updated :math:`\mathbf{C}` matrix.
- """
- T = list_to_array(*T)
- Cs = list_to_array(*Cs)
- p = list_to_array(p)
- nx = get_backend(p, *T, *Cs)
-
- tmpsum = sum([
- lambdas[s] * nx.dot(
- nx.dot(T[s].T, Cs[s]),
- T[s]
- ) for s in range(len(T))
- ])
- ppt = nx.outer(p, p)
-
- return tmpsum / ppt
-
-
-def update_kl_loss(p, lambdas, T, Cs):
- r"""
- Updates :math:`\mathbf{C}` according to the KL Loss kernel with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration
-
-
- Parameters
- ----------
- p : array-like, shape (N,)
- Weights in the targeted barycenter.
- lambdas : list of float
- List of the `S` spaces' weights
- T : list of S array-like of shape (ns,N)
- The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
- Cs : list of S array-like, shape(ns,ns)
- Metric cost matrices.
-
- Returns
- -------
- C : array-like, shape (`N`, `N`)
- Updated :math:`\mathbf{C}` matrix
- """
- Cs = list_to_array(*Cs)
- T = list_to_array(*T)
- p = list_to_array(p)
- nx = get_backend(p, *T, *Cs)
-
- tmpsum = sum([
- lambdas[s] * nx.dot(
- nx.dot(T[s].T, Cs[s]),
- T[s]
- ) for s in range(len(T))
- ])
- ppt = nx.outer(p, p)
-
- return nx.exp(tmpsum / ppt)
-
-
-def gromov_wasserstein(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=False, G0=None, **kwargs):
- r"""
- Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
-
- The function solves the following optimization problem:
-
- .. math::
- \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
- L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - :math:`\mathbf{p}`: distribution in the source space
- - :math:`\mathbf{q}`: distribution in the target space
- - `L`: loss function to account for the misfit between the similarity matrices
-
- .. note:: This function is backend-compatible and will work on arrays
- from all compatible backends. But the algorithm uses the C++ CPU backend
- which can lead to copy overhead on GPU arrays.
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : str
- Loss function used for the solver, either 'square_loss' or 'kl_loss'
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- record log if True
- armijo : bool, optional
- If True, the step of the line-search is found via an Armijo search. Else the closed form is used.
- If there are convergence issues, use False.
- G0: array-like, shape (ns,nt), optional
- If None, the initial transport plan of the solver is :math:`\mathbf{p} \mathbf{q}^\top`.
- Otherwise G0 must satisfy the marginal constraints and is used as the initial transport plan of the solver.
- **kwargs : dict
- parameters can be directly passed to the ot.optim.cg solver
-
- Returns
- -------
- T : array-like, shape (`ns`, `nt`)
- Coupling between the two spaces that minimizes:
-
- :math:`\sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}`
- log : dict
- Convergence information and loss.
-
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
- .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the
- metric approach to object matching. Foundations of computational
- mathematics 11.4 (2011): 417-487.
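-
- Examples
- --------
- A minimal sketch on two small random point clouds (illustrative
- values; `ot.dist` and `ot.unif` are the usual POT helpers):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> xs = rng.randn(4, 2)
- >>> xt = xs[::-1].copy()
- >>> C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
- >>> p, q = ot.unif(4), ot.unif(4)
- >>> T = ot.gromov.gromov_wasserstein(C1, C2, p, q, 'square_loss')
- >>> T.shape
- (4, 4)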
-
- """
- p, q = list_to_array(p, q)
- p0, q0, C10, C20 = p, q, C1, C2
- if G0 is None:
- nx = get_backend(p0, q0, C10, C20)
- else:
- G0_ = G0
- nx = get_backend(p0, q0, C10, C20, G0_)
- p = nx.to_numpy(p)
- q = nx.to_numpy(q)
- C1 = nx.to_numpy(C10)
- C2 = nx.to_numpy(C20)
-
- if G0 is None:
- G0 = p[:, None] * q[None, :]
- else:
- G0 = nx.to_numpy(G0_)
- # Check marginals of G0
- np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08)
- np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08)
-
- constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
-
- def f(G):
- return gwloss(constC, hC1, hC2, G)
-
- def df(G):
- return gwggrad(constC, hC1, hC2, G)
-
- if log:
- res, log = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
- log['gw_dist'] = nx.from_numpy(gwloss(constC, hC1, hC2, res), type_as=C10)
- log['u'] = nx.from_numpy(log['u'], type_as=C10)
- log['v'] = nx.from_numpy(log['v'], type_as=C10)
- return nx.from_numpy(res, type_as=C10), log
- else:
- return nx.from_numpy(cg(p, q, 0, 1, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=False, **kwargs), type_as=C10)
-
-
-def gromov_wasserstein2(C1, C2, p, q, loss_fun='square_loss', log=False, armijo=False, G0=None, **kwargs):
- r"""
- Returns the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
-
- The function solves the following optimization problem:
-
- .. math::
- GW = \min_\mathbf{T} \quad \sum_{i,j,k,l}
- L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - :math:`\mathbf{p}`: distribution in the source space
- - :math:`\mathbf{q}`: distribution in the target space
- - `L`: loss function to account for the misfit between the similarity
- matrices
-
- Note that when using backends, this loss function is differentiable w.r.t. the
- matrices and weights for the quadratic loss using the gradients from [38]_.
-
- .. note:: This function is backend-compatible and will work on arrays
- from all compatible backends. But the algorithm uses the C++ CPU backend
- which can lead to copy overhead on GPU arrays.
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space.
- q : array-like, shape (nt,)
- Distribution in the target space.
- loss_fun : str
- Loss function used for the solver, either 'square_loss' or 'kl_loss'
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- record log if True
- armijo : bool, optional
- If True, the step of the line-search is found via an Armijo search. Else the closed form is used.
- If there are convergence issues, use False.
- G0: array-like, shape (ns,nt), optional
- If None, the initial transport plan of the solver is :math:`\mathbf{p} \mathbf{q}^\top`.
- Otherwise G0 must satisfy the marginal constraints and is used as the initial transport plan of the solver.
-
- Returns
- -------
- gw_dist : float
- Gromov-Wasserstein distance
- log : dict
- Convergence information and coupling matrix
-
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
-
- .. [13] Mémoli, Facundo. Gromov–Wasserstein distances and the
- metric approach to object matching. Foundations of computational
- mathematics 11.4 (2011): 417-487.
-
- .. [38] C. Vincent-Cuaz, T. Vayer, R. Flamary, M. Corneli, N. Courty, Online
- Graph Dictionary Learning, International Conference on Machine Learning
- (ICML), 2021.
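-
- Examples
- --------
- A minimal sketch (illustrative values): the discrepancy between a
- point cloud and a shuffled copy of itself, expected to be small:
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> xs = rng.randn(4, 2)
- >>> xt = xs[::-1].copy()
- >>> C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
- >>> p, q = ot.unif(4), ot.unif(4)
- >>> gw_dist = ot.gromov.gromov_wasserstein2(C1, C2, p, q, 'square_loss')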
-
- """
- p, q = list_to_array(p, q)
- p0, q0, C10, C20 = p, q, C1, C2
- if G0 is None:
- nx = get_backend(p0, q0, C10, C20)
- else:
- G0_ = G0
- nx = get_backend(p0, q0, C10, C20, G0_)
-
- p = nx.to_numpy(p)
- q = nx.to_numpy(q)
- C1 = nx.to_numpy(C10)
- C2 = nx.to_numpy(C20)
-
- constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
-
- if G0 is None:
- G0 = p[:, None] * q[None, :]
- else:
- G0 = nx.to_numpy(G0_)
- # Check marginals of G0
- np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08)
- np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08)
-
- def f(G):
- return gwloss(constC, hC1, hC2, G)
-
- def df(G):
- return gwggrad(constC, hC1, hC2, G)
-
- T, log_gw = cg(p, q, 0, 1, f, df, G0, log=True, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs)
-
- T0 = nx.from_numpy(T, type_as=C10)
-
- log_gw['gw_dist'] = nx.from_numpy(gwloss(constC, hC1, hC2, T), type_as=C10)
- log_gw['u'] = nx.from_numpy(log_gw['u'], type_as=C10)
- log_gw['v'] = nx.from_numpy(log_gw['v'], type_as=C10)
- log_gw['T'] = T0
-
- gw = log_gw['gw_dist']
-
- if loss_fun == 'square_loss':
- gC1 = 2 * C1 * (p[:, None] * p[None, :]) - 2 * T.dot(C2).dot(T.T)
- gC2 = 2 * C2 * (q[:, None] * q[None, :]) - 2 * T.T.dot(C1).dot(T)
- gC1 = nx.from_numpy(gC1, type_as=C10)
- gC2 = nx.from_numpy(gC2, type_as=C10)
- gw = nx.set_gradients(gw, (p0, q0, C10, C20),
- (log_gw['u'] - nx.mean(log_gw['u']),
- log_gw['v'] - nx.mean(log_gw['v']), gC1, gC2))
-
- if log:
- return gw, log_gw
- else:
- return gw
-
-
-def fused_gromov_wasserstein(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, G0=None, log=False, **kwargs):
- r"""
- Computes the FGW transport between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein>`)
-
- .. math::
- \gamma = \mathop{\arg \min}_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F +
- \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
- s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p}
-
- \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q}
-
- \mathbf{\gamma} &\geq 0
-
- where :
-
- - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
- - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
- - `L` is a loss function to account for the misfit between the similarity matrices
-
- .. note:: This function is backend-compatible and will work on arrays
- from all compatible backends. But the algorithm uses the C++ CPU backend
- which can lead to copy overhead on GPU arrays.
-
- The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein>`
-
- Parameters
- ----------
- M : array-like, shape (ns, nt)
- Metric cost matrix between features across domains
- C1 : array-like, shape (ns, ns)
- Metric cost matrix representative of the structure in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix representative of the structure in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : str, optional
- Loss function used for the solver
- alpha : float, optional
- Trade-off parameter (0 < alpha < 1)
- armijo : bool, optional
- If True, the step of the line-search is found via an Armijo search. Else the closed form is used.
- If there are convergence issues, use False.
- G0: array-like, shape (ns,nt), optional
- If None, the initial transport plan of the solver is :math:`\mathbf{p} \mathbf{q}^\top`.
- Otherwise G0 must satisfy the marginal constraints and is used as the initial transport plan of the solver.
- log : bool, optional
- record log if True
- **kwargs : dict
- parameters can be directly passed to the ot.optim.cg solver
-
- Returns
- -------
- gamma : array-like, shape (`ns`, `nt`)
- Optimal transportation matrix for the given parameters.
- log : dict
- Log dictionary return only if log==True in parameters.
-
-
- .. _references-fused-gromov-wasserstein:
- References
- ----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
- and Courty Nicolas "Optimal Transport for structured data with
- application on graphs", International Conference on Machine Learning
- (ICML). 2019.
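-
- Examples
- --------
- A minimal sketch combining a feature cost M with the two structure
- matrices (illustrative values):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> xs, xt = rng.randn(4, 2), rng.randn(5, 2)
- >>> M = ot.dist(xs, xt)
- >>> C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
- >>> p, q = ot.unif(4), ot.unif(5)
- >>> gamma = ot.gromov.fused_gromov_wasserstein(M, C1, C2, p, q, alpha=0.5)
- >>> gamma.shape
- (4, 5)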
- """
- p, q = list_to_array(p, q)
- p0, q0, C10, C20, M0 = p, q, C1, C2, M
- if G0 is None:
- nx = get_backend(p0, q0, C10, C20, M0)
- else:
- G0_ = G0
- nx = get_backend(p0, q0, C10, C20, M0, G0_)
-
- p = nx.to_numpy(p)
- q = nx.to_numpy(q)
- C1 = nx.to_numpy(C10)
- C2 = nx.to_numpy(C20)
- M = nx.to_numpy(M0)
- if G0 is None:
- G0 = p[:, None] * q[None, :]
- else:
- G0 = nx.to_numpy(G0_)
- # Check marginals of G0
- np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08)
- np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08)
-
- constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
-
- def f(G):
- return gwloss(constC, hC1, hC2, G)
-
- def df(G):
- return gwggrad(constC, hC1, hC2, G)
-
- if log:
- res, log = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
- fgw_dist = nx.from_numpy(log['loss'][-1], type_as=C10)
- log['fgw_dist'] = fgw_dist
- log['u'] = nx.from_numpy(log['u'], type_as=C10)
- log['v'] = nx.from_numpy(log['v'], type_as=C10)
- return nx.from_numpy(res, type_as=C10), log
- else:
- return nx.from_numpy(cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, **kwargs), type_as=C10)
-
-
-def fused_gromov_wasserstein2(M, C1, C2, p, q, loss_fun='square_loss', alpha=0.5, armijo=False, G0=None, log=False, **kwargs):
- r"""
- Computes the FGW distance between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein2>`)
-
- .. math::
- \min_\gamma \quad (1 - \alpha) \langle \gamma, \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l}
- L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
- s.t. \ \mathbf{\gamma} \mathbf{1} &= \mathbf{p}
-
- \mathbf{\gamma}^T \mathbf{1} &= \mathbf{q}
-
- \mathbf{\gamma} &\geq 0
-
- where :
-
- - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
- - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
- - `L` is a loss function to account for the misfit between the similarity matrices
-
- The algorithm used for solving the problem is conditional gradient as
- discussed in :ref:`[24] <references-fused-gromov-wasserstein2>`
-
- .. note:: This function is backend-compatible and will work on arrays
- from all compatible backends. But the algorithm uses the C++ CPU backend
- which can lead to copy overhead on GPU arrays.
-
- Note that when using backends, this loss function is differentiable w.r.t. the
- matrices and weights for the quadratic loss using the gradients from [38]_.
-
- Parameters
- ----------
- M : array-like, shape (ns, nt)
- Metric cost matrix between features across domains
- C1 : array-like, shape (ns, ns)
- Metric cost matrix representative of the structure in the source space.
- C2 : array-like, shape (nt, nt)
- Metric cost matrix representative of the structure in the target space.
- p : array-like, shape (ns,)
- Distribution in the source space.
- q : array-like, shape (nt,)
- Distribution in the target space.
- loss_fun : str, optional
- Loss function used for the solver.
- alpha : float, optional
- Trade-off parameter (0 < alpha < 1)
- armijo : bool, optional
- If True, the step of the line-search is found via an Armijo search.
- Else the closed form is used. If there are convergence issues, use False.
- G0: array-like, shape (ns,nt), optional
- If None, the initial transport plan of the solver is :math:`\mathbf{p} \mathbf{q}^\top`.
- Otherwise G0 must satisfy the marginal constraints and is used as the initial transport plan of the solver.
- log : bool, optional
- Record log if True.
- **kwargs : dict
- Parameters can be directly passed to the ot.optim.cg solver.
-
- Returns
- -------
- fgw_dist : float
- Fused Gromov-Wasserstein distance for the given parameters.
- log : dict
- Log dictionary return only if log==True in parameters.
-
-
- .. _references-fused-gromov-wasserstein2:
- References
- ----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
- and Courty Nicolas
- "Optimal Transport for structured data with application on graphs"
- International Conference on Machine Learning (ICML). 2019.
-
- .. [38] C. Vincent-Cuaz, T. Vayer, R. Flamary, M. Corneli, N. Courty, Online
- Graph Dictionary Learning, International Conference on Machine Learning
- (ICML), 2021.
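-
- Examples
- --------
- A minimal sketch returning the FGW value and its log, in which
- log['T'] holds the optimal coupling (illustrative values):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> xs, xt = rng.randn(4, 2), rng.randn(5, 2)
- >>> M = ot.dist(xs, xt)
- >>> C1, C2 = ot.dist(xs, xs), ot.dist(xt, xt)
- >>> p, q = ot.unif(4), ot.unif(5)
- >>> fgw_dist, log = ot.gromov.fused_gromov_wasserstein2(M, C1, C2, p, q, alpha=0.5, log=True)
- >>> log['T'].shape
- (4, 5)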
- """
- p, q = list_to_array(p, q)
-
- p0, q0, C10, C20, M0 = p, q, C1, C2, M
- if G0 is None:
- nx = get_backend(p0, q0, C10, C20, M0)
- else:
- G0_ = G0
- nx = get_backend(p0, q0, C10, C20, M0, G0_)
-
- p = nx.to_numpy(p)
- q = nx.to_numpy(q)
- C1 = nx.to_numpy(C10)
- C2 = nx.to_numpy(C20)
- M = nx.to_numpy(M0)
-
- constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
-
- if G0 is None:
- G0 = p[:, None] * q[None, :]
- else:
- G0 = nx.to_numpy(G0_)
- # Check marginals of G0
- np.testing.assert_allclose(G0.sum(axis=1), p, atol=1e-08)
- np.testing.assert_allclose(G0.sum(axis=0), q, atol=1e-08)
-
- def f(G):
- return gwloss(constC, hC1, hC2, G)
-
- def df(G):
- return gwggrad(constC, hC1, hC2, G)
-
- T, log_fgw = cg(p, q, (1 - alpha) * M, alpha, f, df, G0, armijo=armijo, C1=C1, C2=C2, constC=constC, log=True, **kwargs)
-
- fgw_dist = nx.from_numpy(log_fgw['loss'][-1], type_as=C10)
-
- T0 = nx.from_numpy(T, type_as=C10)
-
- log_fgw['fgw_dist'] = fgw_dist
- log_fgw['u'] = nx.from_numpy(log_fgw['u'], type_as=C10)
- log_fgw['v'] = nx.from_numpy(log_fgw['v'], type_as=C10)
- log_fgw['T'] = T0
-
- if loss_fun == 'square_loss':
- gC1 = 2 * C1 * (p[:, None] * p[None, :]) - 2 * T.dot(C2).dot(T.T)
- gC2 = 2 * C2 * (q[:, None] * q[None, :]) - 2 * T.T.dot(C1).dot(T)
- gC1 = nx.from_numpy(gC1, type_as=C10)
- gC2 = nx.from_numpy(gC2, type_as=C10)
- fgw_dist = nx.set_gradients(fgw_dist, (p0, q0, C10, C20, M0),
- (log_fgw['u'] - nx.mean(log_fgw['u']),
- log_fgw['v'] - nx.mean(log_fgw['v']),
- alpha * gC1, alpha * gC2, (1 - alpha) * T0))
-
- if log:
- return fgw_dist, log_fgw
- else:
- return fgw_dist
-
-
-def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
- nb_samples_p=None, nb_samples_q=None, std=True, random_state=None):
- r"""
- Returns an approximation of the Gromov-Wasserstein cost between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
- with a fixed transport plan :math:`\mathbf{T}`.
-
- The function gives an unbiased approximation of the following equation:
-
- .. math::
-
- GW = \sum_{i,j,k,l} L(\mathbf{C_{1}}_{i,k}, \mathbf{C_{2}}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - `L` : Loss function to account for the misfit between the similarity matrices
- - :math:`\mathbf{T}`: Matrix with marginal :math:`\mathbf{p}` and :math:`\mathbf{q}`
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
- Loss function used for the distance, the transport plan does not depend on the loss function
- T : csr or array-like, shape (ns, nt)
- Transport plan matrix, either a sparse csr or a dense matrix
- nb_samples_p : int, optional
- `nb_samples_p` is the number of samples (without replacement) along the first dimension of :math:`\mathbf{T}`
- nb_samples_q : int, optional
- `nb_samples_q` is the number of samples along the second dimension of :math:`\mathbf{T}`, for each sample along the first
- std : bool, optional
- If True, also return the standard deviation associated with the estimation of the Gromov-Wasserstein cost
- random_state : int or RandomState instance, optional
- Fix the seed for reproducibility
-
- Returns
- -------
- gw_dist_estimated : float
- Estimation of the Gromov-Wasserstein cost
- gw_dist_std : float
- Standard deviation of the estimation, returned only if `std` is True
-
- References
- ----------
- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
- "Sampled Gromov Wasserstein."
- Machine Learning Journal (MLJ). 2021.
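-
- Examples
- --------
- A minimal sketch estimating the cost of the independent coupling
- with an elementwise squared loss (illustrative values):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> C1 = ot.dist(rng.randn(10, 2))
- >>> C2 = ot.dist(rng.randn(8, 2))
- >>> p, q = ot.unif(10), ot.unif(8)
- >>> T = np.outer(p, q)
- >>> gw_est, gw_std = ot.gromov.GW_distance_estimation(C1, C2, p, q, lambda a, b: (a - b) ** 2, T, random_state=42)
- >>> bool(gw_est >= 0)
- True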
-
- """
- C1, C2, p, q = list_to_array(C1, C2, p, q)
- nx = get_backend(C1, C2, p, q)
-
- generator = check_random_state(random_state)
-
- len_p = p.shape[0]
- len_q = q.shape[0]
-
- # It is always better to sample from the biggest distribution first.
- if len_p < len_q:
- p, q = q, p
- len_p, len_q = len_q, len_p
- C1, C2 = C2, C1
- T = T.T
-
- if nb_samples_p is None:
- if nx.issparse(T):
- # If T is sparse, it probably means that PoGroW was used, thus the number of samples is reduced
- nb_samples_p = min(int(5 * (len_p * np.log(len_p)) ** 0.5), len_p)
- else:
- nb_samples_p = len_p
- else:
- # Sampling along the first dimension is done without replacement.
- nb_samples_p = min(nb_samples_p, len_p)
- if nb_samples_q is None:
- nb_samples_q = 1
- if std:
- nb_samples_q = max(2, nb_samples_q)
-
- index_k = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
- index_l = np.zeros((nb_samples_p, nb_samples_q), dtype=int)
-
- index_i = generator.choice(
- len_p, size=nb_samples_p, p=nx.to_numpy(p), replace=False
- )
- index_j = generator.choice(
- len_p, size=nb_samples_p, p=nx.to_numpy(p), replace=False
- )
-
- for i in range(nb_samples_p):
- if nx.issparse(T):
- T_indexi = nx.reshape(nx.todense(T[index_i[i], :]), (-1,))
- T_indexj = nx.reshape(nx.todense(T[index_j[i], :]), (-1,))
- else:
- T_indexi = T[index_i[i], :]
- T_indexj = T[index_j[i], :]
- # For each sampled row, a column is sampled.
- index_k[i] = generator.choice(
- len_q,
- size=nb_samples_q,
- p=nx.to_numpy(T_indexi / nx.sum(T_indexi)),
- replace=True
- )
- index_l[i] = generator.choice(
- len_q,
- size=nb_samples_q,
- p=nx.to_numpy(T_indexj / nx.sum(T_indexj)),
- replace=True
- )
-
- list_value_sample = nx.stack([
- loss_fun(
- C1[np.ix_(index_i, index_j)],
- C2[np.ix_(index_k[:, n], index_l[:, n])]
- ) for n in range(nb_samples_q)
- ], axis=2)
-
- if std:
- std_value = nx.sum(nx.std(list_value_sample, axis=2) ** 2) ** 0.5
- return nx.mean(list_value_sample), std_value / (nb_samples_p * nb_samples_p)
- else:
- return nx.mean(list_value_sample)
-
-
-def pointwise_gromov_wasserstein(C1, C2, p, q, loss_fun,
- alpha=1, max_iter=100, threshold_plan=0, log=False, verbose=False, random_state=None):
- r"""
- Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a stochastic Frank-Wolfe.
- This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times PN^2)` time complexity with `P` the number of Sinkhorn iterations.
-
- The function solves the following optimization problem:
-
- .. math::
- \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
- L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
- s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
-
- \mathbf{T}^T \mathbf{1} &= \mathbf{q}
-
- \mathbf{T} &\geq 0
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - :math:`\mathbf{p}`: distribution in the source space
- - :math:`\mathbf{q}`: distribution in the target space
- - `L`: loss function to account for the misfit between the similarity matrices
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
- Loss function used for the distance, the transport plan does not depend on the loss function
- alpha : float
- Step of the Frank-Wolfe algorithm, should be between 0 and 1
- max_iter : int, optional
- Max number of iterations
- threshold_plan : float, optional
- Threshold below which values of the transport plan are set to zero. If above zero, the resulting plan may violate the marginal constraints.
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- If True, also return a log with the estimated distance and its standard deviation
- random_state : int or RandomState instance, optional
- Fix the seed for reproducibility
-
- Returns
- -------
- T : array-like, shape (`ns`, `nt`)
- Optimal coupling between the two spaces
-
- References
- ----------
- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
- "Sampled Gromov Wasserstein."
- Machine Learning Journal (MLJ). 2021.
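-
- Examples
- --------
- A minimal sketch with an absolute-difference loss (illustrative
- values; with the numpy backend the returned plan is a sparse csr):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> C1 = ot.dist(rng.randn(6, 2))
- >>> C2 = ot.dist(rng.randn(6, 2))
- >>> p, q = ot.unif(6), ot.unif(6)
- >>> T = ot.gromov.pointwise_gromov_wasserstein(C1, C2, p, q, lambda a, b: np.abs(a - b), max_iter=50, random_state=42)
- >>> T.shape
- (6, 6)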
-
- """
- C1, C2, p, q = list_to_array(C1, C2, p, q)
- nx = get_backend(C1, C2, p, q)
-
- len_p = p.shape[0]
- len_q = q.shape[0]
-
- generator = check_random_state(random_state)
-
- index = np.zeros(2, dtype=int)
-
- # Initialize with default marginal
- index[0] = generator.choice(len_p, size=1, p=nx.to_numpy(p))
- index[1] = generator.choice(len_q, size=1, p=nx.to_numpy(q))
- T = nx.tocsr(emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False))
-
- best_gw_dist_estimated = np.inf
- for cpt in range(max_iter):
- index[0] = generator.choice(len_p, size=1, p=nx.to_numpy(p))
- T_index0 = nx.reshape(nx.todense(T[index[0], :]), (-1,))
- index[1] = generator.choice(
- len_q, size=1, p=nx.to_numpy(T_index0 / nx.sum(T_index0))
- )
-
- if alpha == 1:
- T = nx.tocsr(
- emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False)
- )
- else:
- new_T = nx.tocsr(
- emd_1d(C1[index[0]], C2[index[1]], a=p, b=q, dense=False)
- )
- T = (1 - alpha) * T + alpha * new_T
- # To limit the number of nonzero entries, values below the threshold are set to 0.
- T = nx.eliminate_zeros(T, threshold=threshold_plan)
-
- if cpt % 10 == 0 or cpt == (max_iter - 1):
- gw_dist_estimated = GW_distance_estimation(
- C1=C1, C2=C2, loss_fun=loss_fun,
- p=p, q=q, T=T, std=False, random_state=generator
- )
-
- if gw_dist_estimated < best_gw_dist_estimated:
- best_gw_dist_estimated = gw_dist_estimated
- best_T = nx.copy(T)
-
- if verbose:
- if cpt % 200 == 0:
- print('{:5s}|{:12s}'.format('It.', 'Best gw estimated') + '\n' + '-' * 19)
- print('{:5d}|{:8e}|'.format(cpt, best_gw_dist_estimated))
-
- if log:
- log = {}
- log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(
- C1=C1, C2=C2, loss_fun=loss_fun,
- p=p, q=q, T=best_T, random_state=generator
- )
- return best_T, log
- return best_T
-
-
-def sampled_gromov_wasserstein(C1, C2, p, q, loss_fun,
- nb_samples_grad=100, epsilon=1, max_iter=500, log=False, verbose=False,
- random_state=None):
- r"""
- Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})` using a 1-stochastic Frank-Wolfe.
- This method has a :math:`\mathcal{O}(\mathrm{max\_iter} \times N \log(N))` time complexity by relying on the 1D Optimal Transport solver.
-
- The function solves the following optimization problem:
-
- .. math::
- \mathbf{GW} = \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
- L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
- s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
-
- \mathbf{T}^T \mathbf{1} &= \mathbf{q}
-
- \mathbf{T} &\geq 0
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - :math:`\mathbf{p}`: distribution in the source space
- - :math:`\mathbf{q}`: distribution in the target space
- - `L`: loss function to account for the misfit between the similarity matrices
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : function: :math:`\mathbb{R} \times \mathbb{R} \mapsto \mathbb{R}`
- Loss function used for the distance, the transport plan does not depend on the loss function
- nb_samples_grad : int
- Number of samples to approximate the gradient
- epsilon : float
- Weight of the Kullback-Leibler regularization
- max_iter : int, optional
- Max number of iterations
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- Gives the distance estimated and the standard deviation
- random_state : int or RandomState instance, optional
- Fix the seed for reproducibility
-
- Returns
- -------
- T : array-like, shape (`ns`, `nt`)
- Optimal coupling between the two spaces
-
- References
- ----------
- .. [14] Kerdoncuff, Tanguy, Emonet, Rémi, Sebban, Marc
- "Sampled Gromov Wasserstein."
- Machine Learning Journal (MLJ). 2021.
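-
- Examples
- --------
- A minimal sketch (illustrative values; epsilon=0 selects the exact
- emd solver instead of sinkhorn for each gradient step):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> C1 = ot.dist(rng.randn(6, 2))
- >>> C2 = ot.dist(rng.randn(6, 2))
- >>> p, q = ot.unif(6), ot.unif(6)
- >>> T = ot.gromov.sampled_gromov_wasserstein(C1, C2, p, q, lambda a, b: np.abs(a - b), nb_samples_grad=2, epsilon=0, max_iter=50, random_state=42)
- >>> T.shape
- (6, 6)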
-
- """
- C1, C2, p, q = list_to_array(C1, C2, p, q)
- nx = get_backend(C1, C2, p, q)
-
- len_p = p.shape[0]
- len_q = q.shape[0]
-
- generator = check_random_state(random_state)
-
- # The most natural way to define nb_samples_grad is with a simple integer.
- if isinstance(nb_samples_grad, int):
- if nb_samples_grad > len_p:
- # As the sampling along the first dimension is done without replacement, the remainder is carried
- # over to the second dimension.
- nb_samples_grad_p, nb_samples_grad_q = len_p, nb_samples_grad // len_p
- else:
- nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad, 1
- else:
- nb_samples_grad_p, nb_samples_grad_q = nb_samples_grad
- T = nx.outer(p, q)
- # continue_loop allows stopping the loop after several successive small modifications of T.
- continue_loop = 0
-
- # The gradient of GW is more complex if the two matrices are not symmetric.
- C_are_symmetric = nx.allclose(C1, C1.T, rtol=1e-10, atol=1e-10) and nx.allclose(C2, C2.T, rtol=1e-10, atol=1e-10)
-
- for cpt in range(max_iter):
- index0 = generator.choice(
- len_p, size=nb_samples_grad_p, p=nx.to_numpy(p), replace=False
- )
- Lik = 0
- for i, index0_i in enumerate(index0):
- index1 = generator.choice(
- len_q, size=nb_samples_grad_q,
- p=nx.to_numpy(T[index0_i, :] / nx.sum(T[index0_i, :])),
- replace=False
- )
- # If the matrices C are not symmetric, the gradient has 2 terms, thus one term is chosen at random.
- if (not C_are_symmetric) and generator.rand(1) > 0.5:
- Lik += nx.mean(loss_fun(
- C1[:, [index0[i]] * nb_samples_grad_q][:, None, :],
- C2[:, index1][None, :, :]
- ), axis=2)
- else:
- Lik += nx.mean(loss_fun(
- C1[[index0[i]] * nb_samples_grad_q, :][:, :, None],
- C2[index1, :][:, None, :]
- ), axis=0)
-
- max_Lik = nx.max(Lik)
- if max_Lik == 0:
- continue
- # This division by the max is here to facilitate the choice of epsilon.
- Lik /= max_Lik
-
- if epsilon > 0:
- # Clip T and set to -inf the log of entries below exp(-200) to avoid log of 0.
- log_T = nx.log(nx.clip(T, np.exp(-200), 1))
- log_T = nx.where(log_T == -200, -np.inf, log_T)
- Lik = Lik - epsilon * log_T
-
- try:
- new_T = sinkhorn(a=p, b=q, M=Lik, reg=epsilon)
- except (RuntimeWarning, UserWarning):
- print("Warning caught in Sinkhorn: returning last stable T")
- break
- else:
- new_T = emd(a=p, b=q, M=Lik)
-
- change_T = nx.mean((T - new_T) ** 2)
- if change_T <= 10e-20:
- continue_loop += 1
- if continue_loop > 100: # Max number of small modifications of T
- T = nx.copy(new_T)
- break
- else:
- continue_loop = 0
-
- if verbose and cpt % 10 == 0:
- if cpt % 200 == 0:
- print('{:5s}|{:12s}'.format('It.', '||T_n - T_{n+1}||') + '\n' + '-' * 19)
- print('{:5d}|{:8e}|'.format(cpt, change_T))
- T = nx.copy(new_T)
-
- if log:
- log = {}
- log["gw_dist_estimated"], log["gw_dist_std"] = GW_distance_estimation(
- C1=C1, C2=C2, loss_fun=loss_fun,
- p=p, q=q, T=T, random_state=generator
- )
- return T, log
- return T
-
-
-def entropic_gromov_wasserstein(C1, C2, p, q, loss_fun, epsilon,
- max_iter=1000, tol=1e-9, verbose=False, log=False):
- r"""
- Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
-
- The function solves the following optimization problem:
-
- .. math::
- \mathbf{GW} = \mathop{\arg\min}_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T}))
-
- s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
-
- \mathbf{T}^T \mathbf{1} &= \mathbf{q}
-
- \mathbf{T} &\geq 0
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - :math:`\mathbf{p}`: distribution in the source space
- - :math:`\mathbf{q}`: distribution in the target space
- - `L`: loss function to account for the misfit between the similarity matrices
- - `H`: entropy
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : str
- Loss function used for the solver, either 'square_loss' or 'kl_loss'
- epsilon : float
- Regularization term >0
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- Record log if True.
-
- Returns
- -------
- T : array-like, shape (`ns`, `nt`)
- Optimal coupling between the two spaces
-
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
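-
- Examples
- --------
- A minimal sketch (illustrative values; the cost matrices are
- normalized so that a moderate epsilon keeps sinkhorn stable):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> C1 = ot.dist(rng.randn(5, 2))
- >>> C2 = ot.dist(rng.randn(5, 2))
- >>> C1 /= C1.max()
- >>> C2 /= C2.max()
- >>> p, q = ot.unif(5), ot.unif(5)
- >>> T = ot.gromov.entropic_gromov_wasserstein(C1, C2, p, q, 'square_loss', epsilon=5e-2)
- >>> T.shape
- (5, 5)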
-
- """
- C1, C2, p, q = list_to_array(C1, C2, p, q)
- nx = get_backend(C1, C2, p, q)
-
- T = nx.outer(p, q)
-
- constC, hC1, hC2 = init_matrix(C1, C2, p, q, loss_fun)
-
- cpt = 0
- err = 1
-
- if log:
- log = {'err': []}
-
- while (err > tol and cpt < max_iter):
-
- Tprev = T
-
- # compute the gradient
- tens = gwggrad(constC, hC1, hC2, T)
-
- T = sinkhorn(p, q, tens, epsilon, method='sinkhorn')
-
- if cpt % 10 == 0:
- # we can speed up the process by checking the error only every
- # 10th iteration
- err = nx.norm(T - Tprev)
-
- if log:
- log['err'].append(err)
-
- if verbose:
- if cpt % 200 == 0:
- print('{:5s}|{:12s}'.format(
- 'It.', 'Err') + '\n' + '-' * 19)
- print('{:5d}|{:8e}|'.format(cpt, err))
-
- cpt += 1
-
- if log:
- log['gw_dist'] = gwloss(constC, hC1, hC2, T)
- return T, log
- else:
- return T
-
-
-def entropic_gromov_wasserstein2(C1, C2, p, q, loss_fun, epsilon,
- max_iter=1000, tol=1e-9, verbose=False, log=False):
- r"""
- Returns the entropic Gromov-Wasserstein discrepancy between the two measured similarity matrices :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
-
- The function solves the following optimization problem:
-
- .. math::
- GW = \min_\mathbf{T} \quad \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l})
- \mathbf{T}_{i,j} \mathbf{T}_{k,l} - \epsilon(H(\mathbf{T}))
-
- Where :
-
- - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
- - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
- - :math:`\mathbf{p}`: distribution in the source space
- - :math:`\mathbf{q}`: distribution in the target space
- - `L`: loss function to account for the misfit between the similarity matrices
- - `H`: entropy
-
- Parameters
- ----------
- C1 : array-like, shape (ns, ns)
- Metric cost matrix in the source space
- C2 : array-like, shape (nt, nt)
- Metric cost matrix in the target space
- p : array-like, shape (ns,)
- Distribution in the source space
- q : array-like, shape (nt,)
- Distribution in the target space
- loss_fun : str
- Loss function used for the solver, either 'square_loss' or 'kl_loss'
- epsilon : float
- Regularization term >0
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations
- log : bool, optional
- Record log if True.
-
- Returns
- -------
- gw_dist : float
- Gromov-Wasserstein distance
-
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
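-
- Examples
- --------
- A minimal sketch (illustrative values), reusing the normalized
- setup of entropic_gromov_wasserstein:
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> C1 = ot.dist(rng.randn(5, 2))
- >>> C2 = ot.dist(rng.randn(5, 2))
- >>> C1 /= C1.max()
- >>> C2 /= C2.max()
- >>> p, q = ot.unif(5), ot.unif(5)
- >>> gw_dist = ot.gromov.entropic_gromov_wasserstein2(C1, C2, p, q, 'square_loss', epsilon=5e-2)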
-
- """
- gw, logv = entropic_gromov_wasserstein(
- C1, C2, p, q, loss_fun, epsilon, max_iter, tol, verbose, log=True)
-
- logv['T'] = gw
-
- if log:
- return logv['gw_dist'], logv
- else:
- return logv['gw_dist']
-
-
-def entropic_gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun, epsilon,
- max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None):
- r"""
- Returns the Gromov-Wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
-
- The function solves the following optimization problem:
-
- .. math::
-
- \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
-
- Where :
-
- - :math:`\mathbf{C}_s`: metric cost matrix
- - :math:`\mathbf{p}_s`: distribution
-
- Parameters
- ----------
- N : int
- Size of the targeted barycenter
- Cs : list of S array-like of shape (ns,ns)
- Metric cost matrices
- ps : list of S array-like of shape (ns,)
- Sample weights in the `S` spaces
- p : array-like, shape(N,)
- Weights in the targeted barycenter
- lambdas : list of float
- List of the `S` spaces' weights.
- loss_fun : str
- Loss function used for the solver, either 'square_loss' or 'kl_loss'. It selects the update
- of :math:`\mathbf{C}` with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
- epsilon : float
- Regularization term >0
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0)
- verbose : bool, optional
- Print information along iterations.
- log : bool, optional
- Record log if True.
- init_C : array-like, shape (N, N), optional
- Initial value for the :math:`\mathbf{C}` matrix provided by the user. If None, a random initialization is used.
- random_state : int or RandomState instance, optional
- Fix the seed for reproducibility
-
- Returns
- -------
- C : array-like, shape (`N`, `N`)
- Similarity matrix in the barycenter space (permuted arbitrarily)
- log : dict
- Log dictionary of error during iterations. Return only if `log=True` in parameters.
-
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
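-
- Examples
- --------
- A minimal sketch of a 3-point entropic barycenter of two random
- structures (illustrative values, normalized for sinkhorn stability):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> C1 = ot.dist(rng.randn(4, 2))
- >>> C2 = ot.dist(rng.randn(5, 2))
- >>> C1 /= C1.max()
- >>> C2 /= C2.max()
- >>> ps = [ot.unif(4), ot.unif(5)]
- >>> C = ot.gromov.entropic_gromov_barycenters(3, [C1, C2], ps, ot.unif(3), [0.5, 0.5], 'square_loss', epsilon=5e-2, max_iter=20, tol=1e-3, random_state=42)
- >>> C.shape
- (3, 3)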
- """
- Cs = list_to_array(*Cs)
- ps = list_to_array(*ps)
- p = list_to_array(p)
- nx = get_backend(*Cs, *ps, p)
-
- S = len(Cs)
-
- # Initialization of C : random Euclidean distance matrix (if not provided by user)
- if init_C is None:
- generator = check_random_state(random_state)
- xalea = generator.randn(N, 2)
- C = dist(xalea, xalea)
- C /= C.max()
- C = nx.from_numpy(C, type_as=p)
- else:
- C = init_C
-
- cpt = 0
- err = 1
-
- error = []
-
- while (err > tol) and (cpt < max_iter):
- Cprev = C
-
- T = [entropic_gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun, epsilon,
- max_iter, 1e-4, verbose, log=False) for s in range(S)]
- if loss_fun == 'square_loss':
- C = update_square_loss(p, lambdas, T, Cs)
-
- elif loss_fun == 'kl_loss':
- C = update_kl_loss(p, lambdas, T, Cs)
-
- if cpt % 10 == 0:
- # we can speed up the process by checking the error only every
- # 10th iteration
- err = nx.norm(C - Cprev)
- error.append(err)
-
- if verbose:
- if cpt % 200 == 0:
- print('{:5s}|{:12s}'.format(
- 'It.', 'Err') + '\n' + '-' * 19)
- print('{:5d}|{:8e}|'.format(cpt, err))
-
- cpt += 1
-
- if log:
- return C, {"err": error}
- else:
- return C
-
-
-def gromov_barycenters(N, Cs, ps, p, lambdas, loss_fun,
- max_iter=1000, tol=1e-9, verbose=False, log=False, init_C=None, random_state=None):
- r"""
- Returns the Gromov-Wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
-
- The function solves the following optimization problem with block coordinate descent:
-
- .. math::
-
- \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
-
- Where :
-
- - :math:`\mathbf{C}_s`: metric cost matrix
- - :math:`\mathbf{p}_s`: distribution
-
- Parameters
- ----------
- N : int
- Size of the targeted barycenter
- Cs : list of S array-like of shape (ns, ns)
- Metric cost matrices
- ps : list of S array-like of shape (ns,)
- Sample weights in the `S` spaces
- p : array-like, shape (N,)
- Weights in the targeted barycenter
- lambdas : list of float
- List of the `S` spaces' weights
- loss_fun : str
- Loss function used for the solver, either 'square_loss' or 'kl_loss'. It selects the update
- of :math:`\mathbf{C}` with the `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0).
- verbose : bool, optional
- Print information along iterations.
- log : bool, optional
- Record log if True.
- init_C : array-like, shape (N, N), optional
- Initial value for the :math:`\mathbf{C}` matrix provided by the user. If None, a random initialization is used.
- random_state : int or RandomState instance, optional
- Fix the seed for reproducibility
-
- Returns
- -------
- C : array-like, shape (`N`, `N`)
- Similarity matrix in the barycenter space (permuted arbitrarily)
- log : dict
- Log dictionary of error during iterations. Return only if `log=True` in parameters.
-
- References
- ----------
- .. [12] Gabriel Peyré, Marco Cuturi, and Justin Solomon,
- "Gromov-Wasserstein averaging of kernel and distance matrices."
- International Conference on Machine Learning (ICML). 2016.
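-
- Examples
- --------
- A minimal sketch computing a 3-point barycenter of two random
- structures (illustrative values):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> C1 = ot.dist(rng.randn(4, 2))
- >>> C2 = ot.dist(rng.randn(5, 2))
- >>> ps = [ot.unif(4), ot.unif(5)]
- >>> C = ot.gromov.gromov_barycenters(3, [C1, C2], ps, ot.unif(3), [0.5, 0.5], 'square_loss', max_iter=20, tol=1e-3, random_state=42)
- >>> C.shape
- (3, 3)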
-
- """
- Cs = list_to_array(*Cs)
- ps = list_to_array(*ps)
- p = list_to_array(p)
- nx = get_backend(*Cs, *ps, p)
-
- S = len(Cs)
-
- # Initialization of C : random Euclidean distance matrix (if not provided by user)
- if init_C is None:
- generator = check_random_state(random_state)
- xalea = generator.randn(N, 2)
- C = dist(xalea, xalea)
- C /= C.max()
- C = nx.from_numpy(C, type_as=p)
- else:
- C = init_C
-
- cpt = 0
- err = 1
-
- error = []
-
- while(err > tol and cpt < max_iter):
- Cprev = C
-
- T = [gromov_wasserstein(Cs[s], C, ps[s], p, loss_fun,
- numItermax=max_iter, stopThr=1e-5, verbose=verbose, log=False) for s in range(S)]
- if loss_fun == 'square_loss':
- C = update_square_loss(p, lambdas, T, Cs)
-
- elif loss_fun == 'kl_loss':
- C = update_kl_loss(p, lambdas, T, Cs)
-
- if cpt % 10 == 0:
- # we can speed up the process by checking the error only every
- # 10th iteration
- err = nx.norm(C - Cprev)
- error.append(err)
-
- if verbose:
- if cpt % 200 == 0:
- print('{:5s}|{:12s}'.format(
- 'It.', 'Err') + '\n' + '-' * 19)
- print('{:5d}|{:8e}|'.format(cpt, err))
-
- cpt += 1
-
- if log:
- return C, {"err": error}
- else:
- return C
-
-
-def fgw_barycenters(N, Ys, Cs, ps, lambdas, alpha, fixed_structure=False, fixed_features=False,
- p=None, loss_fun='square_loss', max_iter=100, tol=1e-9,
- verbose=False, log=False, init_C=None, init_X=None, random_state=None):
- r"""Compute the fgw barycenter as presented eq (5) in :ref:`[24] <references-fgw-barycenters>`
-
- Parameters
- ----------
- N : int
- Desired number of samples of the target barycenter
- Ys: list of array-like, each element has shape (ns,d)
- Features of all samples
- Cs : list of array-like, each element has shape (ns,ns)
- Structure matrices of all samples
- ps : list of array-like, each element has shape (ns,)
- Masses of all samples.
- lambdas : list of float
- List of the `S` spaces' weights
- alpha : float
- Alpha parameter for the fgw distance
- fixed_structure : bool
- Whether to fix the structure of the barycenter during the updates
- fixed_features : bool
- Whether to fix the feature of the barycenter during the updates
- p : array-like, shape (N,), optional
- Weights in the targeted barycenter. If None, uniform weights are used.
- loss_fun : str
- Loss function used for the solver either 'square_loss' or 'kl_loss'
- max_iter : int, optional
- Max number of iterations
- tol : float, optional
- Stop threshold on error (>0).
- verbose : bool, optional
- Print information along iterations.
- log : bool, optional
- Record log if True.
- init_C : array-like, shape (N,N), optional
- Initialization for the barycenters' structure matrix. If not set
- a random init is used.
- init_X : array-like, shape (N,d), optional
- Initialization for the barycenters' features. If not set a
- random init is used.
- random_state : int or RandomState instance, optional
- Fix the seed for reproducibility
-
- Returns
- -------
- X : array-like, shape (`N`, `d`)
- Barycenters' features
- C : array-like, shape (`N`, `N`)
- Barycenters' structure matrix
- log : dict
- Only returned when log=True. It contains the keys:
-
- - :math:`\mathbf{T}`: list of (`N`, `ns`) transport matrices
- - :math:`(\mathbf{M}_s)_s`: all distance matrices between the feature of the barycenter and the other features :math:`(dist(\mathbf{X}, \mathbf{Y}_s))_s` shape (`N`, `ns`)
-
-
- .. _references-fgw-barycenters:
- References
- ----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain
- and Courty Nicolas
- "Optimal Transport for structured data with application on graphs"
- International Conference on Machine Learning (ICML). 2019.
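-
- Examples
- --------
- A minimal sketch of a 3-sample FGW barycenter with 2-dimensional
- features (illustrative values):
-
- >>> import numpy as np
- >>> import ot
- >>> rng = np.random.RandomState(42)
- >>> ys, yt = rng.randn(4, 2), rng.randn(5, 2)
- >>> C1, C2 = ot.dist(ys, ys), ot.dist(yt, yt)
- >>> ps = [ot.unif(4), ot.unif(5)]
- >>> X, C = ot.gromov.fgw_barycenters(3, [ys, yt], [C1, C2], ps, [0.5, 0.5], alpha=0.5, max_iter=20, tol=1e-3, random_state=42)
- >>> X.shape, C.shape
- ((3, 2), (3, 3))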
- """
- Cs = list_to_array(*Cs)
- ps = list_to_array(*ps)
- Ys = list_to_array(*Ys)
- p = list_to_array(p)
- nx = get_backend(*Cs, *Ys, *ps)
-
- S = len(Cs)
- d = Ys[0].shape[1] # dimension of the node features
- if p is None:
- p = nx.ones(N, type_as=Cs[0]) / N
-
- if fixed_structure:
- if init_C is None:
- raise UndefinedParameter('If C is fixed it must be initialized')
- else:
- C = init_C
- else:
- if init_C is None:
- generator = check_random_state(random_state)
- xalea = generator.randn(N, 2)
- C = dist(xalea, xalea)
- C = nx.from_numpy(C, type_as=ps[0])
- else:
- C = init_C
-
- if fixed_features:
- if init_X is None:
- raise UndefinedParameter('If X is fixed it must be initialized')
- else:
- X = init_X
- else:
- if init_X is None:
- X = nx.zeros((N, d), type_as=ps[0])
- else:
- X = init_X
-
- T = [nx.outer(p, q) for q in ps]
-
- Ms = [dist(X, Ys[s]) for s in range(len(Ys))]
-
- cpt = 0
- err_feature = 1
- err_structure = 1
-
- if log:
- log_ = {}
- log_['err_feature'] = []
- log_['err_structure'] = []
- log_['Ts_iter'] = []
-
- while((err_feature > tol or err_structure > tol) and cpt < max_iter):
- Cprev = C
- Xprev = X
-
- if not fixed_features:
- Ys_temp = [y.T for y in Ys]
- X = update_feature_matrix(lambdas, Ys_temp, T, p).T
-
- Ms = [dist(X, Ys[s]) for s in range(len(Ys))]
-
- if not fixed_structure:
- if loss_fun == 'square_loss':
- T_temp = [t.T for t in T]
- C = update_structure_matrix(p, lambdas, T_temp, Cs)
-
- T = [fused_gromov_wasserstein(Ms[s], C, Cs[s], p, ps[s], loss_fun, alpha,
- numItermax=max_iter, stopThr=1e-5, verbose=verbose) for s in range(S)]
-
- # T is N,ns
- err_feature = nx.norm(X - nx.reshape(Xprev, (N, d)))
- err_structure = nx.norm(C - Cprev)
- if log:
- log_['err_feature'].append(err_feature)
- log_['err_structure'].append(err_structure)
- log_['Ts_iter'].append(T)
-
- if verbose:
- if cpt % 200 == 0:
- print('{:5s}|{:12s}'.format(
- 'It.', 'Err') + '\n' + '-' * 19)
- print('{:5d}|{:8e}|'.format(cpt, err_structure))
- print('{:5d}|{:8e}|'.format(cpt, err_feature))
-
- cpt += 1
-
- if log:
- log_['T'] = T # from target to Ys
- log_['p'] = p
- log_['Ms'] = Ms
-
- if log:
- return X, C, log_
- else:
- return X, C
-
-
-def update_structure_matrix(p, lambdas, T, Cs):
- r"""Updates :math:`\mathbf{C}` according to the L2 Loss kernel with the `S` :math:`\mathbf{T}_s` couplings.
-
- It is calculated at each iteration
-
- Parameters
- ----------
- p : array-like, shape (N,)
- Masses in the targeted barycenter.
- lambdas : list of float
- List of the `S` spaces' weights.
- T : list of S array-like of shape (ns, N)
- The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration.
- Cs : list of S array-like, shape (ns, ns)
- Metric cost matrices.
-
- Returns
- -------
- C : array-like, shape (`N`, `N`)
- Updated :math:`\mathbf{C}` matrix.
- """
- p = list_to_array(p)
- T = list_to_array(*T)
- Cs = list_to_array(*Cs)
- nx = get_backend(*Cs, *T, p)
-
- tmpsum = sum([
- lambdas[s] * nx.dot(
- nx.dot(T[s].T, Cs[s]),
- T[s]
- ) for s in range(len(T))
- ])
- ppt = nx.outer(p, p)
- return tmpsum / ppt
-
-
-def update_feature_matrix(lambdas, Ys, Ts, p):
- r"""Updates the feature with respect to the `S` :math:`\mathbf{T}_s` couplings.
-
-
- See "Solving the barycenter problem with Block Coordinate Descent (BCD)"
- in :ref:`[24] <references-update-feature-matrix>` calculated at each iteration
-
- Parameters
- ----------
- p : array-like, shape (N,)
- masses in the targeted barycenter
- lambdas : list of float
- List of the `S` spaces' weights
- Ts : list of S array-like, shape (N, ns)
- The `S` :math:`\mathbf{T}_s` couplings calculated at each iteration
- Ys : list of S array-like, shape (d,ns)
- The features.
-
- Returns
- -------
- X : array-like, shape (`d`, `N`)
- Updated feature matrix
-
-
- .. _references-update-feature-matrix:
- References
- ----------
- .. [24] Vayer Titouan, Chapel Laetitia, Flamary Rémi, Tavenard Romain and Courty Nicolas
- "Optimal Transport for structured data with application on graphs"
- International Conference on Machine Learning (ICML). 2019.
- """
- p = list_to_array(p)
- Ts = list_to_array(*Ts)
- Ys = list_to_array(*Ys)
- nx = get_backend(*Ys, *Ts, p)
-
- p = 1. / p
- tmpsum = sum([
- lambdas[s] * nx.dot(Ys[s], Ts[s].T) * p[None, :]
- for s in range(len(Ts))
- ])
- return tmpsum
-
-
-def gromov_wasserstein_dictionary_learning(Cs, D, nt, reg=0., ps=None, q=None, epochs=20, batch_size=32, learning_rate=1., Cdict_init=None, projection='nonnegative_symmetric', use_log=True,
- tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, use_adam_optimizer=True, verbose=False, **kwargs):
- r"""
- Infer Gromov-Wasserstein linear dictionary :math:`\{ (\mathbf{C_{dict}[d]}, q) \}_{d \in [D]}` from the list of structures :math:`\{ (\mathbf{C_s},\mathbf{p_s}) \}_s`
-
- .. math::
- \min_{\mathbf{C_{dict}}, \{\mathbf{w_s} \}_{s \leq S}} \sum_{s=1}^S GW_2(\mathbf{C_s}, \sum_{d=1}^D w_{s,d}\mathbf{C_{dict}[d]}, \mathbf{p_s}, \mathbf{q}) - reg\| \mathbf{w_s} \|_2^2
-
- such that, :math:`\forall s \leq S` :
-
- - :math:`\mathbf{w_s}^\top \mathbf{1}_D = 1`
- - :math:`\mathbf{w_s} \geq \mathbf{0}_D`
-
- Where :
-
- - :math:`\forall s \leq S, \mathbf{C_s}` is a (ns,ns) pairwise similarity matrix of variable size ns.
- - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of fixed size nt.
- - :math:`\forall s \leq S, \mathbf{p_s}` is the source distribution corresponding to :math:`\mathbf{C_s}`
- - :math:`\mathbf{q}` is the target distribution assigned to every structure in the embedding space.
- - reg is the regularization coefficient.
-
- The stochastic algorithm used for estimating the graph dictionary atoms is the one proposed in [38].
-
- Parameters
- ----------
- Cs : list of S symmetric array-like, shape (ns, ns)
- List of Metric/Graph cost matrices of variable size (ns, ns).
- D: int
- Number of dictionary atoms to learn
- nt: int
- Number of samples within each dictionary atoms
- reg : float, optional
- Coefficient of the negative quadratic regularization used to promote sparsity of w. The default is 0.
- ps : list of S array-like, shape (ns,), optional
- Distribution in each source space C of Cs. Default is None and corresponds to uniform distributions.
- q : array-like, shape (nt,), optional
- Distribution in the embedding space whose structure will be learned. Default is None and corresponds to a uniform distribution.
- epochs: int, optional
- Number of epochs used to learn the dictionary. Default is 20.
- batch_size: int, optional
- Batch size for each stochastic gradient update of the dictionary. Set to the dataset size if the provided batch_size is higher than the dataset size. Default is 32.
- learning_rate: float, optional
- Learning rate used for the stochastic gradient descent. Default is 1.
- Cdict_init: list of D array-like with shape (nt, nt), optional
- Used to initialize the dictionary.
- If set to None (Default), the dictionary will be initialized randomly.
- Else Cdict must have shape (D, nt, nt), i.e. match the provided shape parameters.
- projection: str, optional
- If 'nonnegative' and/or 'symmetric' is in projection, the corresponding projection will be performed at each stochastic update of the dictionary.
- Else the set of atoms is :math:`\mathbb{R}^{nt \times nt}`. Default is 'nonnegative_symmetric'.
- use_log: bool, optional
- If set to True, losses evolution by batches and epochs are tracked. Default is True.
- use_adam_optimizer: bool, optional
- If set to True, the Adam optimizer with default settings is used as the adaptive learning rate strategy.
- Else perform SGD with a fixed learning rate. Default is True.
- tol_outer : float, optional
- Solver precision for the BCD algorithm, measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`.
- tol_inner : float, optional
- Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport, measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`.
- max_iter_outer : int, optional
- Maximum number of iterations for the BCD. Default is 20.
- max_iter_inner : int, optional
- Maximum number of iterations for the Conjugate Gradient. Default is 200.
- verbose : bool, optional
- Print the reconstruction loss every epoch. Default is False.
-
- Returns
- -------
-
- Cdict_best_state : D array-like, shape (D,nt,nt)
- Metric/Graph cost matrices composing the dictionary.
- The dictionary leading to the best loss over an epoch is saved and returned.
- log: dict
- If use_log is True, contains loss evolutions by batches and epochs.
- References
- ----------
- .. [38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty.
- "Online Graph Dictionary Learning"
- International Conference on Machine Learning (ICML). 2021.
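-
- Examples
- --------
- A minimal usage sketch on random toy graphs; sizes and hyperparameters are
- illustrative assumptions, and the learned values depend on the random
- initialization:
-
- >>> import numpy as np
- >>> from ot.gromov import gromov_wasserstein_dictionary_learning
- >>> Cs = [np.random.rand(n, n) for n in [4, 5, 6]]
- >>> Cs = [0.5 * (C + C.T) for C in Cs]  # symmetrize the toy structures
- >>> Cdict, log = gromov_wasserstein_dictionary_learning(Cs, D=2, nt=3, epochs=1, batch_size=2)
- >>> Cdict.shape
- (2, 3, 3)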
- """
- # Handle backend of non-optional arguments
- Cs0 = Cs
- nx = get_backend(*Cs0)
- Cs = [nx.to_numpy(C) for C in Cs0]
- dataset_size = len(Cs)
- # Handle backend of optional arguments
- if ps is None:
- ps = [unif(C.shape[0]) for C in Cs]
- else:
- ps = [nx.to_numpy(p) for p in ps]
- if q is None:
- q = unif(nt)
- else:
- q = nx.to_numpy(q)
- if Cdict_init is None:
- # Initialize randomly structures of dictionary atoms based on samples
- dataset_means = [C.mean() for C in Cs]
- Cdict = np.random.normal(loc=np.mean(dataset_means), scale=np.std(dataset_means), size=(D, nt, nt))
- else:
- Cdict = nx.to_numpy(Cdict_init).copy()
- assert Cdict.shape == (D, nt, nt)
-
- if 'symmetric' in projection:
- Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1)))
- if 'nonnegative' in projection:
- Cdict[Cdict < 0.] = 0
- if use_adam_optimizer:
- adam_moments = _initialize_adam_optimizer(Cdict)
-
- log = {'loss_batches': [], 'loss_epochs': []}
- const_q = q[:, None] * q[None, :]
- Cdict_best_state = Cdict.copy()
- loss_best_state = np.inf
- if batch_size > dataset_size:
- batch_size = dataset_size
- iter_by_epoch = dataset_size // batch_size + int((dataset_size % batch_size) > 0)
-
- for epoch in range(epochs):
- cumulated_loss_over_epoch = 0.
-
- for _ in range(iter_by_epoch):
- # batch sampling
- batch = np.random.choice(range(dataset_size), size=batch_size, replace=False)
- cumulated_loss_over_batch = 0.
- unmixings = np.zeros((batch_size, D))
- Cs_embedded = np.zeros((batch_size, nt, nt))
- Ts = [None] * batch_size
-
- for batch_idx, C_idx in enumerate(batch):
- # BCD solver for Gromov-Wasserstein linear unmixing used independently on each structure of the sampled batch
- unmixings[batch_idx], Cs_embedded[batch_idx], Ts[batch_idx], current_loss = gromov_wasserstein_linear_unmixing(
- Cs[C_idx], Cdict, reg=reg, p=ps[C_idx], q=q, tol_outer=tol_outer, tol_inner=tol_inner,
- max_iter_outer=max_iter_outer, max_iter_inner=max_iter_inner
- )
- cumulated_loss_over_batch += current_loss
- cumulated_loss_over_epoch += cumulated_loss_over_batch
-
- if use_log:
- log['loss_batches'].append(cumulated_loss_over_batch)
-
- # Stochastic projected gradient step over dictionary atoms
- grad_Cdict = np.zeros_like(Cdict)
- for batch_idx, C_idx in enumerate(batch):
- shared_term_structures = Cs_embedded[batch_idx] * const_q - (Cs[C_idx].dot(Ts[batch_idx])).T.dot(Ts[batch_idx])
- grad_Cdict += unmixings[batch_idx][:, None, None] * shared_term_structures[None, :, :]
- grad_Cdict *= 2 / batch_size
- if use_adam_optimizer:
- Cdict, adam_moments = _adam_stochastic_updates(Cdict, grad_Cdict, learning_rate, adam_moments)
- else:
- Cdict -= learning_rate * grad_Cdict
- if 'symmetric' in projection:
- Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1)))
- if 'nonnegative' in projection:
- Cdict[Cdict < 0.] = 0.
-
- if use_log:
- log['loss_epochs'].append(cumulated_loss_over_epoch)
- if loss_best_state > cumulated_loss_over_epoch:
- loss_best_state = cumulated_loss_over_epoch
- Cdict_best_state = Cdict.copy()
- if verbose:
- print('--- epoch =', epoch, ' cumulated reconstruction error: ', cumulated_loss_over_epoch)
-
- return nx.from_numpy(Cdict_best_state), log
-
-
-def _initialize_adam_optimizer(variable):
-
- # Initialization for our numpy implementation of adam optimizer
- atoms_adam_m = np.zeros_like(variable) # Initialize first moment tensor
- atoms_adam_v = np.zeros_like(variable) # Initialize second moment tensor
- atoms_adam_count = 1
-
- return {'mean': atoms_adam_m, 'var': atoms_adam_v, 'count': atoms_adam_count}
-
-
-def _adam_stochastic_updates(variable, grad, learning_rate, adam_moments, beta_1=0.9, beta_2=0.99, eps=1e-09):
-
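- # One Adam step with bias-corrected first and second moment estimates, following
- # Kingma & Ba (2015). Note the non-standard settings here: beta_2=0.99 and
- # eps=1e-09, versus the commonly used 0.999 and 1e-08.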
- adam_moments['mean'] = beta_1 * adam_moments['mean'] + (1 - beta_1) * grad
- adam_moments['var'] = beta_2 * adam_moments['var'] + (1 - beta_2) * (grad**2)
- unbiased_m = adam_moments['mean'] / (1 - beta_1**adam_moments['count'])
- unbiased_v = adam_moments['var'] / (1 - beta_2**adam_moments['count'])
- variable -= learning_rate * unbiased_m / (np.sqrt(unbiased_v) + eps)
- adam_moments['count'] += 1
-
- return variable, adam_moments
-
-
-def gromov_wasserstein_linear_unmixing(C, Cdict, reg=0., p=None, q=None, tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, **kwargs):
- r"""
- Returns the Gromov-Wasserstein linear unmixing of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary :math:`\{ (\mathbf{C_{dict}[d]}, \mathbf{q}) \}_{d \in [D]}`.
-
- .. math::
- \min_{ \mathbf{w}} GW_2(\mathbf{C}, \sum_{d=1}^D w_d\mathbf{C_{dict}[d]}, \mathbf{p}, \mathbf{q}) - reg \| \mathbf{w} \|_2^2
-
- such that:
-
- - :math:`\mathbf{w}^\top \mathbf{1}_D = 1`
- - :math:`\mathbf{w} \geq \mathbf{0}_D`
-
- Where :
-
- - :math:`\mathbf{C}` is the (ns,ns) pairwise similarity matrix.
- - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of size nt.
- - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights.
- - reg is the regularization coefficient.
-
- The algorithm used for solving the problem is a Block Coordinate Descent as discussed in [38], algorithm 1.
-
- Parameters
- ----------
- C : array-like, shape (ns, ns)
- Metric/Graph cost matrix.
- Cdict : D array-like, shape (D,nt,nt)
- Metric/Graph cost matrices composing the dictionary on which to embed C.
- reg : float, optional.
- Coefficient of the negative quadratic regularization used to promote sparsity of w. Default is 0.
- p : array-like, shape (ns,), optional
- Distribution in the source space C. Default is None and corresponds to uniform distribution.
- q : array-like, shape (nt,), optional
- Distribution in the space depicted by the dictionary. Default is None and corresponds to uniform distribution.
- tol_outer : float, optional
- Solver precision for the BCD algorithm. Default is :math:`10^{-5}`.
- tol_inner : float, optional
- Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport. Default is :math:`10^{-5}`.
- max_iter_outer : int, optional
- Maximum number of iterations for the BCD. Default is 20.
- max_iter_inner : int, optional
- Maximum number of iterations for the Conjugate Gradient. Default is 200.
-
- Returns
- -------
- w: array-like, shape (D,)
- gromov-wasserstein linear unmixing of :math:`(\mathbf{C},\mathbf{p})` onto the span of the dictionary.
- Cembedded: array-like, shape (nt,nt)
- embedded structure of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary, :math:`\sum_d w_d\mathbf{C_{dict}[d]}`.
- T: array-like (ns, nt)
- Gromov-Wasserstein transport plan between :math:`(\mathbf{C},\mathbf{p})` and :math:`(\sum_d w_d\mathbf{C_{dict}[d]}, \mathbf{q})`
- current_loss: float
- reconstruction error
- References
- ----------
- .. [38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty.
- "Online Graph Dictionary Learning"
- International Conference on Machine Learning (ICML). 2021.
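-
- Examples
- --------
- A minimal sketch, assuming a random toy graph and dictionary (illustrative
- only):
-
- >>> import numpy as np
- >>> from ot.gromov import gromov_wasserstein_linear_unmixing
- >>> C = np.random.rand(5, 5)
- >>> C = 0.5 * (C + C.T)
- >>> Cdict = np.random.rand(2, 3, 3)
- >>> w, Cembedded, T, loss = gromov_wasserstein_linear_unmixing(C, Cdict)
- >>> w.shape, Cembedded.shape, T.shape
- ((2,), (3, 3), (5, 3))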
- """
- C0, Cdict0 = C, Cdict
- nx = get_backend(C0, Cdict0)
- C = nx.to_numpy(C0)
- Cdict = nx.to_numpy(Cdict0)
- if p is None:
- p = unif(C.shape[0])
- else:
- p = nx.to_numpy(p)
-
- if q is None:
- q = unif(Cdict.shape[-1])
- else:
- q = nx.to_numpy(q)
-
- T = p[:, None] * q[None, :]
- D = len(Cdict)
-
- w = unif(D) # Initialize uniformly the unmixing w
- Cembedded = np.sum(w[:, None, None] * Cdict, axis=0)
-
- const_q = q[:, None] * q[None, :]
- # Trackers for BCD convergence
- convergence_criterion = np.inf
- current_loss = 10**15
- outer_count = 0
-
- while (convergence_criterion > tol_outer) and (outer_count < max_iter_outer):
- previous_loss = current_loss
- # 1. Solve GW transport between (C,p) and (\sum_d w_d Cdict[d], q) fixing the unmixing w
- T, log = gromov_wasserstein(C1=C, C2=Cembedded, p=p, q=q, loss_fun='square_loss', G0=T, log=True, armijo=False, **kwargs)
- current_loss = log['gw_dist']
- if reg != 0:
- current_loss -= reg * np.sum(w**2)
-
- # 2. Solve linear unmixing problem over w with a fixed transport plan T
- w, Cembedded, current_loss = _cg_gromov_wasserstein_unmixing(
- C=C, Cdict=Cdict, Cembedded=Cembedded, w=w, const_q=const_q, T=T,
- starting_loss=current_loss, reg=reg, tol=tol_inner, max_iter=max_iter_inner, **kwargs
- )
-
- if previous_loss != 0:
- convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss)
- else: # handle numerical issues around 0
- convergence_criterion = abs(previous_loss - current_loss) / 10**(-15)
- outer_count += 1
-
- return nx.from_numpy(w), nx.from_numpy(Cembedded), nx.from_numpy(T), nx.from_numpy(current_loss)
-
-
-def _cg_gromov_wasserstein_unmixing(C, Cdict, Cembedded, w, const_q, T, starting_loss, reg=0., tol=10**(-5), max_iter=200, **kwargs):
- r"""
- Returns for a fixed admissible transport plan,
- the linear unmixing w minimizing the Gromov-Wasserstein cost between :math:`(\mathbf{C},\mathbf{p})` and :math:`(\sum_d w_d \mathbf{C_{dict}[d]}, \mathbf{q})`
-
- .. math::
- \min_{\mathbf{w}} \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D w_d C_{dict}[d]_{k,l})^2 T_{i,k} T_{j,l} - reg \| \mathbf{w} \|_2^2
-
-
- Such that:
-
- - :math:`\mathbf{w}^\top \mathbf{1}_D = 1`
- - :math:`\mathbf{w} \geq \mathbf{0}_D`
-
- Where :
-
- - :math:`\mathbf{C}` is the (ns,ns) pairwise similarity matrix.
- - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of nt points.
- - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights.
- - :math:`\mathbf{w}` is the linear unmixing of :math:`(\mathbf{C}, \mathbf{p})` onto :math:`(\sum_d w_d \mathbf{Cdict[d]}, \mathbf{q})`.
- - :math:`\mathbf{T}` is the optimal transport plan conditioned by the current state of :math:`\mathbf{w}`.
- - reg is the regularization coefficient.
-
- The algorithm used for solving the problem is a Conditional Gradient Descent as discussed in [38]
-
- Parameters
- ----------
-
- C : array-like, shape (ns, ns)
- Metric/Graph cost matrix.
- Cdict : list of D array-like, shape (nt,nt)
- Metric/Graph cost matrices composing the dictionary on which to embed C.
- Each matrix in the dictionary must have the same size (nt,nt).
- Cembedded: array-like, shape (nt,nt)
- Embedded structure :math:`(\sum_d w[d]*Cdict[d],q)` of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary. Used to avoid redundant computations.
- w: array-like, shape (D,)
- Linear unmixing of the input structure onto the dictionary
- const_q: array-like, shape (nt,nt)
- product matrix :math:`\mathbf{q}\mathbf{q}^\top` where q is the target space distribution. Used to avoid redundant computations.
- T: array-like, shape (ns,nt)
- fixed transport plan between the input structure and its representation in the dictionary.
- starting_loss: float
- Value of the reconstruction loss at the provided starting point. Used to avoid redundant computations.
- reg : float, optional
- Coefficient of the negative quadratic regularization used to promote sparsity of w. Default is 0.
- tol : float, optional
- Solver precision measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`.
- max_iter : int, optional
- Maximum number of iterations. Default is 200.
-
- Returns
- -------
- w: ndarray (D,)
- Optimal unmixing of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary span given the fixed OT plan, starting from the previously optimal unmixing.
- Cembedded: ndarray (nt, nt)
- Updated embedded structure :math:`\sum_d w_d \mathbf{C_{dict}[d]}`.
- current_loss: float
- Value of the reconstruction loss after the update.
- """
- convergence_criterion = np.inf
- current_loss = starting_loss
- count = 0
- const_TCT = np.transpose(C.dot(T)).dot(T)
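- # const_TCT = (C T)^T T = T^T C^T T: the data-dependent term reused in the gradient at every CG iteration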
-
- while (convergence_criterion > tol) and (count < max_iter):
-
- previous_loss = current_loss
- # 1) Compute gradient at current point w
- grad_w = 2 * np.sum(Cdict * (Cembedded[None, :, :] * const_q[None, :, :] - const_TCT[None, :, :]), axis=(1, 2))
- grad_w -= 2 * reg * w
-
- # 2) Conditional gradient direction finding: x= \argmin_x x^T.grad_w
- min_ = np.min(grad_w)
- x = (grad_w == min_).astype(np.float64)
- x /= np.sum(x)
-
- # 3) Line-search step: solve \argmin_{\gamma \in [0,1]} a*gamma^2 + b*gamma + c
- gamma, a, b, Cembedded_diff = _linesearch_gromov_wasserstein_unmixing(w, grad_w, x, Cdict, Cembedded, const_q, const_TCT, reg)
-
- # 4) Updates: w <-- (1-gamma)*w + gamma*x
- w += gamma * (x - w)
- Cembedded += gamma * Cembedded_diff
- current_loss += a * (gamma**2) + b * gamma
-
- if previous_loss != 0: # note that the loss can be negative if reg > 0
- convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss)
- else: # handle numerical issues around 0
- convergence_criterion = abs(previous_loss - current_loss) / 10**(-15)
- count += 1
-
- return w, Cembedded, current_loss
-
-
-def _linesearch_gromov_wasserstein_unmixing(w, grad_w, x, Cdict, Cembedded, const_q, const_TCT, reg, **kwargs):
- r"""
- Compute optimal steps for the line search problem of Gromov-Wasserstein linear unmixing
-
- .. math::
- \min_{\gamma \in [0,1]} \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D z_d(\gamma) C_{dict}[d]_{k,l})^2 T_{i,k} T_{j,l} - reg \| \mathbf{z}(\gamma) \|_2^2
-
-
- Such that:
-
- - :math:`\mathbf{z}(\gamma) = (1- \gamma)\mathbf{w} + \gamma \mathbf{x}`
-
- Parameters
- ----------
-
- w : array-like, shape (D,)
- Unmixing.
- grad_w : array-like, shape (D,)
- Gradient of the reconstruction loss with respect to w.
- x: array-like, shape (D,)
- Conditional gradient direction.
- Cdict : list of D array-like, shape (nt,nt)
- Metric/Graph cost matrices composing the dictionary on which to embed C.
- Each matrix in the dictionary must have the same size (nt,nt).
- Cembedded: array-like, shape (nt,nt)
- Embedded structure :math:`(\sum_d w_dCdict[d],q)` of :math:`(\mathbf{C},\mathbf{p})` onto the dictionary. Used to avoid redundant computations.
- const_q: array-like, shape (nt,nt)
- product matrix :math:`\mathbf{q}\mathbf{q}^\top` where q is the target space distribution. Used to avoid redundant computations.
- const_TCT: array-like, shape (nt, nt)
- :math:`\mathbf{T}^\top \mathbf{C}^\top \mathbf{T}`. Used to avoid redundant computations.
- reg : float, optional
- Coefficient of the negative quadratic regularization used to promote sparsity of :math:`\mathbf{w}`.
-
- Returns
- -------
- gamma: float
- Optimal value for the line-search step
- a: float
- Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma + c` of the reconstruction loss
- b: float
- Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma + c` of the reconstruction loss
- Cembedded_diff: numpy array, shape (nt, nt)
- Difference between the models evaluated in :math:`\mathbf{x}` and in :math:`\mathbf{w}`.
- """
-
- # 3) Line-search step: solve \argmin_{\gamma \in [0,1]} a*gamma^2 + b*gamma + c
- Cembedded_x = np.sum(x[:, None, None] * Cdict, axis=0)
- Cembedded_diff = Cembedded_x - Cembedded
- trace_diffx = np.sum(Cembedded_diff * Cembedded_x * const_q)
- trace_diffw = np.sum(Cembedded_diff * Cembedded * const_q)
- a = trace_diffx - trace_diffw
- b = 2 * (trace_diffw - np.sum(Cembedded_diff * const_TCT))
- if reg != 0:
- a -= reg * np.sum((x - w)**2)
- b -= 2 * reg * np.sum(w * (x - w))
-
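- # minimize a * gamma^2 + b * gamma over [0, 1]: interior optimum -b / (2a) if convex (a > 0),
- # otherwise pick the best endpoint (the objective equals a + b at gamma=1 and 0 at gamma=0)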
- if a > 0:
- gamma = min(1, max(0, - b / (2 * a)))
- elif a + b < 0:
- gamma = 1
- else:
- gamma = 0
-
- return gamma, a, b, Cembedded_diff
-
-
-def fused_gromov_wasserstein_dictionary_learning(Cs, Ys, D, nt, alpha, reg=0., ps=None, q=None, epochs=20, batch_size=32, learning_rate_C=1., learning_rate_Y=1.,
- Cdict_init=None, Ydict_init=None, projection='nonnegative_symmetric', use_log=False,
- tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, use_adam_optimizer=True, verbose=False, **kwargs):
- r"""
- Infer Fused Gromov-Wasserstein linear dictionary :math:`\{ (\mathbf{C_{dict}[d]}, \mathbf{Y_{dict}[d]}, \mathbf{q}) \}_{d \in [D]}` from the list of S attributed structures :math:`\{ (\mathbf{C_s}, \mathbf{Y_s},\mathbf{p_s}) \}_s`
-
- .. math::
- \min_{\mathbf{C_{dict}},\mathbf{Y_{dict}}, \{\mathbf{w_s}\}_{s}} \sum_{s=1}^S FGW_{2,\alpha}(\mathbf{C_s}, \mathbf{Y_s}, \sum_{d=1}^D w_{s,d}\mathbf{C_{dict}[d]},\sum_{d=1}^D w_{s,d}\mathbf{Y_{dict}[d]}, \mathbf{p_s}, \mathbf{q}) \\ - reg\| \mathbf{w_s} \|_2^2
-
-
- Such that :math:`\forall s \leq S` :
-
- - :math:`\mathbf{w_s}^\top \mathbf{1}_D = 1`
- - :math:`\mathbf{w_s} \geq \mathbf{0}_D`
-
- Where :
-
- - :math:`\forall s \leq S, \mathbf{C_s}` is a (ns,ns) pairwise similarity matrix of variable size ns.
- - :math:`\forall s \leq S, \mathbf{Y_s}` is a (ns,d) features matrix of variable size ns and fixed dimension d.
- - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of fixed size nt.
- - :math:`\mathbf{Y_{dict}}` is a (D, nt, d) tensor of D feature matrices of fixed size nt and fixed dimension d.
- - :math:`\forall s \leq S, \mathbf{p_s}` is the source distribution corresponding to :math:`\mathbf{C_s}`
- - :math:`\mathbf{q}` is the target distribution assigned to every structure in the embedding space.
- - :math:`\alpha` is the trade-off parameter of Fused Gromov-Wasserstein.
- - reg is the regularization coefficient.
-
-
- The stochastic algorithm used for estimating the attributed graph dictionary atoms is the one proposed in [38].
-
- Parameters
- ----------
- Cs : list of S symmetric array-like, shape (ns, ns)
- List of Metric/Graph cost matrices of variable size (ns,ns).
- Ys : list of S array-like, shape (ns, d)
- List of feature matrix of variable size (ns,d) with d fixed.
- D: int
- Number of dictionary atoms to learn
- nt: int
- Number of samples within each dictionary atoms
- alpha : float
- Trade-off parameter of Fused Gromov-Wasserstein
- reg : float, optional
- Coefficient of the negative quadratic regularization used to promote sparsity of w. The default is 0.
- ps : list of S array-like, shape (ns,), optional
- Distribution in each source space C of Cs. Default is None and corresponds to uniform distributions.
- q : array-like, shape (nt,), optional
- Distribution in the embedding space whose structure will be learned. Default is None and corresponds to a uniform distribution.
- epochs: int, optional
- Number of epochs used to learn the dictionary. Default is 20.
- batch_size: int, optional
- Batch size for each stochastic gradient update of the dictionary. Set to the dataset size if the provided batch_size is higher than the dataset size. Default is 32.
- learning_rate_C: float, optional
- Learning rate used for the stochastic gradient descent on Cdict. Default is 1.
- learning_rate_Y: float, optional
- Learning rate used for the stochastic gradient descent on Ydict. Default is 1.
- Cdict_init: list of D array-like with shape (nt, nt), optional
- Used to initialize the dictionary structures Cdict.
- If set to None (Default), the dictionary will be initialized randomly.
- Else Cdict must have shape (D, nt, nt), i.e. match the provided shape parameters.
- Ydict_init: list of D array-like with shape (nt, d), optional
- Used to initialize the dictionary features Ydict.
- If set to None, the dictionary features will be initialized randomly.
- Else Ydict must have shape (D, nt, d), where d is the feature dimension of the inputs Ys, and also match the provided shape parameters.
- projection: str, optional
- If 'nonnegative' and/or 'symmetric' is in projection, the corresponding projection will be performed at each stochastic update of the dictionary.
- Else the set of atoms is :math:`\mathbb{R}^{nt \times nt}`. Default is 'nonnegative_symmetric'.
- use_log: bool, optional
- If set to True, losses evolution by batches and epochs are tracked. Default is False.
- use_adam_optimizer: bool, optional
- If set to True, the Adam optimizer with default settings is used as the adaptive learning rate strategy.
- Else perform SGD with a fixed learning rate. Default is True.
- tol_outer : float, optional
- Solver precision for the BCD algorithm, measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`.
- tol_inner : float, optional
- Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport, measured by absolute relative error on consecutive losses. Default is :math:`10^{-5}`.
- max_iter_outer : int, optional
- Maximum number of iterations for the BCD. Default is 20.
- max_iter_inner : int, optional
- Maximum number of iterations for the Conjugate Gradient. Default is 200.
- verbose : bool, optional
- Print the reconstruction loss every epoch. Default is False.
-
- Returns
- -------
-
- Cdict_best_state : D array-like, shape (D,nt,nt)
- Metric/Graph cost matrices composing the dictionary.
- The dictionary leading to the best loss over an epoch is saved and returned.
- Ydict_best_state : D array-like, shape (D,nt,d)
- Feature matrices composing the dictionary.
- The dictionary leading to the best loss over an epoch is saved and returned.
- log: dict
- If use_log is True, contains loss evolutions by batches and epochs.
- References
- ----------
- .. [38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty.
- "Online Graph Dictionary Learning"
- International Conference on Machine Learning (ICML). 2021.
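-
- Examples
- --------
- A minimal usage sketch on random toy attributed graphs (illustrative
- assumptions only; learned values depend on the random initialization):
-
- >>> import numpy as np
- >>> from ot.gromov import fused_gromov_wasserstein_dictionary_learning
- >>> Cs = [np.random.rand(n, n) for n in [4, 5]]
- >>> Cs = [0.5 * (C + C.T) for C in Cs]  # symmetrize the toy structures
- >>> Ys = [np.random.rand(n, 2) for n in [4, 5]]  # 2-dimensional node features
- >>> Cdict, Ydict, log = fused_gromov_wasserstein_dictionary_learning(
- ...     Cs, Ys, D=2, nt=3, alpha=0.5, epochs=1, batch_size=2)
- >>> Cdict.shape, Ydict.shape
- ((2, 3, 3), (2, 3, 2))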
- """
- Cs0, Ys0 = Cs, Ys
- nx = get_backend(*Cs0, *Ys0)
- Cs = [nx.to_numpy(C) for C in Cs0]
- Ys = [nx.to_numpy(Y) for Y in Ys0]
-
- d = Ys[0].shape[-1]
- dataset_size = len(Cs)
-
- if ps is None:
- ps = [unif(C.shape[0]) for C in Cs]
- else:
- ps = [nx.to_numpy(p) for p in ps]
- if q is None:
- q = unif(nt)
- else:
- q = nx.to_numpy(q)
-
- if Cdict_init is None:
- # Initialize randomly structures of dictionary atoms based on samples
- dataset_means = [C.mean() for C in Cs]
- Cdict = np.random.normal(loc=np.mean(dataset_means), scale=np.std(dataset_means), size=(D, nt, nt))
- else:
- Cdict = nx.to_numpy(Cdict_init).copy()
- assert Cdict.shape == (D, nt, nt)
- if Ydict_init is None:
- # Initialize randomly features of dictionary atoms based on samples distribution by feature component
- dataset_feature_means = np.stack([F.mean(axis=0) for F in Ys])
- Ydict = np.random.normal(loc=dataset_feature_means.mean(axis=0), scale=dataset_feature_means.std(axis=0), size=(D, nt, d))
- else:
- Ydict = nx.to_numpy(Ydict_init).copy()
- assert Ydict.shape == (D, nt, d)
-
- if 'symmetric' in projection:
- Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1)))
- if 'nonnegative' in projection:
- Cdict[Cdict < 0.] = 0.
-
- if use_adam_optimizer:
- adam_moments_C = _initialize_adam_optimizer(Cdict)
- adam_moments_Y = _initialize_adam_optimizer(Ydict)
-
- log = {'loss_batches': [], 'loss_epochs': []}
- const_q = q[:, None] * q[None, :]
- diag_q = np.diag(q)
- Cdict_best_state = Cdict.copy()
- Ydict_best_state = Ydict.copy()
- loss_best_state = np.inf
- if batch_size > dataset_size:
- batch_size = dataset_size
- iter_by_epoch = dataset_size // batch_size + int((dataset_size % batch_size) > 0)
-
- for epoch in range(epochs):
- cumulated_loss_over_epoch = 0.
-
- for _ in range(iter_by_epoch):
-
- # Batch iterations
- batch = np.random.choice(range(dataset_size), size=batch_size, replace=False)
- cumulated_loss_over_batch = 0.
- unmixings = np.zeros((batch_size, D))
- Cs_embedded = np.zeros((batch_size, nt, nt))
- Ys_embedded = np.zeros((batch_size, nt, d))
- Ts = [None] * batch_size
-
- for batch_idx, C_idx in enumerate(batch):
- # BCD solver for Fused Gromov-Wasserstein linear unmixing used independently on each structure of the sampled batch
- unmixings[batch_idx], Cs_embedded[batch_idx], Ys_embedded[batch_idx], Ts[batch_idx], current_loss = fused_gromov_wasserstein_linear_unmixing(
- Cs[C_idx], Ys[C_idx], Cdict, Ydict, alpha, reg=reg, p=ps[C_idx], q=q,
- tol_outer=tol_outer, tol_inner=tol_inner, max_iter_outer=max_iter_outer, max_iter_inner=max_iter_inner
- )
- cumulated_loss_over_batch += current_loss
- cumulated_loss_over_epoch += cumulated_loss_over_batch
- if use_log:
- log['loss_batches'].append(cumulated_loss_over_batch)
-
- # Stochastic projected gradient step over dictionary atoms
- grad_Cdict = np.zeros_like(Cdict)
- grad_Ydict = np.zeros_like(Ydict)
-
- for batch_idx, C_idx in enumerate(batch):
- shared_term_structures = Cs_embedded[batch_idx] * const_q - (Cs[C_idx].dot(Ts[batch_idx])).T.dot(Ts[batch_idx])
- shared_term_features = diag_q.dot(Ys_embedded[batch_idx]) - Ts[batch_idx].T.dot(Ys[C_idx])
- grad_Cdict += alpha * unmixings[batch_idx][:, None, None] * shared_term_structures[None, :, :]
- grad_Ydict += (1 - alpha) * unmixings[batch_idx][:, None, None] * shared_term_features[None, :, :]
- grad_Cdict *= 2 / batch_size
- grad_Ydict *= 2 / batch_size
-
- if use_adam_optimizer:
- Cdict, adam_moments_C = _adam_stochastic_updates(Cdict, grad_Cdict, learning_rate_C, adam_moments_C)
- Ydict, adam_moments_Y = _adam_stochastic_updates(Ydict, grad_Ydict, learning_rate_Y, adam_moments_Y)
- else:
- Cdict -= learning_rate_C * grad_Cdict
- Ydict -= learning_rate_Y * grad_Ydict
-
- if 'symmetric' in projection:
- Cdict = 0.5 * (Cdict + Cdict.transpose((0, 2, 1)))
- if 'nonnegative' in projection:
- Cdict[Cdict < 0.] = 0.
-
- if use_log:
- log['loss_epochs'].append(cumulated_loss_over_epoch)
- if loss_best_state > cumulated_loss_over_epoch:
- loss_best_state = cumulated_loss_over_epoch
- Cdict_best_state = Cdict.copy()
- Ydict_best_state = Ydict.copy()
- if verbose:
- print('--- epoch: ', epoch, ' cumulated reconstruction error: ', cumulated_loss_over_epoch)
-
- return nx.from_numpy(Cdict_best_state), nx.from_numpy(Ydict_best_state), log
-
-
-def fused_gromov_wasserstein_linear_unmixing(C, Y, Cdict, Ydict, alpha, reg=0., p=None, q=None, tol_outer=10**(-5), tol_inner=10**(-5), max_iter_outer=20, max_iter_inner=200, **kwargs):
- r"""
- Returns the Fused Gromov-Wasserstein linear unmixing of :math:`(\mathbf{C},\mathbf{Y},\mathbf{p})` onto the attributed dictionary atoms :math:`\{ (\mathbf{C_{dict}[d]},\mathbf{Y_{dict}[d]}, \mathbf{q}) \}_{d \in [D]}`
-
- .. math::
- \min_{\mathbf{w}} FGW_{2,\alpha}(\mathbf{C},\mathbf{Y}, \sum_{d=1}^D w_d\mathbf{C_{dict}[d]},\sum_{d=1}^D w_d\mathbf{Y_{dict}[d]}, \mathbf{p}, \mathbf{q}) - reg \| \mathbf{w} \|_2^2
-
- such that:
-
- - :math:`\mathbf{w}^\top \mathbf{1}_D = 1`
- - :math:`\mathbf{w} \geq \mathbf{0}_D`
-
- Where :
-
- - :math:`\mathbf{C}` is a (ns,ns) pairwise similarity matrix.
- - :math:`\mathbf{Y}` is a (ns,d) feature matrix of fixed dimension d.
- - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of fixed size nt.
- - :math:`\mathbf{Y_{dict}}` is a (D, nt, d) tensor of D feature matrices of fixed size nt and fixed dimension d.
- - :math:`\mathbf{p}` is the source distribution corresponding to :math:`\mathbf{C}`.
- - :math:`\mathbf{q}` is the target distribution assigned to every structure in the embedding space.
- - :math:`\alpha` is the trade-off parameter of Fused Gromov-Wasserstein.
- - reg is the regularization coefficient.
-
- The algorithm used for solving the problem is a Block Coordinate Descent as discussed in [38], algorithm 6.
-
- Parameters
- ----------
- C : array-like, shape (ns, ns)
- Metric/Graph cost matrix.
- Y : array-like, shape (ns, d)
- Feature matrix.
- Cdict : D array-like, shape (D,nt,nt)
- Metric/Graph cost matrices composing the dictionary on which to embed (C,Y).
- Ydict : D array-like, shape (D,nt,d)
- Feature matrices composing the dictionary on which to embed (C,Y).
- alpha: float,
- Trade-off parameter of Fused Gromov-Wasserstein.
- reg : float, optional
- Coefficient of the negative quadratic regularization used to promote sparsity of w. The default is 0.
- p : array-like, shape (ns,), optional
- Distribution in the source space C. Default is None and corresponds to uniform distribution.
- q : array-like, shape (nt,), optional
- Distribution in the space depicted by the dictionary. Default is None and corresponds to uniform distribution.
- tol_outer : float, optional
- Solver precision for the BCD algorithm. Default is :math:`10^{-5}`.
- tol_inner : float, optional
- Solver precision for the Conjugate Gradient algorithm used to get optimal w at a fixed transport. Default is :math:`10^{-5}`.
- max_iter_outer : int, optional
- Maximum number of iterations for the BCD. Default is 20.
- max_iter_inner : int, optional
- Maximum number of iterations for the Conjugate Gradient. Default is 200.
-
- Returns
- -------
- w: array-like, shape (D,)
- fused gromov-wasserstein linear unmixing of (C,Y,p) onto the span of the dictionary.
- Cembedded: array-like, shape (nt,nt)
- embedded structure of :math:`(\mathbf{C},\mathbf{Y}, \mathbf{p})` onto the dictionary, :math:`\sum_d w_d\mathbf{C_{dict}[d]}`.
- Yembedded: array-like, shape (nt,d)
- embedded features of :math:`(\mathbf{C},\mathbf{Y}, \mathbf{p})` onto the dictionary, :math:`\sum_d w_d\mathbf{Y_{dict}[d]}`.
- T: array-like (ns,nt)
- Fused Gromov-Wasserstein transport plan between :math:`(\mathbf{C},\mathbf{p})` and :math:`(\sum_d w_d\mathbf{C_{dict}[d]}, \sum_d w_d\mathbf{Y_{dict}[d]},\mathbf{q})`.
- current_loss: float
- reconstruction error
- References
- ----------
- .. [38] Cédric Vincent-Cuaz, Titouan Vayer, Rémi Flamary, Marco Corneli, Nicolas Courty.
- "Online Graph Dictionary Learning"
- International Conference on Machine Learning (ICML). 2021.
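-
- Examples
- --------
- A minimal sketch, assuming a random toy attributed graph and random
- dictionaries (illustrative only):
-
- >>> import numpy as np
- >>> from ot.gromov import fused_gromov_wasserstein_linear_unmixing
- >>> C = np.random.rand(5, 5)
- >>> C = 0.5 * (C + C.T)
- >>> Y = np.random.rand(5, 2)
- >>> Cdict, Ydict = np.random.rand(2, 3, 3), np.random.rand(2, 3, 2)
- >>> w, Cemb, Yemb, T, loss = fused_gromov_wasserstein_linear_unmixing(C, Y, Cdict, Ydict, alpha=0.5)
- >>> w.shape, Cemb.shape, Yemb.shape, T.shape
- ((2,), (3, 3), (3, 2), (5, 3))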
- """
- C0, Y0, Cdict0, Ydict0 = C, Y, Cdict, Ydict
- nx = get_backend(C0, Y0, Cdict0, Ydict0)
- C = nx.to_numpy(C0)
- Y = nx.to_numpy(Y0)
- Cdict = nx.to_numpy(Cdict0)
- Ydict = nx.to_numpy(Ydict0)
-
- if p is None:
- p = unif(C.shape[0])
- else:
- p = nx.to_numpy(p)
- if q is None:
- q = unif(Cdict.shape[-1])
- else:
- q = nx.to_numpy(q)
-
- T = p[:, None] * q[None, :]
- D = len(Cdict)
- d = Y.shape[-1]
- w = unif(D) # Initialize with uniform weights
- ns = C.shape[-1]
- nt = Cdict.shape[-1]
-
- # modeling (C,Y)
- Cembedded = np.sum(w[:, None, None] * Cdict, axis=0)
- Yembedded = np.sum(w[:, None, None] * Ydict, axis=0)
-
- # constants depending on q
- const_q = q[:, None] * q[None, :]
- diag_q = np.diag(q)
- # Trackers for BCD convergence
- convergence_criterion = np.inf
- current_loss = 10**15
- outer_count = 0
- Ys_constM = (Y**2).dot(np.ones((d, nt))) # constant in computing euclidean pairwise feature matrix
-
- while (convergence_criterion > tol_outer) and (outer_count < max_iter_outer):
- previous_loss = current_loss
-
- # 1. Solve FGW transport between (C,Y,p) and the current model (\sum_d w_d Cdict[d], \sum_d w_d Ydict[d], q) fixing the unmixing w
- Yt_varM = (np.ones((ns, d))).dot((Yembedded**2).T)
- M = Ys_constM + Yt_varM - 2 * Y.dot(Yembedded.T) # euclidean distance matrix between features
- T, log = fused_gromov_wasserstein(M, C, Cembedded, p, q, loss_fun='square_loss', alpha=alpha, armijo=False, G0=T, log=True)
- current_loss = log['fgw_dist']
- if reg != 0:
- current_loss -= reg * np.sum(w**2)
-
- # 2. Solve linear unmixing problem over w with a fixed transport plan T
- w, Cembedded, Yembedded, current_loss = _cg_fused_gromov_wasserstein_unmixing(C, Y, Cdict, Ydict, Cembedded, Yembedded, w,
- T, p, q, const_q, diag_q, current_loss, alpha, reg,
- tol=tol_inner, max_iter=max_iter_inner, **kwargs)
- if previous_loss != 0:
- convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss)
- else:
- convergence_criterion = abs(previous_loss - current_loss) / 10**(-12)
- outer_count += 1
-
- return nx.from_numpy(w), nx.from_numpy(Cembedded), nx.from_numpy(Yembedded), nx.from_numpy(T), nx.from_numpy(current_loss)
-
-
-def _cg_fused_gromov_wasserstein_unmixing(C, Y, Cdict, Ydict, Cembedded, Yembedded, w, T, p, q, const_q, diag_q, starting_loss, alpha, reg, tol=10**(-6), max_iter=200, **kwargs):
- r"""
- Returns for a fixed admissible transport plan,
- the optimal linear unmixing :math:`\mathbf{w}` minimizing the Fused Gromov-Wasserstein cost between :math:`(\mathbf{C},\mathbf{Y},\mathbf{p})` and :math:`(\sum_d w_d \mathbf{C_{dict}[d]}, \sum_d w_d \mathbf{Y_{dict}[d]}, \mathbf{q})`
-
- .. math::
- \min_{\mathbf{w}} \alpha \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D w_d C_{dict}[d]_{k,l})^2 T_{i,k} T_{j,l} \\ + (1-\alpha) \sum_{ij} \| \mathbf{Y_i} - \sum_d w_d \mathbf{Y_{dict}[d]_j} \|_2^2 T_{i,j} - reg \| \mathbf{w} \|_2^2
-
- Such that :
-
- - :math:`\mathbf{w}^\top \mathbf{1}_D = 1`
- - :math:`\mathbf{w} \geq \mathbf{0}_D`
-
- Where :
-
- - :math:`\mathbf{C}` is a (ns,ns) pairwise similarity matrix.
- - :math:`\mathbf{Y}` is a (ns,d) feature matrix of fixed dimension d.
- - :math:`\mathbf{C_{dict}}` is a (D, nt, nt) tensor of D pairwise similarity matrices of fixed size nt.
- - :math:`\mathbf{Y_{dict}}` is a (D, nt, d) tensor of D feature matrices of fixed size nt and fixed dimension d.
- - :math:`\mathbf{p}` is the source distribution corresponding to :math:`\mathbf{C}`.
- - :math:`\mathbf{q}` is the target distribution assigned to every structure in the embedding space.
- - :math:`\mathbf{T}` is the optimal transport plan conditioned by the previous state of :math:`\mathbf{w}`
- - :math:`\alpha` is the trade-off parameter of Fused Gromov-Wasserstein
- - reg is the regularization coefficient.
-
- The algorithm used for solving the problem is a Conditional Gradient Descent as discussed in [38], algorithm 7.
-
- Parameters
- ----------
-
- C : array-like, shape (ns, ns)
- Metric/Graph cost matrix.
- Y : array-like, shape (ns, d)
- Feature matrix.
- Cdict : list of D array-like, shape (nt,nt)
- Metric/Graph cost matrices composing the dictionary on which to embed (C,Y).
- Each matrix in the dictionary must have the same size (nt,nt).
- Ydict : list of D array-like, shape (nt,d)
- Feature matrices composing the dictionary on which to embed (C,Y).
- Each matrix in the dictionary must have the same size (nt,d).
- Cembedded: array-like, shape (nt,nt)
- Embedded structure of (C,Y) onto the dictionary
- Yembedded: array-like, shape (nt,d)
- Embedded features of (C,Y) onto the dictionary
- w: array-like, shape (D,)
- Linear unmixing of (C,Y) onto (Cdict,Ydict)
- const_q: array-like, shape (nt,nt)
- product matrix :math:`\mathbf{qq}^\top` where :math:`\mathbf{q}` is the target space distribution.
- diag_q: array-like, shape (nt,nt)
- diagonal matrix with values of q on the diagonal.
- T: array-like, shape (ns,nt)
- fixed transport plan between (C,Y) and its model
- p : array-like, shape (ns,)
- Distribution in the source space (C,Y).
- q : array-like, shape (nt,)
- Distribution in the embedding space depicted by the dictionary.
- alpha: float,
- Trade-off parameter of Fused Gromov-Wasserstein.
- reg : float, optional
- Coefficient of the negative quadratic regularization used to promote sparsity of w.
- starting_loss: float
- Value of the reconstruction loss at the provided starting point. Used to avoid redundant computations.
-
- Returns
- -------
- w: ndarray (D,)
- Linear unmixing of :math:`(\mathbf{C},\mathbf{Y},\mathbf{p})` onto the span of :math:`(C_{dict},Y_{dict})`, given the OT plan corresponding to the previous unmixing.
- Cembedded: ndarray (nt, nt)
- Updated embedded structure :math:`\sum_d w_d \mathbf{C_{dict}[d]}`.
- Yembedded: ndarray (nt, d)
- Updated embedded features :math:`\sum_d w_d \mathbf{Y_{dict}[d]}`.
- current_loss: float
- Value of the reconstruction loss after the update.
- """
- convergence_criterion = np.inf
- current_loss = starting_loss
- count = 0
- const_TCT = np.transpose(C.dot(T)).dot(T)
- ones_ns_d = np.ones(Y.shape)
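- # ones_ns_d is reused by the line search to broadcast squared feature norms of the model over the (ns, nt) plan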
-
- while (convergence_criterion > tol) and (count < max_iter):
- previous_loss = current_loss
-
- # 1) Compute gradient at current point w
- # structure
- grad_w = alpha * np.sum(Cdict * (Cembedded[None, :, :] * const_q[None, :, :] - const_TCT[None, :, :]), axis=(1, 2))
- # feature
- grad_w += (1 - alpha) * np.sum(Ydict * (diag_q.dot(Yembedded)[None, :, :] - T.T.dot(Y)[None, :, :]), axis=(1, 2))
- grad_w -= reg * w
- grad_w *= 2
-
- # 2) Conditional gradient direction finding: x= \argmin_x x^T.grad_w
- min_ = np.min(grad_w)
- x = (grad_w == min_).astype(np.float64)
- x /= np.sum(x)
-
- # 3) Line-search step: solve \argmin_{\gamma \in [0,1]} a*gamma^2 + b*gamma + c
- gamma, a, b, Cembedded_diff, Yembedded_diff = _linesearch_fused_gromov_wasserstein_unmixing(w, grad_w, x, Y, Cdict, Ydict, Cembedded, Yembedded, T, const_q, const_TCT, ones_ns_d, alpha, reg)
-
- # 4) Updates: w <-- (1-gamma)*w + gamma*x
- w += gamma * (x - w)
- Cembedded += gamma * Cembedded_diff
- Yembedded += gamma * Yembedded_diff
- current_loss += a * (gamma**2) + b * gamma
-
- if previous_loss != 0:
- convergence_criterion = abs(previous_loss - current_loss) / abs(previous_loss)
- else:
- convergence_criterion = abs(previous_loss - current_loss) / 10**(-12)
- count += 1
-
- return w, Cembedded, Yembedded, current_loss
-
-
-def _linesearch_fused_gromov_wasserstein_unmixing(w, grad_w, x, Y, Cdict, Ydict, Cembedded, Yembedded, T, const_q, const_TCT, ones_ns_d, alpha, reg, **kwargs):
- r"""
- Compute optimal steps for the line search problem of Fused Gromov-Wasserstein linear unmixing
-
- .. math::
- \min_{\gamma \in [0,1]} \alpha \sum_{ijkl} (C_{i,j} - \sum_{d=1}^D z_d(\gamma) C_{dict}[d]_{k,l})^2 T_{i,k} T_{j,l} \\ + (1-\alpha) \sum_{ij} \| \mathbf{Y_i} - \sum_d z_d(\gamma) \mathbf{Y_{dict}[d]_j} \|_2^2 T_{i,j} - reg \| \mathbf{z}(\gamma) \|_2^2
-
-
- Such that :
-
- - :math:`\mathbf{z}(\gamma) = (1- \gamma)\mathbf{w} + \gamma \mathbf{x}`
-
- Parameters
- ----------
-
- w : array-like, shape (D,)
- Unmixing.
- grad_w : array-like, shape (D,)
- Gradient of the reconstruction loss with respect to w.
- x: array-like, shape (D,)
- Conditional gradient direction.
- Y: array-like, shape (ns,d)
- Feature matrix of the input space
- Cdict : list of D array-like, shape (nt, nt)
- Metric/Graph cost matrices composing the dictionary on which to embed (C,Y).
- Each matrix in the dictionary must have the same size (nt,nt).
- Ydict : list of D array-like, shape (nt, d)
- Feature matrices composing the dictionary on which to embed (C,Y).
- Each matrix in the dictionary must have the same size (nt,d).
- Cembedded: array-like, shape (nt, nt)
- Embedded structure of (C,Y) onto the dictionary
- Yembedded: array-like, shape (nt, d)
- Embedded features of (C,Y) onto the dictionary
- T: array-like, shape (ns, nt)
- Fixed transport plan between (C,Y) and its current model.
- const_q: array-like, shape (nt,nt)
- product matrix :math:`\mathbf{q}\mathbf{q}^\top` where q is the target space distribution. Used to avoid redundant computations.
- const_TCT: array-like, shape (nt, nt)
- :math:`\mathbf{T}^\top \mathbf{C}^\top \mathbf{T}`. Used to avoid redundant computations.
- ones_ns_d: array-like, shape (ns, d)
- :math:`\mathbf{1}_{ ns \times d}`. Used to avoid redundant computations.
- alpha: float,
- Trade-off parameter of Fused Gromov-Wasserstein.
- reg : float, optional
- Coefficient of the negative quadratic regularization used to promote sparsity of w.
-
- Returns
- -------
- gamma: float
- Optimal value for the line-search step
- a: float
- Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma +c` of the reconstruction loss
- b: float
- Constant factor appearing in the factorization :math:`a \gamma^2 + b \gamma +c` of the reconstruction loss
- Cembedded_diff: numpy array, shape (nt, nt)
- Difference between the structure matrices of the models evaluated in :math:`\mathbf{x}` and in :math:`\mathbf{w}`.
- Yembedded_diff: numpy array, shape (nt, d)
- Difference between the feature matrices of the models evaluated in :math:`\mathbf{x}` and in :math:`\mathbf{w}`.
- """
- # polynomial coefficients from quadratic objective (with respect to w) on structures
- Cembedded_x = np.sum(x[:, None, None] * Cdict, axis=0)
- Cembedded_diff = Cembedded_x - Cembedded
- trace_diffx = np.sum(Cembedded_diff * Cembedded_x * const_q)
- trace_diffw = np.sum(Cembedded_diff * Cembedded * const_q)
- # Constant factors appearing in the factorization a*gamma^2 + b*gamma + c of the Gromov-Wasserstein reconstruction loss
- a_gw = trace_diffx - trace_diffw
- b_gw = 2 * (trace_diffw - np.sum(Cembedded_diff * const_TCT))
-
- # polynomial coefficient from quadratic objective (with respect to w) on features
- Yembedded_x = np.sum(x[:, None, None] * Ydict, axis=0)
- Yembedded_diff = Yembedded_x - Yembedded
- # Constant factors appearing in the factorization a*gamma^2 + b*gamma + c of the feature (Wasserstein) reconstruction loss
- a_w = np.sum(ones_ns_d.dot((Yembedded_diff**2).T) * T)
- b_w = 2 * np.sum(T * (ones_ns_d.dot((Yembedded * Yembedded_diff).T) - Y.dot(Yembedded_diff.T)))
-
- a = alpha * a_gw + (1 - alpha) * a_w
- b = alpha * b_gw + (1 - alpha) * b_w
- if reg != 0:
- a -= reg * np.sum((x - w)**2)
- b -= 2 * reg * np.sum(w * (x - w))
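- # minimize a * gamma^2 + b * gamma over [0, 1]: interior optimum -b / (2a) if convex (a > 0), else the best endpoint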
- if a > 0:
- gamma = min(1, max(0, -b / (2 * a)))
- elif a + b < 0:
- gamma = 1
- else:
- gamma = 0
-
- return gamma, a, b, Cembedded_diff, Yembedded_diff