summaryrefslogtreecommitdiff
path: root/src/python/gudhi/representations
diff options
context:
space:
mode:
authorMathieuCarriere <mathieu.carriere3@gmail.com>2020-01-31 14:49:59 -0500
committerMathieuCarriere <mathieu.carriere3@gmail.com>2020-01-31 14:49:59 -0500
commita145c7168fdb3f4205cb68870f06fc5cb8e08dea (patch)
tree92b5f5cb051520b13ea7911ae0026c12b28e0cfc /src/python/gudhi/representations
parent85ceea9512634a62664208cd2d0f1ce48bafa171 (diff)
factorization of distance and kernel computations
Diffstat (limited to 'src/python/gudhi/representations')
-rw-r--r--src/python/gudhi/representations/kernel_methods.py131
-rw-r--r--src/python/gudhi/representations/metrics.py247
2 files changed, 193 insertions, 185 deletions
diff --git a/src/python/gudhi/representations/kernel_methods.py b/src/python/gudhi/representations/kernel_methods.py
index bfc83aff..bbbb7c31 100644
--- a/src/python/gudhi/representations/kernel_methods.py
+++ b/src/python/gudhi/representations/kernel_methods.py
@@ -9,13 +9,83 @@
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.metrics import pairwise_distances
-from .metrics import SlicedWassersteinDistance, PersistenceFisherDistance
+from sklearn.metrics import pairwise_distances, pairwise_kernels
+from .metrics import SlicedWassersteinDistance, PersistenceFisherDistance, sklearn_wrapper, pairwise_persistence_diagram_distances, sliced_wasserstein_distance, persistence_fisher_distance
+from .preprocessing import Padding
#############################################
# Kernel methods ############################
#############################################
+def persistence_weighted_gaussian_kernel(D1, D2, weight=lambda x: 1, kernel_approx=None, bandwidth=1.):
+ """
+ This is a function for computing the persistence weighted Gaussian kernel value from two persistence diagrams. The persistence weighted Gaussian kernel is computed by convolving the persistence diagram points with weighted Gaussian kernels. See http://proceedings.mlr.press/v48/kusano16.html for more details.
+ :param D1: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points (i.e. with infinite coordinate).
+ :param D2: (m x 2) numpy.array encoding the second diagram.
+ :param bandwidth: bandwidth of the Gaussian kernel with which persistence diagrams will be convolved
+ :param weight: weight function for the persistence diagram points. This function must be defined on 2D points, ie lists or numpy arrays of the form [p_x,p_y].
+ :param kernel_approx: kernel approximation class used to speed up computation. Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
+ :returns: the persistence weighted Gaussian kernel value between persistence diagrams.
+ :rtype: float
+ """
+ ws1 = np.array([weight(D1[j,:]) for j in range(len(D1))])
+ ws2 = np.array([weight(D2[j,:]) for j in range(len(D2))])
+ if kernel_approx is not None:
+ approx1 = np.sum(np.multiply(ws1[:,np.newaxis], kernel_approx.transform(D1)), axis=0)
+ approx2 = np.sum(np.multiply(ws2[:,np.newaxis], kernel_approx.transform(D2)), axis=0)
+ return (1./(np.sqrt(2*np.pi)*bandwidth)) * np.matmul(approx1, approx2.T)
+ else:
+ W = np.matmul(ws1[:,np.newaxis], ws2[np.newaxis,:])
+ E = (1./(np.sqrt(2*np.pi)*bandwidth)) * np.exp(-np.square(pairwise_distances(D1,D2))/(2*bandwidth*bandwidth))
+ return np.sum(np.multiply(W, E))
+
+def persistence_scale_space_kernel(D1, D2, kernel_approx=None, bandwidth=1.):
+ """
+ This is a function for computing the persistence scale space kernel value from two persistence diagrams. The persistence scale space kernel is computed by adding the symmetric to the diagonal of each point in each persistence diagram, with negative weight, and then convolving the points with a Gaussian kernel. See https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Reininghaus_A_Stable_Multi-Scale_2015_CVPR_paper.pdf for more details.
+ :param D1: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points (i.e. with infinite coordinate).
+ :param D2: (m x 2) numpy.array encoding the second diagram.
+ :param bandwidth: bandwidth of the Gaussian kernel with which persistence diagrams will be convolved
+ :param kernel_approx: kernel approximation class used to speed up computation. Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
+ :returns: the persistence scale space kernel value between persistence diagrams.
+ :rtype: float
+ """
+ DD1 = np.concatenate([D1, D1[:,[1,0]]], axis=0)
+ DD2 = np.concatenate([D2, D2[:,[1,0]]], axis=0)
+ weight_pss = lambda x: 1 if x[1] >= x[0] else -1
+ return 0.5 * persistence_weighted_gaussian_kernel(DD1, DD2, weight=weight_pss, kernel_approx=kernel_approx, bandwidth=bandwidth)
+
+def pairwise_persistence_diagram_kernels(X, Y=None, metric="sliced_wasserstein", **kwargs):
+ """
+ This function computes the kernel matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2).
+ :param X: first list of persistence diagrams.
+ :param Y: second list of persistence diagrams (optional). If None, pairwise kernel values are computed from the first list only.
+ :param metric: kernel to use. It can be either a string ("sliced_wasserstein", "persistence_scale_space", "persistence_weighted_gaussian", "persistence_fisher") or a function taking two numpy arrays of shape (nx2) and (mx2) as inputs.
+ :returns: kernel matrix, i.e., numpy array of shape (num diagrams 1 x num diagrams 2)
+ :rtype: float
+ """
+ if Y is None:
+ YY = None
+ pX = Padding(use=True).fit_transform(X)
+ diag_len = len(pX[0])
+ XX = np.reshape(np.vstack(pX), [-1, diag_len*3])
+ else:
+ nX, nY = len(X), len(Y)
+ pD = Padding(use=True).fit_transform(X + Y)
+ diag_len = len(pD[0])
+ XX = np.reshape(np.vstack(pD[:nX]), [-1, diag_len*3])
+ YY = np.reshape(np.vstack(pD[nX:]), [-1, diag_len*3])
+
+ if metric == "sliced_wasserstein":
+ return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="sliced_wasserstein", num_directions=kwargs["num_directions"]) / kwargs["bandwidth"])
+ elif metric == "persistence_fisher":
+ return np.exp(-pairwise_persistence_diagram_distances(X, Y, metric="persistence_fisher", kernel_approx=kwargs["kernel_approx"], bandwidth=kwargs["bandwidth"]) / kwargs["bandwidth_fisher"])
+ elif metric == "persistence_scale_space":
+ return pairwise_kernels(XX, YY, metric=sklearn_wrapper(persistence_scale_space_kernel, **kwargs))
+ elif metric == "persistence_weighted_gaussian":
+ return pairwise_kernels(XX, YY, metric=sklearn_wrapper(persistence_weighted_gaussian_kernel, **kwargs))
+ else:
+ return pairwise_kernels(XX, YY, metric=sklearn_wrapper(metric, **kwargs))
+
class SlicedWassersteinKernel(BaseEstimator, TransformerMixin):
"""
This is a class for computing the sliced Wasserstein kernel matrix from a list of persistence diagrams. The sliced Wasserstein kernel is computed by exponentiating the corresponding sliced Wasserstein distance with a Gaussian kernel. See http://proceedings.mlr.press/v70/carriere17a.html for more details.
@@ -29,7 +99,7 @@ class SlicedWassersteinKernel(BaseEstimator, TransformerMixin):
num_directions (int): number of lines evenly sampled from [-pi/2,pi/2] in order to approximate and speed up the kernel computation (default 10).
"""
self.bandwidth = bandwidth
- self.sw_ = SlicedWassersteinDistance(num_directions=num_directions)
+ self.num_directions = num_directions
def fit(self, X, y=None):
"""
@@ -39,7 +109,7 @@ class SlicedWassersteinKernel(BaseEstimator, TransformerMixin):
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- self.sw_.fit(X, y)
+ self.diagrams_ = X
return self
def transform(self, X):
@@ -52,7 +122,7 @@ class SlicedWassersteinKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise sliced Wasserstein kernel values.
"""
- return np.exp(-self.sw_.transform(X)/self.bandwidth)
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, metric="sliced_wasserstein", bandwidth=self.bandwidth, num_directions=self.num_directions)
class PersistenceWeightedGaussianKernel(BaseEstimator, TransformerMixin):
"""
@@ -78,10 +148,7 @@ class PersistenceWeightedGaussianKernel(BaseEstimator, TransformerMixin):
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- self.diagrams_ = list(X)
- self.ws_ = [ np.array([self.weight(self.diagrams_[i][j,:]) for j in range(self.diagrams_[i].shape[0])]) for i in range(len(self.diagrams_)) ]
- if self.kernel_approx is not None:
- self.approx_ = np.concatenate([np.sum(np.multiply(self.ws_[i][:,np.newaxis], self.kernel_approx.transform(self.diagrams_[i])), axis=0)[np.newaxis,:] for i in range(len(self.diagrams_))])
+ self.diagrams_ = X
return self
def transform(self, X):
@@ -94,31 +161,7 @@ class PersistenceWeightedGaussianKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence weighted Gaussian kernel values.
"""
- Xp = list(X)
- Xfit = np.zeros((len(Xp), len(self.diagrams_)))
- if len(self.diagrams_) == len(Xp) and np.all([np.array_equal(self.diagrams_[i], Xp[i]) for i in range(len(Xp))]):
- if self.kernel_approx is not None:
- Xfit = (1./(np.sqrt(2*np.pi)*self.bandwidth)) * np.matmul(self.approx_, self.approx_.T)
- else:
- for i in range(len(self.diagrams_)):
- for j in range(i+1, len(self.diagrams_)):
- W = np.matmul(self.ws_[i][:,np.newaxis], self.ws_[j][np.newaxis,:])
- E = (1./(np.sqrt(2*np.pi)*self.bandwidth)) * np.exp(-np.square(pairwise_distances(self.diagrams_[i], self.diagrams_[j]))/(2*np.square(self.bandwidth)))
- Xfit[i,j] = np.sum(np.multiply(W, E))
- Xfit[j,i] = Xfit[i,j]
- else:
- ws = [ np.array([self.weight(Xp[i][j,:]) for j in range(Xp[i].shape[0])]) for i in range(len(Xp)) ]
- if self.kernel_approx is not None:
- approx = np.concatenate([np.sum(np.multiply(ws[i][:,np.newaxis], self.kernel_approx.transform(Xp[i])), axis=0)[np.newaxis,:] for i in range(len(Xp))])
- Xfit = (1./(np.sqrt(2*np.pi)*self.bandwidth)) * np.matmul(approx, self.approx_.T)
- else:
- for i in range(len(Xp)):
- for j in range(len(self.diagrams_)):
- W = np.matmul(ws[i][:,np.newaxis], self.ws_[j][np.newaxis,:])
- E = (1./(np.sqrt(2*np.pi)*self.bandwidth)) * np.exp(-np.square(pairwise_distances(Xp[i], self.diagrams_[j]))/(2*np.square(self.bandwidth)))
- Xfit[i,j] = np.sum(np.multiply(W, E))
-
- return Xfit
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, metric="persistence_weighted_gaussian", bandwidth=self.bandwidth, weight=self.weight, kernel_approx=self.kernel_approx)
class PersistenceScaleSpaceKernel(BaseEstimator, TransformerMixin):
"""
@@ -132,7 +175,7 @@ class PersistenceScaleSpaceKernel(BaseEstimator, TransformerMixin):
bandwidth (double): bandwidth of the Gaussian kernel with which persistence diagrams will be convolved (default 1.)
kernel_approx (class): kernel approximation class used to speed up computation (default None). Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
"""
- self.pwg_ = PersistenceWeightedGaussianKernel(bandwidth=bandwidth, weight=lambda x: 1 if x[1] >= x[0] else -1, kernel_approx=kernel_approx)
+ self.bandwidth, self.kernel_approx = bandwidth, kernel_approx
def fit(self, X, y=None):
"""
@@ -142,11 +185,7 @@ class PersistenceScaleSpaceKernel(BaseEstimator, TransformerMixin):
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- self.diagrams_ = list(X)
- for i in range(len(self.diagrams_)):
- op_D = self.diagrams_[i][:,[1,0]]
- self.diagrams_[i] = np.concatenate([self.diagrams_[i], op_D], axis=0)
- self.pwg_.fit(X)
+ self.diagrams_ = X
return self
def transform(self, X):
@@ -159,11 +198,7 @@ class PersistenceScaleSpaceKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence scale space kernel values.
"""
- Xp = list(X)
- for i in range(len(Xp)):
- op_X = Xp[i][:,[1,0]]
- Xp[i] = np.concatenate([Xp[i], op_X], axis=0)
- return self.pwg_.transform(Xp)
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, metric="persistence_scale_space", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx)
class PersistenceFisherKernel(BaseEstimator, TransformerMixin):
"""
@@ -179,7 +214,7 @@ class PersistenceFisherKernel(BaseEstimator, TransformerMixin):
kernel_approx (class): kernel approximation class used to speed up computation (default None). Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
"""
self.bandwidth = bandwidth
- self.pf_ = PersistenceFisherDistance(bandwidth=bandwidth_fisher, kernel_approx=kernel_approx)
+ self.bandwidth_fisher, self.kernel_approx = bandwidth_fisher, kernel_approx
def fit(self, X, y=None):
"""
@@ -189,7 +224,7 @@ class PersistenceFisherKernel(BaseEstimator, TransformerMixin):
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- self.pf_.fit(X, y)
+ self.diagrams_ = X
return self
def transform(self, X):
@@ -202,5 +237,5 @@ class PersistenceFisherKernel(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence Fisher kernel values.
"""
- return np.exp(-self.pf_.transform(X)/self.bandwidth)
+ return pairwise_persistence_diagram_kernels(X, self.diagrams_, metric="persistence_fisher", bandwidth=self.bandwidth, bandwidth_fisher=self.bandwidth_fisher, kernel_approx=self.kernel_approx)
diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py
index 290c1d07..cc788994 100644
--- a/src/python/gudhi/representations/metrics.py
+++ b/src/python/gudhi/representations/metrics.py
@@ -11,6 +11,8 @@ import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import pairwise_distances
from gudhi.wasserstein import wasserstein_distance
+from .preprocessing import Padding
+
try:
from .. import bottleneck_distance
USE_GUDHI = True
@@ -22,6 +24,108 @@ except ImportError:
# Metrics ###################################
#############################################
+def sliced_wasserstein_distance(D1, D2, num_directions):
+ """
+ This is a function for computing the sliced Wasserstein distance from two persistence diagrams. The Sliced Wasserstein distance is computed by projecting the persistence diagrams onto lines, comparing the projections with the 1-norm, and finally integrating over all possible lines. See http://proceedings.mlr.press/v70/carriere17a.html for more details.
+ :param D1: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points (i.e. with infinite coordinate).
+ :param D2: (m x 2) numpy.array encoding the second diagram.
+ :param num_directions: number of lines evenly sampled from [-pi/2,pi/2] in order to approximate and speed up the distance computation.
+ :returns: the sliced Wasserstein distance between persistence diagrams.
+ :rtype: float
+ """
+ thetas = np.linspace(-np.pi/2, np.pi/2, num=num_directions+1)[np.newaxis,:-1]
+ lines = np.concatenate([np.cos(thetas), np.sin(thetas)], axis=0)
+ approx1 = np.matmul(D1, lines)
+ diag_proj1 = (1./2) * np.ones((2,2))
+ approx_diag1 = np.matmul(np.matmul(D1, diag_proj1), lines)
+ approx2 = np.matmul(D2, lines)
+ diag_proj2 = (1./2) * np.ones((2,2))
+ approx_diag2 = np.matmul(np.matmul(D2, diag_proj2), lines)
+ A = np.sort(np.concatenate([approx1, approx_diag2], axis=0), axis=0)
+ B = np.sort(np.concatenate([approx2, approx_diag1], axis=0), axis=0)
+ L1 = np.sum(np.abs(A-B), axis=0)
+ return np.mean(L1)
+
+def persistence_fisher_distance(D1, D2, kernel_approx=None, bandwidth=1.):
+ """
+ This is a function for computing the persistence Fisher distance from two persistence diagrams. The persistence Fisher distance is obtained by computing the original Fisher distance between the probability distributions associated to the persistence diagrams given by convolving them with a Gaussian kernel. See http://papers.nips.cc/paper/8205-persistence-fisher-kernel-a-riemannian-manifold-kernel-for-persistence-diagrams for more details.
+ :param D1: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points (i.e. with infinite coordinate).
+ :param D2: (m x 2) numpy.array encoding the second diagram.
+ :param bandwidth: bandwidth of the Gaussian kernel used to turn persistence diagrams into probability distributions.
+ :param kernel_approx: kernel approximation class used to speed up computation. Common kernel approximations classes can be found in the scikit-learn library (such as RBFSampler for instance).
+ :returns: the persistence Fisher distance between persistence diagrams.
+ :rtype: float
+ """
+ projection = (1./2) * np.ones((2,2))
+ diagonal_projections1 = np.matmul(D1, projection)
+ diagonal_projections2 = np.matmul(D2, projection)
+ if kernel_approx is not None:
+ approx1 = kernel_approx.transform(D1)
+ approx_diagonal1 = kernel_approx.transform(diagonal_projections1)
+ approx2 = kernel_approx.transform(D2)
+ approx_diagonal2 = kernel_approx.transform(diagonal_projections2)
+ Z = np.concatenate([approx1, approx_diagonal1, approx2, approx_diagonal2], axis=0)
+ U, V = np.sum(np.concatenate([approx1, approx_diagonal2], axis=0), axis=0), np.sum(np.concatenate([approx2, approx_diagonal1], axis=0), axis=0)
+ vectori, vectorj = np.abs(np.matmul(Z, U.T)), np.abs(np.matmul(Z, V.T))
+ vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
+ if vectori_sum != 0:
+ vectori = vectori/vectori_sum
+ if vectorj_sum != 0:
+ vectorj = vectorj/vectorj_sum
+ return np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
+ else:
+ Z = np.concatenate([D1, diagonal_projections1, D2, diagonal_projections2], axis=0)
+ U, V = np.concatenate([D1, diagonal_projections2], axis=0), np.concatenate([D2, diagonal_projections1], axis=0)
+ vectori = np.sum(np.exp(-np.square(pairwise_distances(Z,U))/(2 * np.square(bandwidth)))/(bandwidth * np.sqrt(2*np.pi)), axis=1)
+ vectorj = np.sum(np.exp(-np.square(pairwise_distances(Z,V))/(2 * np.square(bandwidth)))/(bandwidth * np.sqrt(2*np.pi)), axis=1)
+ vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
+ if vectori_sum != 0:
+ vectori = vectori/vectori_sum
+ if vectorj_sum != 0:
+ vectorj = vectorj/vectorj_sum
+ return np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
+
+def sklearn_wrapper(metric, **kwargs):
+ """
+ This function is a wrapper for any metric between two persistence diagrams that takes two numpy arrays of shapes (nx2) and (mx2) as arguments. It turns the metric into another that takes flattened and padded diagrams as inputs.
+ """
+ def flat_metric(D1, D2):
+ DD1, DD2 = np.reshape(D1, [-1,3]), np.reshape(D2, [-1,3])
+ return metric(DD1[DD1[:,2]==1,0:2], DD2[DD2[:,2]==1,0:2], **kwargs)
+ return flat_metric
+
+def pairwise_persistence_diagram_distances(X, Y=None, metric="bottleneck", **kwargs):
+ """
+ This function computes the distance matrix between two lists of persistence diagrams given as numpy arrays of shape (nx2).
+ :param X: first list of persistence diagrams.
+ :param Y: second list of persistence diagrams (optional). If None, pairwise distances are computed from the first list only.
+ :param metric: distance to use. It can be either a string ("sliced_wasserstein", "wasserstein", "bottleneck", "persistence_fisher") or a function taking two numpy arrays of shape (nx2) and (mx2) as inputs.
+ :returns: distance matrix, i.e., numpy array of shape (num diagrams 1 x num diagrams 2)
+ :rtype: float
+ """
+ if Y is None:
+ YY = None
+ pX = Padding(use=True).fit_transform(X)
+ diag_len = len(pX[0])
+ XX = np.reshape(np.vstack(pX), [-1, diag_len*3])
+ else:
+ nX, nY = len(X), len(Y)
+ pD = Padding(use=True).fit_transform(X + Y)
+ diag_len = len(pD[0])
+ XX = np.reshape(np.vstack(pD[:nX]), [-1, diag_len*3])
+ YY = np.reshape(np.vstack(pD[nX:]), [-1, diag_len*3])
+
+ if metric == "bottleneck":
+ return pairwise_distances(XX, YY, metric=sklearn_wrapper(bottleneck_distance, **kwargs))
+ elif metric == "wasserstein":
+ return pairwise_distances(XX, YY, metric=sklearn_wrapper(wasserstein_distance, **kwargs))
+ elif metric == "sliced_wasserstein":
+ return pairwise_distances(XX, YY, metric=sklearn_wrapper(sliced_wasserstein_distance, **kwargs))
+ elif metric == "persistence_fisher":
+ return pairwise_distances(XX, YY, metric=sklearn_wrapper(persistence_fisher_distance, **kwargs))
+ else:
+ return pairwise_distances(XX, YY, metric=sklearn_wrapper(metric, **kwargs))
+
class SlicedWassersteinDistance(BaseEstimator, TransformerMixin):
"""
This is a class for computing the sliced Wasserstein distance matrix from a list of persistence diagrams. The Sliced Wasserstein distance is computed by projecting the persistence diagrams onto lines, comparing the projections with the 1-norm, and finally integrating over all possible lines. See http://proceedings.mlr.press/v70/carriere17a.html for more details.
@@ -34,8 +138,6 @@ class SlicedWassersteinDistance(BaseEstimator, TransformerMixin):
num_directions (int): number of lines evenly sampled from [-pi/2,pi/2] in order to approximate and speed up the distance computation (default 10).
"""
self.num_directions = num_directions
- thetas = np.linspace(-np.pi/2, np.pi/2, num=self.num_directions+1)[np.newaxis,:-1]
- self.lines_ = np.concatenate([np.cos(thetas), np.sin(thetas)], axis=0)
def fit(self, X, y=None):
"""
@@ -46,9 +148,6 @@ class SlicedWassersteinDistance(BaseEstimator, TransformerMixin):
y (n x 1 array): persistence diagram labels (unused).
"""
self.diagrams_ = X
- self.approx_ = [np.matmul(X[i], self.lines_) for i in range(len(X))]
- diag_proj = (1./2) * np.ones((2,2))
- self.approx_diag_ = [np.matmul(np.matmul(X[i], diag_proj), self.lines_) for i in range(len(X))]
return self
def transform(self, X):
@@ -61,27 +160,7 @@ class SlicedWassersteinDistance(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise sliced Wasserstein distances.
"""
- Xfit = np.zeros((len(X), len(self.approx_)))
- if len(self.diagrams_) == len(X) and np.all([np.array_equal(self.diagrams_[i], X[i]) for i in range(len(X))]):
- for i in range(len(self.approx_)):
- for j in range(i+1, len(self.approx_)):
- A = np.sort(np.concatenate([self.approx_[i], self.approx_diag_[j]], axis=0), axis=0)
- B = np.sort(np.concatenate([self.approx_[j], self.approx_diag_[i]], axis=0), axis=0)
- L1 = np.sum(np.abs(A-B), axis=0)
- Xfit[i,j] = np.mean(L1)
- Xfit[j,i] = Xfit[i,j]
- else:
- diag_proj = (1./2) * np.ones((2,2))
- approx = [np.matmul(X[i], self.lines_) for i in range(len(X))]
- approx_diag = [np.matmul(np.matmul(X[i], diag_proj), self.lines_) for i in range(len(X))]
- for i in range(len(approx)):
- for j in range(len(self.approx_)):
- A = np.sort(np.concatenate([approx[i], self.approx_diag_[j]], axis=0), axis=0)
- B = np.sort(np.concatenate([self.approx_[j], approx_diag[i]], axis=0), axis=0)
- L1 = np.sum(np.abs(A-B), axis=0)
- Xfit[i,j] = np.mean(L1)
-
- return Xfit
+ return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="sliced_wasserstein", num_directions=self.num_directions)
class BottleneckDistance(BaseEstimator, TransformerMixin):
"""
@@ -117,33 +196,9 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise bottleneck distances.
"""
- num_diag1 = len(X)
-
- #if len(self.diagrams_) == len(X) and np.all([np.array_equal(self.diagrams_[i], X[i]) for i in range(len(X))]):
- if X is self.diagrams_:
- matrix = np.zeros((num_diag1, num_diag1))
-
- if USE_GUDHI:
- for i in range(num_diag1):
- for j in range(i+1, num_diag1):
- matrix[i,j] = bottleneck_distance(X[i], X[j], self.epsilon)
- matrix[j,i] = matrix[i,j]
- else:
- print("Gudhi built without CGAL: returning a null matrix")
-
- else:
- num_diag2 = len(self.diagrams_)
- matrix = np.zeros((num_diag1, num_diag2))
-
- if USE_GUDHI:
- for i in range(num_diag1):
- for j in range(num_diag2):
- matrix[i,j] = bottleneck_distance(X[i], self.diagrams_[j], self.epsilon)
- else:
- print("Gudhi built without CGAL: returning a null matrix")
-
- Xfit = matrix
-
+ if not USE_GUDHI:
+ print("Gudhi built without CGAL: returning a null matrix")
+ Xfit = pairwise_persistence_diagram_distances(X, self.diagrams_, metric="bottleneck", e=self.epsilon) if USE_GUDHI else np.zeros((len(X), len(self.diagrams_)))
return Xfit
class WassersteinDistance(BaseEstimator, TransformerMixin):
@@ -181,28 +236,7 @@ class WassersteinDistance(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise Wasserstein distances.
"""
- num_diag1 = len(X)
-
- #if len(self.diagrams_) == len(X) and np.all([np.array_equal(self.diagrams_[i], X[i]) for i in range(len(X))]):
- if X is self.diagrams_:
- matrix = np.zeros((num_diag1, num_diag1))
-
- for i in range(num_diag1):
- for j in range(i+1, num_diag1):
- matrix[i,j] = wasserstein_distance(X[i], X[j], self.order, self.internal_p)
- matrix[j,i] = matrix[i,j]
-
- else:
- num_diag2 = len(self.diagrams_)
- matrix = np.zeros((num_diag1, num_diag2))
-
- for i in range(num_diag1):
- for j in range(num_diag2):
- matrix[i,j] = wasserstein_distance(X[i], self.diagrams_[j], self.order, self.internal_p)
-
- Xfit = matrix
-
- return Xfit
+ return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="wasserstein", order=self.order, internal_p=self.internal_p)
class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
"""
@@ -227,11 +261,6 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
y (n x 1 array): persistence diagram labels (unused).
"""
self.diagrams_ = X
- projection = (1./2) * np.ones((2,2))
- self.diagonal_projections_ = [np.matmul(X[i], projection) for i in range(len(X))]
- if self.kernel_approx is not None:
- self.approx_ = [self.kernel_approx.transform(X[i]) for i in range(len(X))]
- self.approx_diagonal_ = [self.kernel_approx.transform(self.diagonal_projections_[i]) for i in range(len(X))]
return self
def transform(self, X):
@@ -244,60 +273,4 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
Returns:
numpy array of shape (number of diagrams in **diagrams**) x (number of diagrams in X): matrix of pairwise persistence Fisher distances.
"""
- Xfit = np.zeros((len(X), len(self.diagrams_)))
- if len(self.diagrams_) == len(X) and np.all([np.array_equal(self.diagrams_[i], X[i]) for i in range(len(X))]):
- for i in range(len(self.diagrams_)):
- for j in range(i+1, len(self.diagrams_)):
- if self.kernel_approx is not None:
- Z = np.concatenate([self.approx_[i], self.approx_diagonal_[i], self.approx_[j], self.approx_diagonal_[j]], axis=0)
- U, V = np.sum(np.concatenate([self.approx_[i], self.approx_diagonal_[j]], axis=0), axis=0), np.sum(np.concatenate([self.approx_[j], self.approx_diagonal_[i]], axis=0), axis=0)
- vectori, vectorj = np.abs(np.matmul(Z, U.T)), np.abs(np.matmul(Z, V.T))
- vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
- if vectori_sum != 0:
- vectori = vectori/vectori_sum
- if vectorj_sum != 0:
- vectorj = vectorj/vectorj_sum
- Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
- Xfit[j,i] = Xfit[i,j]
- else:
- Z = np.concatenate([self.diagrams_[i], self.diagonal_projections_[i], self.diagrams_[j], self.diagonal_projections_[j]], axis=0)
- U, V = np.concatenate([self.diagrams_[i], self.diagonal_projections_[j]], axis=0), np.concatenate([self.diagrams_[j], self.diagonal_projections_[i]], axis=0)
- vectori = np.sum(np.exp(-np.square(pairwise_distances(Z,U))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
- vectorj = np.sum(np.exp(-np.square(pairwise_distances(Z,V))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
- vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
- if vectori_sum != 0:
- vectori = vectori/vectori_sum
- if vectorj_sum != 0:
- vectorj = vectorj/vectorj_sum
- Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
- Xfit[j,i] = Xfit[i,j]
- else:
- projection = (1./2) * np.ones((2,2))
- diagonal_projections = [np.matmul(X[i], projection) for i in range(len(X))]
- if self.kernel_approx is not None:
- approx = [self.kernel_approx.transform(X[i]) for i in range(len(X))]
- approx_diagonal = [self.kernel_approx.transform(diagonal_projections[i]) for i in range(len(X))]
- for i in range(len(X)):
- for j in range(len(self.diagrams_)):
- if self.kernel_approx is not None:
- Z = np.concatenate([approx[i], approx_diagonal[i], self.approx_[j], self.approx_diagonal_[j]], axis=0)
- U, V = np.sum(np.concatenate([approx[i], self.approx_diagonal_[j]], axis=0), axis=0), np.sum(np.concatenate([self.approx_[j], approx_diagonal[i]], axis=0), axis=0)
- vectori, vectorj = np.abs(np.matmul(Z, U.T)), np.abs(np.matmul(Z, V.T))
- vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
- if vectori_sum != 0:
- vectori = vectori/vectori_sum
- if vectorj_sum != 0:
- vectorj = vectorj/vectorj_sum
- Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
- else:
- Z = np.concatenate([X[i], diagonal_projections[i], self.diagrams_[j], self.diagonal_projections_[j]], axis=0)
- U, V = np.concatenate([X[i], self.diagonal_projections_[j]], axis=0), np.concatenate([self.diagrams_[j], diagonal_projections[i]], axis=0)
- vectori = np.sum(np.exp(-np.square(pairwise_distances(Z,U))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
- vectorj = np.sum(np.exp(-np.square(pairwise_distances(Z,V))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
- vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
- if vectori_sum != 0:
- vectori = vectori/vectori_sum
- if vectorj_sum != 0:
- vectorj = vectorj/vectorj_sum
- Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
- return Xfit
+ return pairwise_persistence_diagram_distances(X, self.diagrams_, metric="persistence_fisher", bandwidth=self.bandwidth, kernel_approx=self.kernel_approx)