From 7ea7538d02b8f0500dbc31f48dd31fb14d320135 Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Wed, 20 Nov 2019 20:30:36 +0100
Subject: Minor tweaks to representations

uniform -> even
matmul -> indexing
min+max -> clip
---
 src/python/gudhi/representations/kernel_methods.py | 2 +-
 src/python/gudhi/representations/preprocessing.py | 2 +-
 src/python/gudhi/representations/vector_methods.py | 34 +++++++++++-----------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/python/gudhi/representations/kernel_methods.py b/src/python/gudhi/representations/kernel_methods.py
index c855d2be..bfc83aff 100644
--- a/src/python/gudhi/representations/kernel_methods.py
+++ b/src/python/gudhi/representations/kernel_methods.py
@@ -161,7 +161,7 @@ class PersistenceScaleSpaceKernel(BaseEstimator, TransformerMixin):
         """
         Xp = list(X)
         for i in range(len(Xp)):
-            op_X = np.matmul(Xp[i], np.array([[0.,1.], [1.,0.]]))
+            op_X = Xp[i][:,[1,0]]
             Xp[i] = np.concatenate([Xp[i], op_X], axis=0)
         return self.pwg_.transform(Xp)
 
diff --git a/src/python/gudhi/representations/preprocessing.py b/src/python/gudhi/representations/preprocessing.py
index 83227ca1..487b5376 100644
--- a/src/python/gudhi/representations/preprocessing.py
+++ b/src/python/gudhi/representations/preprocessing.py
@@ -30,7 +30,7 @@ class BirthPersistenceTransform(BaseEstimator, TransformerMixin):
         Fit the BirthPersistenceTransform class on a list of persistence diagrams (this function actually does nothing but is useful when BirthPersistenceTransform is included in a scikit-learn Pipeline).
 
         Parameters:
-            X (n x 2 numpy array): input persistence diagrams.
+            X (list of n x 2 numpy array): input persistence diagrams.
             y (n x 1 array): persistence diagram labels (unused).
         """
         return self
diff --git a/src/python/gudhi/representations/vector_methods.py b/src/python/gudhi/representations/vector_methods.py
index bf32f18e..61c4fb84 100644
--- a/src/python/gudhi/representations/vector_methods.py
+++ b/src/python/gudhi/representations/vector_methods.py
@@ -83,7 +83,7 @@ class PersistenceImage(BaseEstimator, TransformerMixin):
 
 class Landscape(BaseEstimator, TransformerMixin):
     """
-    This is a class for computing persistence landscapes from a list of persistence diagrams. A persistence landscape is a collection of 1D piecewise-linear functions computed from the rank function associated to the persistence diagram. These piecewise-linear functions are then sampled uniformly on a given range and the corresponding vectors of samples are concatenated and returned. See http://jmlr.org/papers/v16/bubenik15a.html for more details.
+    This is a class for computing persistence landscapes from a list of persistence diagrams. A persistence landscape is a collection of 1D piecewise-linear functions computed from the rank function associated to the persistence diagram. These piecewise-linear functions are then sampled evenly on a given range and the corresponding vectors of samples are concatenated and returned. See http://jmlr.org/papers/v16/bubenik15a.html for more details.
     """
     def __init__(self, num_landscapes=5, resolution=100, sample_range=[np.nan, np.nan]):
         """
@@ -92,7 +92,7 @@ class Landscape(BaseEstimator, TransformerMixin):
         Constructor for the Landscape class.
 
         Parameters:
             num_landscapes (int): number of piecewise-linear functions to output (default 5).
             resolution (int): number of sample for all piecewise-linear functions (default 100).
-            sample_range ([double, double]): minimum and maximum of all piecewise-linear function domains, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
+            sample_range ([double, double]): minimum and maximum of all piecewise-linear function domains, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn evenly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
         """
         self.num_landscapes, self.resolution, self.sample_range = num_landscapes, resolution, sample_range
 
@@ -136,9 +136,9 @@ class Landscape(BaseEstimator, TransformerMixin):
 
             for j in range(num_pts_in_diag):
                 [px,py] = diagram[j,:2]
-                min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
-                mid_idx = np.minimum(np.maximum(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
-                max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+                min_idx = np.clip(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
+                mid_idx = np.clip(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
+                max_idx = np.clip(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
 
                 if min_idx < self.resolution and max_idx > 0:
 
@@ -165,7 +165,7 @@ class Landscape(BaseEstimator, TransformerMixin):
 
 class Silhouette(BaseEstimator, TransformerMixin):
     """
-    This is a class for computing persistence silhouettes from a list of persistence diagrams. A persistence silhouette is computed by taking a weighted average of the collection of 1D piecewise-linear functions given by the persistence landscapes, and then by uniformly sampling this average on a given range. Finally, the corresponding vector of samples is returned. See https://arxiv.org/abs/1312.0308 for more details.
+    This is a class for computing persistence silhouettes from a list of persistence diagrams. A persistence silhouette is computed by taking a weighted average of the collection of 1D piecewise-linear functions given by the persistence landscapes, and then by evenly sampling this average on a given range. Finally, the corresponding vector of samples is returned. See https://arxiv.org/abs/1312.0308 for more details.
     """
     def __init__(self, weight=lambda x: 1, resolution=100, sample_range=[np.nan, np.nan]):
         """
@@ -174,7 +174,7 @@ class Silhouette(BaseEstimator, TransformerMixin):
         Constructor for the Silhouette class.
 
         Parameters:
             weight (function): weight function for the persistence diagram points (default constant function, ie lambda x: 1). This function must be defined on 2D points, ie on lists or numpy arrays of the form [p_x,p_y].
             resolution (int): number of samples for the weighted average (default 100).
-            sample_range ([double, double]): minimum and maximum for the weighted average domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
+            sample_range ([double, double]): minimum and maximum for the weighted average domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn evenly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
         """
         self.weight, self.resolution, self.sample_range = weight, resolution, sample_range
 
@@ -219,9 +219,9 @@ class Silhouette(BaseEstimator, TransformerMixin):
                 [px,py] = diagram[j,:2]
                 weight = weights[j] / total_weight
-                min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
-                mid_idx = np.minimum(np.maximum(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
-                max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+                min_idx = np.clip(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
+                mid_idx = np.clip(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
+                max_idx = np.clip(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
 
                 if min_idx < self.resolution and max_idx > 0:
 
@@ -243,7 +243,7 @@ class Silhouette(BaseEstimator, TransformerMixin):
 
 class BettiCurve(BaseEstimator, TransformerMixin):
     """
-    This is a class for computing Betti curves from a list of persistence diagrams. A Betti curve is a 1D piecewise-constant function obtained from the rank function. It is sampled uniformly on a given range and the vector of samples is returned. See https://www.researchgate.net/publication/316604237_Time_Series_Classification_via_Topological_Data_Analysis for more details.
+    This is a class for computing Betti curves from a list of persistence diagrams. A Betti curve is a 1D piecewise-constant function obtained from the rank function. It is sampled evenly on a given range and the vector of samples is returned. See https://www.researchgate.net/publication/316604237_Time_Series_Classification_via_Topological_Data_Analysis for more details.
     """
     def __init__(self, resolution=100, sample_range=[np.nan, np.nan]):
         """
@@ -251,7 +251,7 @@ class BettiCurve(BaseEstimator, TransformerMixin):
 
         Parameters:
             resolution (int): number of sample for the piecewise-constant function (default 100).
-            sample_range ([double, double]): minimum and maximum of the piecewise-constant function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
+            sample_range ([double, double]): minimum and maximum of the piecewise-constant function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn evenly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
         """
         self.resolution, self.sample_range = resolution, sample_range
 
@@ -290,8 +290,8 @@ class BettiCurve(BaseEstimator, TransformerMixin):
             bc = np.zeros(self.resolution)
             for j in range(num_pts_in_diag):
                 [px,py] = diagram[j,:2]
-                min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
-                max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+                min_idx = np.clip(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
+                max_idx = np.clip(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
                 for k in range(min_idx, max_idx):
                     bc[k] += 1
 
@@ -313,7 +313,7 @@ class Entropy(BaseEstimator, TransformerMixin):
            mode (string): what entropy to compute: either "scalar" for computing the entropy statistics, or "vector" for computing the entropy summary functions (default "scalar").
            normalized (bool): whether to normalize the entropy summary function (default True). Used only if **mode** = "vector".
            resolution (int): number of sample for the entropy summary function (default 100). Used only if **mode** = "vector".
-           sample_range ([double, double]): minimum and maximum of the entropy summary function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method. Used only if **mode** = "vector".
+           sample_range ([double, double]): minimum and maximum of the entropy summary function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn evenly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method. Used only if **mode** = "vector".
         """
         self.mode, self.normalized, self.resolution, self.sample_range = mode, normalized, resolution, sample_range
 
@@ -359,8 +359,8 @@ class Entropy(BaseEstimator, TransformerMixin):
             ent = np.zeros(self.resolution)
             for j in range(num_pts_in_diag):
                 [px,py] = orig_diagram[j,:2]
-                min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
-                max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+                min_idx = np.clip(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
+                max_idx = np.clip(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
                 for k in range(min_idx, max_idx):
                     ent[k] += (-1) * new_diagram[j,1] * np.log(new_diagram[j,1])
             if self.normalized:
--
cgit v1.2.3


From 96c90dd002e7d4f30ce3d0f3ddf45fbfd03fc01a Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Wed, 20 Nov 2019 22:41:54 +0100
Subject: More tweaks in representations

wording for the case where CGAL is missing
comparing pointers is sufficient to detect if we are doing fit_transform
epsilon is an additive error for bottleneck, so it is unsafe to use 1e-3 or 1e-4 as default
---
 src/python/gudhi/representations/metrics.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py
index c512cb82..f9f59053 100644
--- a/src/python/gudhi/representations/metrics.py
+++ b/src/python/gudhi/representations/metrics.py
@@ -86,12 +86,12 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
     """
     This is a class for computing the bottleneck distance matrix from a list of persistence diagrams.
     """
-    def __init__(self, epsilon=1e-3):
+    def __init__(self, epsilon=None):
         """
         Constructor for the BottleneckDistance class.
 
         Parameters:
-            epsilon (double): approximation quality (default 1e-4).
+            epsilon (double): absolute (additive) error tolerated on the distance (default is the smallest positive float), see :func:`bottleneck_distance`.
         """
         self.epsilon = epsilon
 
@@ -118,7 +118,8 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
         """
         num_diag1 = len(X)
-        if len(self.diagrams_) == len(X) and np.all([np.array_equal(self.diagrams_[i], X[i]) for i in range(len(X))]):
+        #if len(self.diagrams_) == len(X) and np.all([np.array_equal(self.diagrams_[i], X[i]) for i in range(len(X))]):
+        if X is self.diagrams_:
 
             matrix = np.zeros((num_diag1, num_diag1))
 
             if USE_GUDHI:
@@ -127,7 +128,7 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
                         matrix[i,j] = bottleneck_distance(X[i], X[j], self.epsilon)
                         matrix[j,i] = matrix[i,j]
             else:
-                print("Gudhi required---returning null matrix")
+                print("Gudhi built without CGAL: returning a null matrix")
 
         else:
             num_diag2 = len(self.diagrams_)
@@ -138,7 +139,7 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
                     for j in range(num_diag2):
                         matrix[i,j] = bottleneck_distance(X[i], self.diagrams_[j], self.epsilon)
             else:
-                print("Gudhi required---returning null matrix")
+                print("Gudhi built without CGAL: returning a null matrix")
 
         Xfit = matrix
--
cgit v1.2.3


From 84299342910d18aeaa1a605fe224bede13d647dc Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Thu, 21 Nov 2019 22:50:05 +0100
Subject: Generalize Clamping to min and max

Fix #146.
---
 src/python/example/diagram_vectorizations_distances_kernels.py | 2 +-
 src/python/gudhi/representations/preprocessing.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/python/example/diagram_vectorizations_distances_kernels.py b/src/python/example/diagram_vectorizations_distances_kernels.py
index a77bbfdd..119072eb 100755
--- a/src/python/example/diagram_vectorizations_distances_kernels.py
+++ b/src/python/example/diagram_vectorizations_distances_kernels.py
@@ -16,7 +16,7 @@ diags = [D]
 
 diags = DiagramSelector(use=True, point_type="finite").fit_transform(diags)
 diags = DiagramScaler(use=True, scalers=[([0,1], MinMaxScaler())]).fit_transform(diags)
-diags = DiagramScaler(use=True, scalers=[([1], Clamping(limit=.9))]).fit_transform(diags)
+diags = DiagramScaler(use=True, scalers=[([1], Clamping(maximum=.9))]).fit_transform(diags)
 
 D = diags[0]
 plt.scatter(D[:,0],D[:,1])
diff --git a/src/python/gudhi/representations/preprocessing.py b/src/python/gudhi/representations/preprocessing.py
index 487b5376..a39b00e4 100644
--- a/src/python/gudhi/representations/preprocessing.py
+++ b/src/python/gudhi/representations/preprocessing.py
@@ -58,14 +58,15 @@ class Clamping(BaseEstimator, TransformerMixin):
     """
     This is a class for clamping values. It can be used as a parameter for the DiagramScaler class, for instance if you want to clamp abscissae or ordinates of persistence diagrams.
     """
-    def __init__(self, limit=np.inf):
+    def __init__(self, minimum=-np.inf, maximum=np.inf):
         """
         Constructor for the Clamping class.
 
         Parameters:
             limit (double): clamping value (default np.inf).
         """
-        self.limit = limit
+        self.minimum = minimum
+        self.maximum = maximum
 
     def fit(self, X, y=None):
         """
@@ -87,8 +88,7 @@ class Clamping(BaseEstimator, TransformerMixin):
         Returns:
             numpy array of size n: output list of values.
         """
-        Xfit = np.minimum(X, self.limit)
-        #Xfit = np.where(X >= self.limit, self.limit * np.ones(X.shape), X)
+        Xfit = np.clip(X, self.minimum, self.maximum)
         return Xfit
 
 class DiagramScaler(BaseEstimator, TransformerMixin):
--
cgit v1.2.3


From da22cbc891b2b7a9b326e3840533b4d3b49a26d3 Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Fri, 22 Nov 2019 11:50:34 +0100
Subject: Fix link to bottleneck_distance.

---
 src/python/gudhi/representations/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/gudhi/representations/metrics.py b/src/python/gudhi/representations/metrics.py
index f9f59053..5f9ec6ab 100644
--- a/src/python/gudhi/representations/metrics.py
+++ b/src/python/gudhi/representations/metrics.py
@@ -91,7 +91,7 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
         Constructor for the BottleneckDistance class.
 
         Parameters:
-            epsilon (double): absolute (additive) error tolerated on the distance (default is the smallest positive float), see :func:`bottleneck_distance`.
+            epsilon (double): absolute (additive) error tolerated on the distance (default is the smallest positive float), see :func:`gudhi.bottleneck_distance`.
         """
         self.epsilon = epsilon
--
cgit v1.2.3


From 177e80b653d60119acb4455feaba02615083532b Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Tue, 26 Nov 2019 17:51:50 +0100
Subject: Fix link.
---
 src/python/doc/representations.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/doc/representations.rst b/src/python/doc/representations.rst
index a137a035..c870f834 100644
--- a/src/python/doc/representations.rst
+++ b/src/python/doc/representations.rst
@@ -10,7 +10,7 @@ Representations manual
 This module, originally named sklearn_tda, aims at bridging the gap between persistence diagrams and machine learning tools, in particular scikit-learn. It provides tools, using the scikit-learn standard interface, to compute distances and kernels on diagrams, and to convert diagrams into vectors.
 
-A diagram is represented as a numpy array of shape (n,2), as can be obtained from `SimplexTree.persistence_intervals_in_dimension` for instance. Points at infinity are represented as a numpy array of shape (n,1), storing only the birth time.
+A diagram is represented as a numpy array of shape (n,2), as can be obtained from :func:`gudhi.SimplexTree.persistence_intervals_in_dimension` for instance. Points at infinity are represented as a numpy array of shape (n,1), storing only the birth time.
 
 A small example is provided
--
cgit v1.2.3
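As a closing illustration (not part of any patch above), a minimal self-contained check of the two rewrites from the first commit, using only numpy; the diagram values and the resolution/step_x/x_min parameters are made up for the example: swapping columns by fancy indexing matches multiplication by the permutation matrix, and numpy.clip matches the nested numpy.minimum/numpy.maximum composition.

    import numpy as np

    # Toy persistence diagram of (birth, death) pairs -- made-up values.
    dgm = np.array([[0., 4.], [1., 2.], [3., 8.]])

    # matmul -> indexing: both swap the two columns of the diagram.
    assert np.array_equal(np.matmul(dgm, np.array([[0., 1.], [1., 0.]])), dgm[:, [1, 0]])

    # min+max -> clip: both clamp the sample indices into [0, resolution].
    resolution, step_x, x_min = 100, 0.05, 0.
    idx_old = np.minimum(np.maximum(np.ceil((dgm[:, 0] - x_min) / step_x).astype(int), 0), resolution)
    idx_new = np.clip(np.ceil((dgm[:, 0] - x_min) / step_x).astype(int), 0, resolution)
    assert np.array_equal(idx_old, idx_new)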