Merge remote-tracking branch 'upstream/master' into fetch_datasets

author: Hind-M <hind.montassif@gmail.com> 2022-01-31 11:15:54 +0100
committer: Hind-M <hind.montassif@gmail.com> 2022-01-31 11:15:54 +0100
commit: 1209db091a89ed48527c34fff0cc9ef41c78d11f (patch)
tree: e34a11454579e27ad3c711b50a46c5c9670a3e54 /src/python/gudhi
parent: 8d1e7aeb3416194d00f45587d1ecea85ba218028 (diff)
parent: 7f1b8eb706c72921141b53e607d6e2aa28e2bf19 (diff)
2 files changed, 162 insertions, 59 deletions
diff --git a/src/python/gudhi/alpha_complex.pyx b/src/python/gudhi/alpha_complex.pyx
index ea128743..a4888914 100644
--- a/src/python/gudhi/alpha_complex.pyx
+++ b/src/python/gudhi/alpha_complex.pyx
@@ -16,7 +16,7 @@ from libcpp.utility cimport pair
 from libcpp.string cimport string
 from libcpp cimport bool
 from libc.stdint cimport intptr_t
-import os
+import warnings
 
 from gudhi.simplex_tree cimport *
 from gudhi.simplex_tree import SimplexTree
@@ -28,66 +28,72 @@ __license__ = "GPL v3"
 
 cdef extern from "Alpha_complex_interface.h" namespace "Gudhi":
     cdef cppclass Alpha_complex_interface "Gudhi::alpha_complex::Alpha_complex_interface":
-        Alpha_complex_interface(vector[vector[double]] points, bool fast_version, bool exact_version) nogil except +
+        Alpha_complex_interface(vector[vector[double]] points, vector[double] weights, bool fast_version, bool exact_version) nogil except +
         vector[double] get_point(int vertex) nogil except +
         void create_simplex_tree(Simplex_tree_interface_full_featured* simplex_tree, double max_alpha_square, bool default_filtration_value) nogil except +
 
 # AlphaComplex python interface
 cdef class AlphaComplex:
-    """AlphaComplex is a simplicial complex constructed from the finite cells
-    of a Delaunay Triangulation.
+    """AlphaComplex is a simplicial complex constructed from the finite cells of a Delaunay Triangulation.
 
-    The filtration value of each simplex is computed as the square of the
-    circumradius of the simplex if the circumsphere is empty (the simplex is
-    then said to be Gabriel), and as the minimum of the filtration values of
-    the codimension 1 cofaces that make it not Gabriel otherwise.
+    The filtration value of each simplex is computed as the square of the circumradius of the simplex if the
+    circumsphere is empty (the simplex is then said to be Gabriel), and as the minimum of the filtration values of the
+    codimension 1 cofaces that make it not Gabriel otherwise.
 
-    All simplices that have a filtration value strictly greater than a given
-    alpha squared value are not inserted into the complex.
+    All simplices that have a filtration value strictly greater than a given alpha squared value are not inserted into
+    the complex.
 
     .. note::
 
-        When Alpha_complex is constructed with an infinite value of alpha, the
-        complex is a Delaunay complex.
-
+        When Alpha_complex is constructed with an infinite value of alpha, the complex is a Delaunay complex.
     """
 
     cdef Alpha_complex_interface * this_ptr
 
     # Fake constructor that does nothing but documenting the constructor
-    def __init__(self, points=None, off_file='', precision='safe'):
+    def __init__(self, points=[], off_file='', weights=None, precision='safe'):
         """AlphaComplex constructor.
 
         :param points: A list of points in d-Dimension.
-        :type points: list of list of double
-
-        Or
+        :type points: Iterable[Iterable[float]]
 
-        :param off_file: An OFF file style name.
+        :param off_file: **[deprecated]** An `OFF file style <fileformats.html#off-file-format>`_ name.
+            If an `off_file` is given with `points` as arguments, only points from the file are taken into account.
         :type off_file: string
 
+        :param weights: A list of weights. If set, the number of weights must correspond to the number of points.
+        :type weights: Iterable[float]
+
         :param precision: Alpha complex precision can be 'fast', 'safe' or 'exact'. Default is 'safe'.
         :type precision: string
+
+        :raises FileNotFoundError: **[deprecated]** If `off_file` is set but not found.
+        :raises ValueError: In case of inconsistency between the number of points and weights.
         """
 
     # The real cython constructor
-    def __cinit__(self, points = None, off_file = '', precision = 'safe'):
+    def __cinit__(self, points = [], off_file = '', weights=None, precision = 'safe'):
         assert precision in ['fast', 'safe', 'exact'], "Alpha complex precision can only be 'fast', 'safe' or 'exact'"
         cdef bool fast = precision == 'fast'
         cdef bool exact = precision == 'exact'
 
-        cdef vector[vector[double]] pts
         if off_file:
-            if os.path.isfile(off_file):
-                points = read_points_from_off_file(off_file = off_file)
-            else:
-                print("file " + off_file + " not found.")
-        if points is None:
-            # Empty Alpha construction
-            points=[]
+            warnings.warn("off_file is a deprecated parameter, please consider using gudhi.read_points_from_off_file",
+                          DeprecationWarning)
+            points = read_points_from_off_file(off_file = off_file)
+
+        # weights are set but is inconsistent with the number of points
+        if weights != None and len(weights) != len(points):
+            raise ValueError("Inconsistency between the number of points and weights")
+
+        # need to copy the points to use them without the gil
+        cdef vector[vector[double]] pts
+        cdef vector[double] wgts
         pts = points
+        if weights != None:
+            wgts = weights
         with nogil:
-            self.this_ptr = new Alpha_complex_interface(pts, fast, exact)
+            self.this_ptr = new Alpha_complex_interface(pts, wgts, fast, exact)
 
     def __dealloc__(self):
         if self.this_ptr != NULL:
diff --git a/src/python/gudhi/representations/vector_methods.py b/src/python/gudhi/representations/vector_methods.py
index e883b5dd..f8078d03 100644
--- a/src/python/gudhi/representations/vector_methods.py
+++ b/src/python/gudhi/representations/vector_methods.py
@@ -1,15 +1,17 @@
 # This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
 # See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
-# Author(s):       Mathieu Carrière, Martin Royer
+# Author(s):       Mathieu Carrière, Martin Royer, Gard Spreemann
 #
 # Copyright (C) 2018-2020 Inria
 #
 # Modification(s):
 #   - 2020/06 Martin: ATOL integration
+#   - 2020/12 Gard: A more flexible Betti curve class capable of computing exact curves.
 #   - 2021/11 Vincent Rouvreau: factorize _automatic_sample_range
 
 import numpy as np
 from sklearn.base          import BaseEstimator, TransformerMixin
+from sklearn.exceptions    import NotFittedError
 from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
 from sklearn.neighbors     import DistanceMetric
 from sklearn.metrics       import pairwise
@@ -306,67 +308,162 @@ class Silhouette(BaseEstimator, TransformerMixin):
         """
         return self.fit_transform([diag])[0,:]
 
+
 class BettiCurve(BaseEstimator, TransformerMixin):
     """
-    This is a class for computing Betti curves from a list of persistence diagrams. A Betti curve is a 1D piecewise-constant function obtained from the rank function. It is sampled evenly on a given range and the vector of samples is returned. See https://www.researchgate.net/publication/316604237_Time_Series_Classification_via_Topological_Data_Analysis for more details.
+    Compute Betti curves from persistence diagrams. There are several modes of operation: with a given resolution (with or without a sample_range), with a predefined grid, and with none of the previous. With a predefined grid, the class computes the Betti numbers at those grid points. Without a predefined grid, if the resolution is set to None, it can be fit to a list of persistence diagrams and produce a grid that consists of (at least) the filtration values at which at least one of those persistence diagrams changes Betti numbers, and then compute the Betti numbers at those grid points. In the latter mode, the exact Betti curve is computed for the entire real line. Otherwise, if the resolution is given, the Betti curve is obtained by sampling evenly using either the given sample_range or based on the persistence diagrams.
     """
-    def __init__(self, resolution=100, sample_range=[np.nan, np.nan]):
+
+    def __init__(self, resolution=100, sample_range=[np.nan, np.nan], predefined_grid=None):
         """
         Constructor for the BettiCurve class.
 
         Parameters:
             resolution (int): number of sample for the piecewise-constant function (default 100).
             sample_range ([double, double]): minimum and maximum of the piecewise-constant function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn evenly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
+            predefined_grid (1d array or None, default=None): Predefined filtration grid points at which to compute the Betti curves. Must be strictly ordered. Infinities are ok. If None (default), and resolution is given, the grid will be uniform from x_min to x_max in 'resolution' steps, otherwise a grid will be computed that captures all changes in Betti numbers in the provided data.
+
+        Attributes:
+            grid_ (1d array): The grid on which the Betti numbers are computed. If predefined_grid was specified, `grid_` will always be that grid, independently of data. If not, the grid is fitted to capture all filtration values at which the Betti numbers change.
+
+        Examples
+        --------
+        If pd is a persistence diagram and xs is a nonempty grid of finite values such that xs[0] >= pd.min(), then the results of:
+
+        >>> bc = BettiCurve(predefined_grid=xs) # doctest: +SKIP
+        >>> result = bc(pd) # doctest: +SKIP
+
+        and
+
+        >>> from scipy.interpolate import interp1d # doctest: +SKIP
+        >>> bc = BettiCurve(resolution=None, predefined_grid=None) # doctest: +SKIP
+        >>> bettis = bc.fit_transform([pd]) # doctest: +SKIP
+        >>> interp = interp1d(bc.grid_, bettis[0, :], kind="previous", fill_value="extrapolate") # doctest: +SKIP
+        >>> result = np.array(interp(xs), dtype=int) # doctest: +SKIP
+
+        are the same.
         """
-        self.resolution, self.sample_range = resolution, sample_range
 
-    def fit(self, X, y=None):
+        if (predefined_grid is not None) and (not isinstance(predefined_grid, np.ndarray)):
+            raise ValueError("Expected predefined_grid as array or None.")
+
+        self.predefined_grid = predefined_grid
+        self.resolution = resolution
+        self.sample_range = sample_range
+
+    def is_fitted(self):
+        return hasattr(self, "grid_")
+
+    def fit(self, X, y = None):
         """
-        Fit the BettiCurve class on a list of persistence diagrams: if any of the values in **sample_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
+        Fit the BettiCurve class on a list of persistence diagrams: if any of the values in **sample_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams. When no predefined grid is provided and resolution set to None, compute a filtration grid that captures all changes in Betti numbers for all the given persistence diagrams.
 
         Parameters:
-            X (list of n x 2 numpy arrays): input persistence diagrams.
-            y (n x 1 array): persistence diagram labels (unused).
+            X (list of 2d arrays): Persistence diagrams.
+            y (None): Ignored.
         """
-        self.sample_range = _automatic_sample_range(np.array(self.sample_range), X, y)
+
+        if self.predefined_grid is None:
+            if self.resolution is None: # Flexible/exact version
+                events = np.unique(np.concatenate([pd.flatten() for pd in X] + [[-np.inf]], axis=0))
+                self.grid_ = np.array(events)
+            else:
+                self.sample_range = _automatic_sample_range(np.array(self.sample_range), X, y)
+                self.grid_ = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
+        else:
+            self.grid_ = self.predefined_grid # Get the predefined grid from user
+
         return self
 
     def transform(self, X):
         """
-        Compute the Betti curve for each persistence diagram individually and concatenate the results.
+        Compute Betti curves.
 
         Parameters:
-            X (list of n x 2 numpy arrays): input persistence diagrams.
-    
+            X (list of 2d arrays): Persistence diagrams.
+
         Returns:
-            numpy array with shape (number of diagrams) x (**resolution**): output Betti curves.
+            `len(X).len(self.grid_)` array of ints: Betti numbers of the given persistence diagrams at the grid points given in `self.grid_`
         """
-        Xfit = []
-        x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
-        step_x = x_values[1] - x_values[0]
 
-        for diagram in X:
-            diagram_int = np.clip(np.ceil((diagram[:,:2] - self.sample_range[0]) / step_x), 0, self.resolution).astype(int)
-            bc =  np.zeros(self.resolution)
-            for interval in diagram_int:
-                bc[interval[0]:interval[1]] += 1
-            Xfit.append(np.reshape(bc,[1,-1]))
+        if not self.is_fitted():
+            raise NotFittedError("Not fitted.")
 
-        Xfit = np.concatenate(Xfit, 0)
+        if not X:
+            X = [np.zeros((0, 2))]
+        
+        N = len(X)
 
-        return Xfit
+        events = np.concatenate([pd.flatten(order="F") for pd in X], axis=0)
+        sorting = np.argsort(events)
+        offsets = np.zeros(1 + N, dtype=int)
+        for i in range(0, N):
+            offsets[i+1] = offsets[i] + 2*X[i].shape[0]
+        starts = offsets[0:N]
+        ends = offsets[1:N + 1] - 1
 
-    def __call__(self, diag):
+        bettis = [[0] for i in range(0, N)]
+
+        i = 0
+        for x in self.grid_:
+            while i < len(sorting) and events[sorting[i]] <= x:
+                j = np.searchsorted(ends, sorting[i])
+                delta = 1 if sorting[i] - starts[j] < len(X[j]) else -1
+                bettis[j][-1] += delta
+                i += 1
+            for k in range(0, N):
+                bettis[k].append(bettis[k][-1])
+
+        return np.array(bettis, dtype=int)[:, 0:-1]
+
+    def fit_transform(self, X):
+        """
+        The result is the same as fit(X) followed by transform(X), but potentially faster.
         """
-        Apply BettiCurve on a single persistence diagram and outputs the result.
 
-        Parameters:
-            diag (n x 2 numpy array): input persistence diagram.
+        if self.predefined_grid is None and self.resolution is None:
+            if not X:
+                X = [np.zeros((0, 2))]
 
-        Returns:
-            numpy array with shape (**resolution**): output Betti curve.
+            N = len(X)
+
+            events = np.concatenate([pd.flatten(order="F") for pd in X], axis=0)
+            sorting = np.argsort(events)
+            offsets = np.zeros(1 + N, dtype=int)
+            for i in range(0, N):
+                offsets[i+1] = offsets[i] + 2*X[i].shape[0]
+            starts = offsets[0:N]
+            ends = offsets[1:N + 1] - 1
+
+            xs = [-np.inf]
+            bettis = [[0] for i in range(0, N)]
+
+            for i in sorting:
+                j = np.searchsorted(ends, i)
+                delta = 1 if i - starts[j] < len(X[j]) else -1
+                if events[i] == xs[-1]:
+                    bettis[j][-1] += delta
+                else:
+                    xs.append(events[i])
+                    for k in range(0, j):
+                        bettis[k].append(bettis[k][-1])
+                    bettis[j].append(bettis[j][-1] + delta)
+                    for k in range(j+1, N):
+                        bettis[k].append(bettis[k][-1])
+
+            self.grid_ = np.array(xs)
+            return np.array(bettis, dtype=int)
+
+        else:
+            return self.fit(X).transform(X)
+
+    def __call__(self, diag):
         """
-        return self.fit_transform([diag])[0,:]
+        Shorthand for transform on a single persistence diagram.
+        """
+        return self.fit_transform([diag])[0, :]
+
+
 
 class Entropy(BaseEstimator, TransformerMixin):
     """
author	Hind-M <hind.montassif@gmail.com>	2022-01-31 11:15:54 +0100
committer	Hind-M <hind.montassif@gmail.com>	2022-01-31 11:15:54 +0100
commit	1209db091a89ed48527c34fff0cc9ef41c78d11f (patch)
tree	e34a11454579e27ad3c711b50a46c5c9670a3e54 /src/python/gudhi
parent	8d1e7aeb3416194d00f45587d1ecea85ba218028 (diff)
parent	7f1b8eb706c72921141b53e607d6e2aa28e2bf19 (diff)