4 files changed, 205 insertions, 42 deletions
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 4a017251..26b8b7d6 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -535,6 +535,11 @@ if(PYTHONINTERP_FOUND)
       add_gudhi_py_test(test_representations)
     endif()
 
+    # Betti curves.
+    if(SCIPY_FOUND)
+      add_gudhi_py_test(test_betti_curve_representations)
+    endif()
+
     # Time Delay
     add_gudhi_py_test(test_time_delay)
 
diff --git a/src/python/gudhi/representations/vector_methods.py b/src/python/gudhi/representations/vector_methods.py
index e883b5dd..018e9b21 100644
--- a/src/python/gudhi/representations/vector_methods.py
+++ b/src/python/gudhi/representations/vector_methods.py
@@ -1,15 +1,17 @@
 # This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
 # See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
-# Author(s):       Mathieu Carrière, Martin Royer
+# Author(s):       Mathieu Carrière, Martin Royer, Gard Spreemann
 #
 # Copyright (C) 2018-2020 Inria
 #
 # Modification(s):
 #   - 2020/06 Martin: ATOL integration
+#   - 2020/12 Gard: A more flexible Betti curve class capable of computing exact curves.
 #   - 2021/11 Vincent Rouvreau: factorize _automatic_sample_range
 
 import numpy as np
 from sklearn.base          import BaseEstimator, TransformerMixin
+from sklearn.exceptions    import NotFittedError
 from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
 from sklearn.neighbors     import DistanceMetric
 from sklearn.metrics       import pairwise
@@ -306,67 +308,170 @@ class Silhouette(BaseEstimator, TransformerMixin):
         """
         return self.fit_transform([diag])[0,:]
 
+
 class BettiCurve(BaseEstimator, TransformerMixin):
     """
-    This is a class for computing Betti curves from a list of persistence diagrams. A Betti curve is a 1D piecewise-constant function obtained from the rank function. It is sampled evenly on a given range and the vector of samples is returned. See https://www.researchgate.net/publication/316604237_Time_Series_Classification_via_Topological_Data_Analysis for more details.
+    Compute Betti curves from persistence diagrams. There are two modes of operation: with a predefined grid, and without. With a predefined grid, the class computes the Betti numbers at those grid points. Without a predefined grid, it can be fit to a list of persistence diagrams and produce a grid that consists of (at least) the filtration values at which at least one of those persistence diagrams changes Betti numbers, and then compute the Betti numbers at those grid points. In the latter mode, the exact Betti curve is computed for the entire real line.
+
+    Parameters
+    ----------
+    predefined_grid: 1d array, triple or None, default=None
+        Predefined filtration grid points at which to compute the Betti curves. Must be strictly ordered. Infinities are OK. If a triple of the form (l, u, n), the grid will be uniform from l to u in n steps. If None (default), a grid will be computed that captures all changes in Betti numbers in the provided data.
+
+    Attributes
+    ----------
+    grid_: 1d array
+        The grid on which the Betti numbers are computed. If predefined_grid was specified, grid_ will always be that grid, independently of data. If not, the grid is fitted to capture all filtration values at which the Betti numbers change.
+
+    Examples
+    --------
+    If pd is a persistence diagram and xs is a nonempty grid of finite values such that xs[0] >= pd.min(), then the result of
+    >>> bc = BettiCurve(xs)
+    >>> result = bc(pd)
+    and
+    >>> from scipy.interpolate import interp1d
+    >>> bc = BettiCurve(None)
+    >>> bettis = bc.fit_transform([pd])
+    >>> interp = interp1d(bc.grid_, bettis[0, :], kind="previous", fill_value="extrapolate")
+    >>> result = np.array(interp(xs), dtype=int)
+    are the same.
     """
-    def __init__(self, resolution=100, sample_range=[np.nan, np.nan]):
-        """
-        Constructor for the BettiCurve class.
 
-        Parameters:
-            resolution (int): number of sample for the piecewise-constant function (default 100).
-            sample_range ([double, double]): minimum and maximum of the piecewise-constant function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn evenly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
-        """
-        self.resolution, self.sample_range = resolution, sample_range
+    def __init__(self, predefined_grid = None):
+        if isinstance(predefined_grid, tuple):
+            if len(predefined_grid) != 3:
+                raise ValueError("Expected array, None or triple.")
 
-    def fit(self, X, y=None):
+            self.predefined_grid = np.linspace(predefined_grid[0], predefined_grid[1], predefined_grid[2])
+        else:
+            self.predefined_grid = predefined_grid
+
+        
+    def is_fitted(self):
+        return hasattr(self, "grid_")
+
+
+    def fit(self, X, y = None):
         """
-        Fit the BettiCurve class on a list of persistence diagrams: if any of the values in **sample_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
+        Compute a filtration grid that captures all changes in Betti numbers for all the given persistence diagrams, unless a predefined grid was provided.
 
-        Parameters:
-            X (list of n x 2 numpy arrays): input persistence diagrams.
-            y (n x 1 array): persistence diagram labels (unused).
+        Parameters
+        ----------
+        X: list of 2d arrays
+             Persistence diagrams.
+
+        y: None.
+             Ignored.
         """
-        self.sample_range = _automatic_sample_range(np.array(self.sample_range), X, y)
+
+        if self.predefined_grid is None:
+            events = np.unique(np.concatenate([pd.flatten() for pd in X] + [[-np.inf]], axis=0))
+            self.grid_ = np.array(events)
+        else:
+            self.grid_ = np.array(self.predefined_grid)
+
+
+        #self.sample_range = _automatic_sample_range(np.array(self.sample_range), X, y)
+
         return self
 
+
+    def fit_transform(self, X):
+        """
+        Find a sampling grid that captures all changes in Betti numbers, and compute those Betti numbers. The result is the same as fit(X) followed by transform(X), but potentially faster.
+        """
+
+        if self.predefined_grid is None:
+            if not X:
+                X = [np.zeros((0, 2))]
+            
+            N = len(X)
+
+            events = np.concatenate([pd.flatten(order="F") for pd in X], axis=0)
+            sorting = np.argsort(events)
+            offsets = np.zeros(1 + N, dtype=int)
+            for i in range(0, N):
+                offsets[i+1] = offsets[i] + 2*X[i].shape[0]
+            starts = offsets[0:N]
+            ends = offsets[1:N + 1] - 1
+
+            xs = [-np.inf]
+            bettis = [[0] for i in range(0, N)]
+            
+            for i in sorting:
+                j = np.searchsorted(ends, i)
+                delta = 1 if i - starts[j] < len(X[j]) else -1
+                if events[i] == xs[-1]:
+                    bettis[j][-1] += delta
+                else:
+                    xs.append(events[i])
+                    for k in range(0, j):
+                        bettis[k].append(bettis[k][-1])
+                    bettis[j].append(bettis[j][-1] + delta)
+                    for k in range(j+1, N):
+                        bettis[k].append(bettis[k][-1])
+
+            self.grid_ = np.array(xs)
+            return np.array(bettis, dtype=int)
+
+        else:
+            self.grid_ = self.predefined_grid
+            return self.transform(X)
+
+
     def transform(self, X):
         """
-        Compute the Betti curve for each persistence diagram individually and concatenate the results.
+        Compute Betti curves.
 
-        Parameters:
-            X (list of n x 2 numpy arrays): input persistence diagrams.
-    
-        Returns:
-            numpy array with shape (number of diagrams) x (**resolution**): output Betti curves.
+        Parameters
+        ----------
+        X: list of 2d arrays
+            Persistence diagrams.
+
+        Returns
+        -------
+        (len(X))x(len(self.grid_)) array of ints
+             Betti numbers of the given persistence diagrams at the grid points given in self.grid_.
         """
-        Xfit = []
-        x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
-        step_x = x_values[1] - x_values[0]
 
-        for diagram in X:
-            diagram_int = np.clip(np.ceil((diagram[:,:2] - self.sample_range[0]) / step_x), 0, self.resolution).astype(int)
-            bc =  np.zeros(self.resolution)
-            for interval in diagram_int:
-                bc[interval[0]:interval[1]] += 1
-            Xfit.append(np.reshape(bc,[1,-1]))
+        if not self.is_fitted():
+            raise NotFittedError("Not fitted.")
 
-        Xfit = np.concatenate(Xfit, 0)
+        if not X:
+            X = [np.zeros((0, 2))]
+        
+        N = len(X)
+
+        events = np.concatenate([pd.flatten(order="F") for pd in X], axis=0)
+        sorting = np.argsort(events)
+        offsets = np.zeros(1 + N, dtype=int)
+        for i in range(0, N):
+            offsets[i+1] = offsets[i] + 2*X[i].shape[0]
+        starts = offsets[0:N]
+        ends = offsets[1:N + 1] - 1
+
+        bettis = [[0] for i in range(0, N)]
+
+        i = 0
+        for x in self.grid_:
+            while i < len(sorting) and events[sorting[i]] <= x:
+                j = np.searchsorted(ends, sorting[i])
+                delta = 1 if sorting[i] - starts[j] < len(X[j]) else -1
+                bettis[j][-1] += delta
+                i += 1
+            for k in range(0, N):
+                bettis[k].append(bettis[k][-1])
+
+        return np.array(bettis, dtype=int)[:, 0:-1]
 
-        return Xfit
 
     def __call__(self, diag):
         """
-        Apply BettiCurve on a single persistence diagram and outputs the result.
+        Shorthand for transform on a single persistence diagram.
+        """
+        return self.transform([diag])[0, :]
 
-        Parameters:
-            diag (n x 2 numpy array): input persistence diagram.
 
-        Returns:
-            numpy array with shape (**resolution**): output Betti curve.
-        """
-        return self.fit_transform([diag])[0,:]
 
 class Entropy(BaseEstimator, TransformerMixin):
     """
diff --git a/src/python/test/test_betti_curve_representations.py b/src/python/test/test_betti_curve_representations.py
new file mode 100755
index 00000000..3e77d760
--- /dev/null
+++ b/src/python/test/test_betti_curve_representations.py
@@ -0,0 +1,54 @@
+import numpy as np
+import scipy.interpolate
+
+from gudhi.representations.vector_methods import BettiCurve
+
+def test_betti_curve_is_irregular_betti_curve_followed_by_interpolation():
+    m = 10
+    n = 1000
+    pinf = 0.05
+    pzero = 0.05
+    res = 100
+    
+    pds = []
+    for i in range(0, m):
+        pd = np.zeros((n, 2))
+        pd[:, 0] = np.random.uniform(0, 10, n)
+        pd[:, 1] = np.random.uniform(pd[:, 0], 10, n)
+        pd[np.random.uniform(0, 1, n) < pzero, 0] = 0
+        pd[np.random.uniform(0, 1, n) < pinf, 1] = np.inf
+        pds.append(pd)
+    
+    bc = BettiCurve(None)
+    bc.fit(pds)
+    bettis = bc.transform(pds)
+
+    bc2 = BettiCurve(None)
+    bettis2 = bc2.fit_transform(pds)
+    assert((bc2.grid_ == bc.grid_).all())
+    assert((bettis2 == bettis).all())
+
+    for i in range(0, m):
+        grid = np.linspace(pds[i][np.isfinite(pds[i])].min(), pds[i][np.isfinite(pds[i])].max() + 1, res)
+        bc_gridded = BettiCurve(grid)
+        bc_gridded.fit([])
+        bettis_gridded = bc_gridded(pds[i])
+
+        interp = scipy.interpolate.interp1d(bc.grid_, bettis[i, :], kind="previous", fill_value="extrapolate")
+        bettis_interp = np.array(interp(grid), dtype=int)
+        assert((bettis_interp == bettis_gridded).all())
+
+
+def test_empty_with_predefined_grid():
+    random_grid = np.sort(np.random.uniform(0, 1, 100))
+    bc = BettiCurve(random_grid)
+    bettis = bc.fit_transform([])
+    assert((bc.grid_ == random_grid).all())
+    assert((bettis == 0).all())
+
+    
+def test_empty():
+    bc = BettiCurve()
+    bettis = bc.fit_transform([])
+    assert(bc.grid_ == [-np.inf])
+    assert((bettis == 0).all())
diff --git a/src/python/test/test_representations.py b/src/python/test/test_representations.py
index 93461f1e..d219ce7a 100755
--- a/src/python/test/test_representations.py
+++ b/src/python/test/test_representations.py
@@ -105,7 +105,6 @@ def test_dummy_atol():
 
 from gudhi.representations.vector_methods import BettiCurve
 
-
 def test_infinity():
     a = np.array([[1.0, 8.0], [2.0, np.inf], [3.0, 4.0]])
     c = BettiCurve(20, [0.0, 10.0])(a)
@@ -113,7 +112,6 @@ def test_infinity():
     assert c[7] == 3
     assert c[9] == 2
 
-
 def test_preprocessing_empty_diagrams():
     empty_diag = np.empty(shape = [0, 2])
     assert not np.any(BirthPersistenceTransform()(empty_diag))
@@ -169,3 +167,4 @@ def test_kernel_empty_diagrams():
 #    PersistenceScaleSpaceKernel(kernel_approx=RBFSampler(gamma=1./2, n_components=100000).fit(np.ones([1,2])))(empty_diag, empty_diag)
 #    PersistenceFisherKernel(bandwidth_fisher=1., bandwidth=1.)(empty_diag, empty_diag)
 #    PersistenceFisherKernel(bandwidth_fisher=1., bandwidth=1., kernel_approx=RBFSampler(gamma=1./2, n_components=100000).fit(np.ones([1,2])))(empty_diag, empty_diag)
+