author     Vincent Rouvreau <10407034+VincentRouvreau@users.noreply.github.com>  2022-11-04 09:43:36 +0100
committer  GitHub <noreply@github.com>  2022-11-04 09:43:36 +0100
commit     7595644442365412cf7afce56eafea85342da07f (patch)
tree       f21e228ac3023cff80cb9ed7a4132bb14c0e3f63
parent     da8c945ceddab0494fc58d71066daf95e63294ee (diff)
parent     04b7f0315502b7650c8ad6df3dc9d4d1a1a5316e (diff)
Merge pull request #636 from wreise/optimize_silhouettes
Optimize silhouettes
-rw-r--r--  src/python/gudhi/representations/vector_methods.py | 123
-rwxr-xr-x  src/python/test/test_representations.py            |  64
2 files changed, 107 insertions(+), 80 deletions(-)
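
This merge vectorizes the Landscape and Silhouette vectorizers in gudhi.representations.vector_methods; their public interface and output shapes are unchanged. For orientation only, a minimal usage sketch of the two affected classes, assuming the usual gudhi.representations import path (not part of the change):

import numpy as np
from gudhi.representations import Landscape, Silhouette

dgm = np.array([[2., 6.], [3., 5.]])  # one persistence diagram as (birth, death) pairs

lds = Landscape(num_landscapes=2, resolution=9, sample_range=[2., 6.])
slt = Silhouette(resolution=9, weight=lambda p: p[1] - p[0], sample_range=[2., 6.])

print(lds.fit_transform([dgm]).shape)  # (1, 18): num_landscapes * resolution per diagram
print(slt.fit_transform([dgm]).shape)  # (1, 9): resolution per diagram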
diff --git a/src/python/gudhi/representations/vector_methods.py b/src/python/gudhi/representations/vector_methods.py
index 69ff5e1e..a169aee8 100644
--- a/src/python/gudhi/representations/vector_methods.py
+++ b/src/python/gudhi/representations/vector_methods.py
@@ -85,7 +85,7 @@ class PersistenceImage(BaseEstimator, TransformerMixin):
Xfit.append(image.flatten()[np.newaxis,:])
- Xfit = np.concatenate(Xfit,0)
+ Xfit = np.concatenate(Xfit, 0)
return Xfit
@@ -123,6 +123,15 @@ def _automatic_sample_range(sample_range, X, y):
pass
return sample_range
+
+def _trim_on_edges(x, are_endpoints_nan):
+ if are_endpoints_nan[0]:
+ x = x[1:]
+ if are_endpoints_nan[1]:
+ x = x[:-1]
+ return x
+
+
class Landscape(BaseEstimator, TransformerMixin):
"""
This is a class for computing persistence landscapes from a list of persistence diagrams. A persistence landscape is a collection of 1D piecewise-linear functions computed from the rank function associated to the persistence diagram. These piecewise-linear functions are then sampled evenly on a given range and the corresponding vectors of samples are concatenated and returned. See http://jmlr.org/papers/v16/bubenik15a.html for more details.
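
Concretely, each diagram point (b, d) contributes a tent function max(h - |x - m|, 0) with midpoint m = (b + d)/2 and height h = (d - b)/2, and the k-th landscape value at x is the k-th largest tent value there. A hand-checkable sketch of that definition, independent of the class (diagram and grid are illustrative):

import numpy as np

diag = np.array([[2., 6.], [3., 5.]])                  # (birth, death) pairs
x = np.linspace(2., 6., 9)                             # sample grid
mid, h = (diag[:, 0] + diag[:, 1]) / 2., (diag[:, 1] - diag[:, 0]) / 2.
tents = np.maximum(h[None, :] - np.abs(x[:, None] - mid[None, :]), 0)
landscapes = np.sort(tents, axis=1)[:, ::-1]           # column k-1 holds lambda_k on the grid
assert np.allclose(landscapes[4], [2., 1.])            # at x = 4: lambda_1 = 2, lambda_2 = 1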
@@ -149,6 +158,8 @@ class Landscape(BaseEstimator, TransformerMixin):
y (n x 1 array): persistence diagram labels (unused).
"""
self.sample_range = _automatic_sample_range(np.array(self.sample_range), X, y)
+ self.im_range = np.linspace(self.sample_range[0], self.sample_range[1], self.new_resolution)
+ self.im_range = _trim_on_edges(self.im_range, self.nan_in_range)
return self
def transform(self, X):
@@ -161,53 +172,26 @@ class Landscape(BaseEstimator, TransformerMixin):
Returns:
numpy array with shape (number of diagrams) x (number of samples = **num_landscapes** x **resolution**): output persistence landscapes.
"""
- num_diag, Xfit = len(X), []
- x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.new_resolution)
- step_x = x_values[1] - x_values[0]
-
- for i in range(num_diag):
-
- diagram, num_pts_in_diag = X[i], X[i].shape[0]
- ls = np.zeros([self.num_landscapes, self.new_resolution])
+ Xfit = []
+ x_values = self.im_range
+ for diag in X:
+ midpoints, heights = (diag[:, 0] + diag[:, 1]) / 2., (diag[:, 1] - diag[:, 0]) / 2.
+ tent_functions = np.maximum(heights[None, :] - np.abs(x_values[:, None] - midpoints[None, :]), 0)
+ n_points = diag.shape[0]
+ # Complete the array with zeros to get the right number of landscapes
+ if self.num_landscapes > n_points:
+ tent_functions = np.concatenate(
+ [tent_functions, np.zeros((tent_functions.shape[0], self.num_landscapes-n_points))],
+ axis=1
+ )
+ tent_functions.partition(tent_functions.shape[1]-self.num_landscapes, axis=1)
+ landscapes = np.sort(tent_functions[:, -self.num_landscapes:], axis=1)[:, ::-1].T
- events = []
- for j in range(self.new_resolution):
- events.append([])
+ landscapes = np.sqrt(2) * np.ravel(landscapes)
+ Xfit.append(landscapes)
- for j in range(num_pts_in_diag):
- [px,py] = diagram[j,:2]
- min_idx = np.clip(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0, self.new_resolution)
- mid_idx = np.clip(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0, self.new_resolution)
- max_idx = np.clip(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0, self.new_resolution)
-
- if min_idx < self.new_resolution and max_idx > 0:
-
- landscape_value = self.sample_range[0] + min_idx * step_x - px
- for k in range(min_idx, mid_idx):
- events[k].append(landscape_value)
- landscape_value += step_x
-
- landscape_value = py - self.sample_range[0] - mid_idx * step_x
- for k in range(mid_idx, max_idx):
- events[k].append(landscape_value)
- landscape_value -= step_x
-
- for j in range(self.new_resolution):
- events[j].sort(reverse=True)
- for k in range( min(self.num_landscapes, len(events[j])) ):
- ls[k,j] = events[j][k]
-
- if self.nan_in_range[0]:
- ls = ls[:,1:]
- if self.nan_in_range[1]:
- ls = ls[:,:-1]
- ls = np.sqrt(2)*np.reshape(ls,[1,-1])
- Xfit.append(ls)
-
- Xfit = np.concatenate(Xfit,0)
-
- return Xfit
+ return np.stack(Xfit, axis=0)
def __call__(self, diag):
"""
@@ -219,7 +203,7 @@ class Landscape(BaseEstimator, TransformerMixin):
Returns:
numpy array with shape (number of samples = **num_landscapes** x **resolution**): output persistence landscape.
"""
- return self.fit_transform([diag])[0,:]
+ return self.fit_transform([diag])[0, :]
class Silhouette(BaseEstimator, TransformerMixin):
"""
@@ -235,6 +219,8 @@ class Silhouette(BaseEstimator, TransformerMixin):
sample_range ([double, double]): minimum and maximum for the weighted average domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn evenly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
"""
self.weight, self.resolution, self.sample_range = weight, resolution, sample_range
+ self.nan_in_range = np.isnan(np.array(self.sample_range))
+ self.new_resolution = self.resolution + self.nan_in_range.sum()
def fit(self, X, y=None):
"""
@@ -245,6 +231,8 @@ class Silhouette(BaseEstimator, TransformerMixin):
y (n x 1 array): persistence diagram labels (unused).
"""
self.sample_range = _automatic_sample_range(np.array(self.sample_range), X, y)
+ self.im_range = np.linspace(self.sample_range[0], self.sample_range[1], self.new_resolution)
+ self.im_range = _trim_on_edges(self.im_range, self.nan_in_range)
return self
def transform(self, X):
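
The two hunks above give Silhouette the same NaN handling that Landscape uses: each numpy.nan endpoint of sample_range adds one sample to the grid, and _trim_on_edges drops that inferred edge after fit(), so the output still has resolution entries. A standalone sketch of the mechanism (the data-derived minimum of 2. is illustrative):

import numpy as np

sample_range = np.array([np.nan, 6.])             # left endpoint to be inferred from the diagrams
nan_in_range = np.isnan(sample_range)             # [True, False]
resolution = 9
new_resolution = resolution + nan_in_range.sum()  # 10: one extra sample per NaN endpoint

grid = np.linspace(2., 6., new_resolution)        # after fit() has filled in the left endpoint
if nan_in_range[0]:
    grid = grid[1:]                               # trim the inferred edge
if nan_in_range[1]:
    grid = grid[:-1]
assert grid.shape == (resolution,)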
@@ -257,44 +245,19 @@ class Silhouette(BaseEstimator, TransformerMixin):
Returns:
numpy array with shape (number of diagrams) x (**resolution**): output persistence silhouettes.
"""
- num_diag, Xfit = len(X), []
- x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
- step_x = x_values[1] - x_values[0]
-
- for i in range(num_diag):
-
- diagram, num_pts_in_diag = X[i], X[i].shape[0]
+ Xfit = []
+ x_values = self.im_range
- sh, weights = np.zeros(self.resolution), np.zeros(num_pts_in_diag)
- for j in range(num_pts_in_diag):
- weights[j] = self.weight(diagram[j,:])
+ for diag in X:
+ midpoints, heights = (diag[:, 0] + diag[:, 1]) / 2., (diag[:, 1] - diag[:, 0]) / 2.
+ weights = np.array([self.weight(pt) for pt in diag])
total_weight = np.sum(weights)
- for j in range(num_pts_in_diag):
-
- [px,py] = diagram[j,:2]
- weight = weights[j] / total_weight
- min_idx = np.clip(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
- mid_idx = np.clip(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
- max_idx = np.clip(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0, self.resolution)
-
- if min_idx < self.resolution and max_idx > 0:
-
- silhouette_value = self.sample_range[0] + min_idx * step_x - px
- for k in range(min_idx, mid_idx):
- sh[k] += weight * silhouette_value
- silhouette_value += step_x
-
- silhouette_value = py - self.sample_range[0] - mid_idx * step_x
- for k in range(mid_idx, max_idx):
- sh[k] += weight * silhouette_value
- silhouette_value -= step_x
-
- Xfit.append(np.reshape(np.sqrt(2) * sh, [1,-1]))
-
- Xfit = np.concatenate(Xfit, 0)
+ tent_functions = np.maximum(heights[None, :] - np.abs(x_values[:, None] - midpoints[None, :]), 0)
+ silhouette = np.sum(weights[None, :] / total_weight * tent_functions, axis=1)
+ Xfit.append(silhouette * np.sqrt(2))
- return Xfit
+ return np.stack(Xfit, axis=0)
def __call__(self, diag):
"""
diff --git a/src/python/test/test_representations.py b/src/python/test/test_representations.py
index 4a455bb6..58caab21 100755
--- a/src/python/test/test_representations.py
+++ b/src/python/test/test_representations.py
@@ -187,3 +187,67 @@ def test_kernel_empty_diagrams():
# PersistenceFisherKernel(bandwidth_fisher=1., bandwidth=1.)(empty_diag, empty_diag)
# PersistenceFisherKernel(bandwidth_fisher=1., bandwidth=1., kernel_approx=RBFSampler(gamma=1./2, n_components=100000).fit(np.ones([1,2])))(empty_diag, empty_diag)
+
+def test_silhouette_permutation_invariance():
+ dgm = _n_diags(1)[0]
+ dgm_permuted = dgm[np.random.permutation(dgm.shape[0]).astype(int)]
+ random_resolution = random.randint(50, 100) * 10
+ slt = Silhouette(resolution=random_resolution, weight=pow(2))
+
+ assert np.all(np.isclose(slt(dgm), slt(dgm_permuted)))
+
+
+def test_silhouette_multiplication_invariance():
+ dgm = _n_diags(1)[0]
+ n_repetitions = np.random.randint(2, high=10)
+ dgm_augmented = np.repeat(dgm, repeats=n_repetitions, axis=0)
+
+ random_resolution = random.randint(50, 100) * 10
+ slt = Silhouette(resolution=random_resolution, weight=pow(2))
+ assert np.all(np.isclose(slt(dgm), slt(dgm_augmented)))
+
+
+def test_silhouette_numeric():
+ dgm = np.array([[2., 3.], [5., 6.]])
+ slt = Silhouette(resolution=9, weight=pow(1), sample_range=[2., 6.])
+ #slt.fit([dgm])
+ # x_values = array([2., 2.5, 3., 3.5, 4., 4.5, 5., 5.5, 6.])
+
+ expected_silhouette = np.array([0., 0.5, 0., 0., 0., 0., 0., 0.5, 0.])/np.sqrt(2)
+ output_silhouette = slt(dgm)
+ assert np.all(np.isclose(output_silhouette, expected_silhouette))
+
+
+def test_landscape_small_persistence_invariance():
+ dgm = np.array([[2., 6.], [2., 5.], [3., 7.]])
+ small_persistence_pts = np.random.rand(10, 2)
+ small_persistence_pts[:, 1] += small_persistence_pts[:, 0]
+ small_persistence_pts += np.min(dgm)
+ dgm_augmented = np.concatenate([dgm, small_persistence_pts], axis=0)
+
+ lds = Landscape(num_landscapes=2, resolution=5)
+ lds_dgm, lds_dgm_augmented = lds(dgm), lds(dgm_augmented)
+
+ assert np.all(np.isclose(lds_dgm, lds_dgm_augmented))
+
+
+def test_landscape_numeric():
+ dgm = np.array([[2., 6.], [3., 5.]])
+ lds_ref = np.array([
+ 0., 0.5, 1., 1.5, 2., 1.5, 1., 0.5, 0., # tent of [2, 6]
+ 0., 0., 0., 0.5, 1., 0.5, 0., 0., 0.,
+ 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ ])
+ lds_ref *= np.sqrt(2)
+ lds = Landscape(num_landscapes=4, resolution=9, sample_range=[2., 6.])
+ lds_dgm = lds(dgm)
+ assert np.all(np.isclose(lds_dgm, lds_ref))
+
+
+def test_landscape_nan_range():
+ dgm = np.array([[2., 6.], [3., 5.]])
+ lds = Landscape(num_landscapes=2, resolution=9, sample_range=[np.nan, 6.])
+ lds_dgm = lds(dgm)
+ assert (lds.sample_range[0] == 2) & (lds.sample_range[1] == 6)
+ assert lds.new_resolution == 10