review: modification proposed from EB + HM comments fix

author: VincentRouvreau <vincent.rouvreau@inria.fr> 2021-10-04 16:46:01 +0200
committer: VincentRouvreau <vincent.rouvreau@inria.fr> 2021-10-04 16:46:01 +0200
commit: f55ae9257a7006fd0906a21bd3033f47b2958c6b (patch)
tree: fffcb459c60a9dd9582beb06bf522a051d9c2b08
parent: cad4e4bff56dee7fb05be770108775b7623648ad (diff)
7 files changed, 88 insertions, 100 deletions
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index b38bb9aa..2ff05384 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -519,6 +519,11 @@ if(PYTHONINTERP_FOUND)
       add_gudhi_py_test(test_representations)
     endif()
 
+    # Representations preprocessing
+    if(SKLEARN_FOUND)
+      add_gudhi_py_test(test_representations_preprocessing)
+    endif()
+
     # Time Delay
     add_gudhi_py_test(test_time_delay)
 
@@ -546,10 +551,8 @@ if(PYTHONINTERP_FOUND)
     # sklearn
     if(SKLEARN_FOUND)
       add_gudhi_py_test(test_sklearn_cubical_persistence)
-      add_gudhi_py_test(test_sklearn_post_processing)
     endif()
 
-
     # Set missing or not modules
     set(GUDHI_MODULES ${GUDHI_MODULES} "python" CACHE INTERNAL "GUDHI_MODULES")
   else(CYTHON_FOUND)
diff --git a/src/python/doc/cubical_complex_user.rst b/src/python/doc/cubical_complex_user.rst
index a140a279..e62a4395 100644
--- a/src/python/doc/cubical_complex_user.rst
+++ b/src/python/doc/cubical_complex_user.rst
@@ -211,7 +211,10 @@ two holes in :math:`\mathbf{H}_1`, or, like in this example, three connected com
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
     pipe = Pipeline(
         [
-            ("cub_pers", CubicalPersistence(only_this_dim=0, dimensions=[28, 28], n_jobs=-2)),
+            ("cub_pers", CubicalPersistence(persistence_dimension=0, dimensions=[28, 28], n_jobs=-2)),
+            # Or, to compute several persistence dimensions at once:
+            # ("cub_pers", CubicalPersistence(persistence_dimension=[0, 1], dimensions=[28, 28], n_jobs=-2)),
+            # ("H0_diags", DimensionSelector(index=0)),  # where index is the index in persistence_dimension array
             ("finite_diags", DiagramSelector(use=True, point_type="finite")),
             (
                 "pers_img",
diff --git a/src/python/gudhi/representations/preprocessing.py b/src/python/gudhi/representations/preprocessing.py
index a8545349..823e3954 100644
--- a/src/python/gudhi/representations/preprocessing.py
+++ b/src/python/gudhi/representations/preprocessing.py
@@ -1,10 +1,11 @@
 # This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
 # See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
-# Author(s):       Mathieu Carrière
+# Author(s):       Mathieu Carrière, Vincent Rouvreau
 #
 # Copyright (C) 2018-2019 Inria
 #
 # Modification(s):
+#   - 2021/10 Vincent Rouvreau: Add DimensionSelector
 #   - YYYY/MM Author: Description of the modification
 
 import numpy as np
@@ -363,3 +364,51 @@ class DiagramSelector(BaseEstimator, TransformerMixin):
             n x 2 numpy array: extracted persistence diagram.
         """
         return self.fit_transform([diag])[0]
+
+
+# Mermaid sequence diagram - https://mermaid-js.github.io/mermaid-live-editor/
+# sequenceDiagram
+#     USER->>DimensionSelector: fit_transform(<br/>[[array( Hi(X0) ), array( Hj(X0) ), ...],<br/> [array( Hi(X1) ), array( Hj(X1) ), ...],<br/> ...])
+#     DimensionSelector->>thread1: _transform([array( Hi(X0) ), array( Hj(X0) )], ...)
+#     DimensionSelector->>thread2: _transform([array( Hi(X1) ), array( Hj(X1) )], ...)
+#     Note right of DimensionSelector: ...
+#     thread1->>DimensionSelector: array( Hn(X0) )
+#     thread2->>DimensionSelector: array( Hn(X1) )
+#     Note right of DimensionSelector: ...
+#     DimensionSelector->>USER: [array( Hn(X0) ), <br/> array( Hn(X1) ), <br/> ...]
+
+class DimensionSelector(BaseEstimator, TransformerMixin):
+    """
+    This is a class to select, from lists of persistence diagrams, the diagrams found at a given index.
+    """
+
+    def __init__(self, index=0):
+        """
+        Constructor for the DimensionSelector class.
+
+        Parameters:
+            index (int): The returned persistence diagrams dimension index. Default value is `0`.
+        """
+        self.index = index
+
+    def fit(self, X, Y=None):
+        """
+        Nothing to be done, but useful when included in a scikit-learn Pipeline.
+        """
+        return self
+
+    def transform(self, X, Y=None):
+        """
+        Select persistence diagrams from their dimension index.
+
+        Parameters:
+            X (list of list of pairs): List of list of persistence pairs, i.e.
+                `[[array( Hi(X0) ), array( Hj(X0) ), ...], [array( Hi(X1) ), array( Hj(X1) ), ...], ...]` 
+
+        Returns:
+            list of pairs:
+            Persistence diagrams in a specific dimension, i.e. if `index` was set to `m` and `Hn` is at index `m` of
+            the input, it returns `[array( Hn(X0) ), array( Hn(X1) ), ...]`
+        """
+
+        return [persistence[self.index] for persistence in X]
diff --git a/src/python/gudhi/sklearn/cubical_persistence.py b/src/python/gudhi/sklearn/cubical_persistence.py
index 329c9435..454cdd07 100644
--- a/src/python/gudhi/sklearn/cubical_persistence.py
+++ b/src/python/gudhi/sklearn/cubical_persistence.py
@@ -33,8 +33,7 @@ class CubicalPersistence(BaseEstimator, TransformerMixin):
     def __init__(
         self,
         dimensions=None,
-        max_persistence_dimension=0,
-        only_this_dim=-1,
+        persistence_dimension=-1,
         homology_coeff_field=11,
         min_persistence=0.0,
         n_jobs=None,
@@ -45,20 +44,16 @@ class CubicalPersistence(BaseEstimator, TransformerMixin):
         Parameters:
             dimensions (list of int): A list of number of top dimensional cells if cells filtration values will require
                 to be reshaped (cf. :func:`~gudhi.sklearn.cubical_persistence.CubicalPersistence.transform`)
-            max_persistence_dimension (int): The returned persistence diagrams maximal dimension. Default value is `0`.
-                Ignored if `only_this_dim` is set.
-            only_this_dim (int): The returned persistence diagrams dimension. If `only_this_dim` is set,
-                `max_persistence_dimension` will be ignored. 
-                Short circuit the use of :class:`~gudhi.sklearn.post_processing.DimensionSelector` when only one
-                dimension matters.
+            persistence_dimension (int or list of int): The returned persistence diagrams dimension(s).
+                Short circuit the use of :class:`~gudhi.representations.preprocessing.DimensionSelector` when only one
+                dimension matters (in other words, when `persistence_dimension` is an int).
             homology_coeff_field (int): The homology coefficient field. Must be a prime number. Default value is 11.
             min_persistence (float): The minimum persistence value to take into account (strictly greater than
-                `min_persistence`). Default value is `0.0`. Sets `min_persistence` to `-1.0` to see all values.
+                `min_persistence`). Default value is `0.0`. Set `min_persistence` to `-1.0` to see all values.
             n_jobs (int): cf. https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html
         """
         self.dimensions = dimensions
-        self.max_persistence_dimension = max_persistence_dimension
-        self.only_this_dim = only_this_dim
+        self.persistence_dimension = persistence_dimension
         self.homology_coeff_field = homology_coeff_field
         self.min_persistence = min_persistence
         self.n_jobs = n_jobs
@@ -75,7 +70,7 @@ class CubicalPersistence(BaseEstimator, TransformerMixin):
             homology_coeff_field=self.homology_coeff_field, min_persistence=self.min_persistence
         )
         return [
-            cubical_complex.persistence_intervals_in_dimension(dim) for dim in range(self.max_persistence_dimension + 1)
+            cubical_complex.persistence_intervals_in_dimension(dim) for dim in self.persistence_dimension
         ]
 
     def __transform_only_this_dim(self, cells):
@@ -83,28 +78,31 @@ class CubicalPersistence(BaseEstimator, TransformerMixin):
         cubical_complex.compute_persistence(
             homology_coeff_field=self.homology_coeff_field, min_persistence=self.min_persistence
         )
-        return cubical_complex.persistence_intervals_in_dimension(self.only_this_dim)
+        return cubical_complex.persistence_intervals_in_dimension(self.persistence_dimension)
 
     def transform(self, X, Y=None):
         """
         Compute all the cubical complexes and their associated persistence diagrams.
 
         Parameters:
-            X (list of list of double OR list of numpy.ndarray): List of cells filtration values that can be flatten if
-                `dimensions` is set in the constructor, or already with the correct shape in a numpy.ndarray (and
+            X (list of list of double OR list of numpy.ndarray): List of cells filtration values that should be flattened
+                if `dimensions` is set in the constructor, or already with the correct shape in a numpy.ndarray (and
                 `dimensions` must not be set).
 
         Returns:
+            list of pairs or list of list of pairs:
             Persistence diagrams in the format:
-            - If `only_this_dim` was set to `n`: `[array( Hn(X[0]) ), array( Hn(X[1]) ), ...]` 
-            - else: `[[array( H0(X[0]) ), array( H1(X[0]) ), ...], [array( H0(X[1]) ), array( H1(X[1]) ), ...], ...]` 
+              - If `persistence_dimension` was set to `n`: `[array( Hn(X[0]) ), array( Hn(X[1]) ), ...]` 
+              - If `persistence_dimension` was set to `[i, j]`: `[[array( Hi(X[0]) ), array( Hj(X[0]) )], [array( Hi(X[1]) ), array( Hj(X[1]) )], ...]`
         """
 
-        if self.only_this_dim == -1:
-            # threads is preferred as cubical construction and persistence computation releases the GIL
-            return Parallel(n_jobs=self.n_jobs, prefer="threads")(delayed(self.__transform)(cells) for cells in X)
-        else:
+        # Dispatch on whether persistence_dimension is an integer or a list of integers (else case)
+        if isinstance(self.persistence_dimension, int):
             # threads is preferred as cubical construction and persistence computation releases the GIL
             return Parallel(n_jobs=self.n_jobs, prefer="threads")(
                 delayed(self.__transform_only_this_dim)(cells) for cells in X
             )
+        else:
+            # threads is preferred as cubical construction and persistence computation releases the GIL
+            return Parallel(n_jobs=self.n_jobs, prefer="threads")(delayed(self.__transform)(cells) for cells in X)
+
diff --git a/src/python/gudhi/sklearn/post_processing.py b/src/python/gudhi/sklearn/post_processing.py
deleted file mode 100644
index 3b12466b..00000000
--- a/src/python/gudhi/sklearn/post_processing.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
-# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
-# Author(s):       Vincent Rouvreau
-#
-# Copyright (C) 2021 Inria
-#
-# Modification(s):
-#   - YYYY/MM Author: Description of the modification
-
-from sklearn.base import BaseEstimator, TransformerMixin
-
-# Mermaid sequence diagram - https://mermaid-js.github.io/mermaid-live-editor/
-# sequenceDiagram
-#     USER->>DimensionSelector: fit_transform(<br/>[[array( H0(X0) ), array( H1(X0) ), ...],<br/> [array( H0(X1) ), array( H1(X1) ), ...],<br/> ...])
-#     DimensionSelector->>thread1: _transform([array( H0(X0) ), array( H1(X0) )], ...)
-#     DimensionSelector->>thread2: _transform([array( H0(X1) ), array( H1(X1) )], ...)
-#     Note right of DimensionSelector: ...
-#     thread1->>DimensionSelector: array( Hn(X0) )
-#     thread2->>DimensionSelector: array( Hn(X1) )
-#     Note right of DimensionSelector: ...
-#     DimensionSelector->>USER: [array( Hn(X0) ), <br/> array( Hn(X1) ), <br/> ...]
-
-
-class DimensionSelector(BaseEstimator, TransformerMixin):
-    """
-    This is a class to select persistence diagrams in a specific dimension.
-    """
-
-    def __init__(self, persistence_dimension=0):
-        """
-        Constructor for the DimensionSelector class.
-
-        Parameters:
-            persistence_dimension (int): The returned persistence diagrams dimension. Default value is `0`.
-        """
-        self.persistence_dimension = persistence_dimension
-
-    def fit(self, X, Y=None):
-        """
-        Nothing to be done, but useful when included in a scikit-learn Pipeline.
-        """
-        return self
-
-    def transform(self, X, Y=None):
-        """
-        Select persistence diagrams from its dimension.
-
-        Parameters:
-            X (list of list of pairs): List of list of persistence pairs, i.e.
-            `[[array( H0(X0) ), array( H1(X0) ), ...], [array( H0(X1) ), array( H1(X1) ), ...], ...]` 
-
-        Returns:
-            Persistence diagrams in a specific dimension, i.e.
-            `[array( Hn(X0) ), array( Hn(X1), ...]`
-        """
-
-        return [persistence[self.persistence_dimension] for persistence in X]
diff --git a/src/python/test/test_sklearn_post_processing.py b/src/python/test/test_representations_preprocessing.py
index e60eadc6..838cf30c 100644
--- a/src/python/test/test_sklearn_post_processing.py
+++ b/src/python/test/test_representations_preprocessing.py
@@ -8,14 +8,10 @@
       - YYYY/MM Author: Description of the modification
 """
 
-from gudhi.sklearn.post_processing import DimensionSelector
+from gudhi.representations.preprocessing import DimensionSelector
 import numpy as np
 import pytest
 
-__author__ = "Vincent Rouvreau"
-__copyright__ = "Copyright (C) 2021 Inria"
-__license__ = "MIT"
-
 H0_0 = np.array([0.0, 0.0])
 H1_0 = np.array([1.0, 0.0])
 H0_1 = np.array([0.0, 1.0])
@@ -26,18 +22,18 @@ H1_2 = np.array([1.0, 2.0])
 
 def test_dimension_selector():
     X = [[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]]
-    ds = DimensionSelector(persistence_dimension=0)
+    ds = DimensionSelector(index=0)
     h0 = ds.fit_transform(X)
     np.testing.assert_array_equal(h0[0], H0_0)
     np.testing.assert_array_equal(h0[1], H0_1)
     np.testing.assert_array_equal(h0[2], H0_2)
 
-    ds = DimensionSelector(persistence_dimension=1)
+    ds = DimensionSelector(index=1)
     h1 = ds.fit_transform(X)
     np.testing.assert_array_equal(h1[0], H1_0)
     np.testing.assert_array_equal(h1[1], H1_1)
     np.testing.assert_array_equal(h1[2], H1_2)
 
-    ds = DimensionSelector(persistence_dimension=2)
+    ds = DimensionSelector(index=2)
     with pytest.raises(IndexError):
         h2 = ds.fit_transform([[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]])
diff --git a/src/python/test/test_sklearn_cubical_persistence.py b/src/python/test/test_sklearn_cubical_persistence.py
index 488495d1..bd728a29 100644
--- a/src/python/test/test_sklearn_cubical_persistence.py
+++ b/src/python/test/test_sklearn_cubical_persistence.py
@@ -12,32 +12,28 @@ from gudhi.sklearn.cubical_persistence import CubicalPersistence
 import numpy as np
 from sklearn import datasets
 
-__author__ = "Vincent Rouvreau"
-__copyright__ = "Copyright (C) 2021 Inria"
-__license__ = "MIT"
-
 CUBICAL_PERSISTENCE_H0_IMG0 = np.array([[0.0, 6.0], [0.0, 8.0], [0.0, np.inf]])
 
 
 def test_simple_constructor_from_top_cells():
     cells = datasets.load_digits().images[0]
-    cp = CubicalPersistence(only_this_dim=0)
-    np.testing.assert_array_equal(cp._CubicalPersistence__transform(cells), [CUBICAL_PERSISTENCE_H0_IMG0])
-    cp = CubicalPersistence(max_persistence_dimension=2)
+    cp = CubicalPersistence(persistence_dimension=0)
+    np.testing.assert_array_equal(cp._CubicalPersistence__transform_only_this_dim(cells), CUBICAL_PERSISTENCE_H0_IMG0)
+    cp = CubicalPersistence(persistence_dimension=[0, 2])
     diags = cp._CubicalPersistence__transform(cells)
-    assert len(diags) == 3
+    assert len(diags) == 2
     np.testing.assert_array_equal(diags[0], CUBICAL_PERSISTENCE_H0_IMG0)
 
 
 def test_simple_constructor_from_top_cells_list():
     digits = datasets.load_digits().images[:10]
-    cp = CubicalPersistence(only_this_dim=0, n_jobs=-2)
+    cp = CubicalPersistence(persistence_dimension=0, n_jobs=-2)
 
     diags = cp.fit_transform(digits)
     assert len(diags) == 10
     np.testing.assert_array_equal(diags[0], CUBICAL_PERSISTENCE_H0_IMG0)
 
-    cp = CubicalPersistence(max_persistence_dimension=1, n_jobs=-1)
+    cp = CubicalPersistence(persistence_dimension=[0, 1], n_jobs=-1)
     diagsH0H1 = cp.fit_transform(digits)
     assert len(diagsH0H1) == 10
     for idx in range(10):
author	VincentRouvreau <vincent.rouvreau@inria.fr>	2021-10-04 16:46:01 +0200
committer	VincentRouvreau <vincent.rouvreau@inria.fr>	2021-10-04 16:46:01 +0200
commit	f55ae9257a7006fd0906a21bd3033f47b2958c6b (patch)
tree	fffcb459c60a9dd9582beb06bf522a051d9c2b08
parent	cad4e4bff56dee7fb05be770108775b7623648ad (diff)