From 4a64eef12722de3faa8ac73416aaea91658e20b6 Mon Sep 17 00:00:00 2001 From: ROUVREAU Vincent Date: Tue, 1 Jun 2021 19:12:50 +0200 Subject: Add cubical scikit learn interface documentation and example --- src/python/doc/cubical_complex_user.rst | 58 ++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 8 deletions(-) (limited to 'src/python/doc/cubical_complex_user.rst') diff --git a/src/python/doc/cubical_complex_user.rst b/src/python/doc/cubical_complex_user.rst index 6a211347..12971243 100644 --- a/src/python/doc/cubical_complex_user.rst +++ b/src/python/doc/cubical_complex_user.rst @@ -7,14 +7,19 @@ Cubical complex user manual Definition ---------- -===================================== ===================================== ===================================== -:Author: Pawel Dlotko :Since: GUDHI PYTHON 2.0.0 :License: GPL v3 -===================================== ===================================== ===================================== +.. list-table:: + :widths: 25 50 25 + :header-rows: 0 + + * - :Author: Pawel Dlotko + - :Since: GUDHI 2.0.0 + - :License: MIT + * - :doc:`cubical_complex_user` + - * :doc:`cubical_complex_ref` + * :doc:`periodic_cubical_complex_ref` + * :doc:`cubical_complex_sklearn_itf_ref` + - -+---------------------------------------------+----------------------------------------------------------------------+ -| :doc:`cubical_complex_user` | * :doc:`cubical_complex_ref` | -| | * :doc:`periodic_cubical_complex_ref` | -+---------------------------------------------+----------------------------------------------------------------------+ The cubical complex is an example of a structured complex useful in computational mathematics (specially rigorous numerics) and image analysis. @@ -163,4 +168,41 @@ Tutorial -------- This `notebook `_ -explains how to represent sublevels sets of functions using cubical complexes. \ No newline at end of file +explains how to represent sublevels sets of functions using cubical complexes. 
+ +Scikit-learn like interface example +----------------------------------- + +.. plot:: + :include-source: + + # Standard scientific Python imports + import matplotlib.pyplot as plt + from sklearn import datasets + + # Import cubical persistence computation scikit-learn interfaces + from gudhi.sklearn.cubical_persistence import CubicalPersistence + # Import persistence representation + from gudhi.representations import PersistenceImage, DiagramSelector + + # Get the first 10 images from scikit-learn hand digits dataset + digits = datasets.load_digits().images[:10] + targets = datasets.load_digits().target[:10] + + # TDA pipeline + cub = CubicalPersistence(persistence_dim = 0, n_jobs=-2) + diags = cub.fit_transform(digits) + + finite = DiagramSelector(use=True, point_type="finite") + finite_diags = finite.fit_transform(diags) + + persim = PersistenceImage(im_range=[0,16,0,16], resolution=[16, 16]) + pers_images = persim.fit_transform(finite_diags) + + # Display persistence images + _, axes = plt.subplots(nrows=1, ncols=10, figsize=(15, 3)) + for ax, image, label in zip(axes, pers_images, targets): + ax.set_axis_off() + ax.imshow(image.reshape(16, 16), cmap=plt.cm.gray_r, interpolation='nearest') + ax.set_title('Target: %i' % label) + plt.show() -- cgit v1.2.3 From 546b059af6c0581d06bfe9cebbe853f2f7bd4589 Mon Sep 17 00:00:00 2001 From: ROUVREAU Vincent Date: Fri, 4 Jun 2021 11:56:59 +0200 Subject: Add a more relevant example inspired from https://dioscuri-tda.org/Paris_TDA_Tutorial_2021.html --- src/python/doc/cubical_complex_user.rst | 66 +++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 23 deletions(-) (limited to 'src/python/doc/cubical_complex_user.rst') diff --git a/src/python/doc/cubical_complex_user.rst b/src/python/doc/cubical_complex_user.rst index 12971243..ebecb592 100644 --- a/src/python/doc/cubical_complex_user.rst +++ b/src/python/doc/cubical_complex_user.rst @@ -173,36 +173,56 @@ explains how to represent sublevels sets of functions 
using cubical complexes. Scikit-learn like interface example ----------------------------------- -.. plot:: - :include-source: +.. code-block:: python # Standard scientific Python imports - import matplotlib.pyplot as plt - from sklearn import datasets + import numpy as np + # Standard scikit-learn imports + from sklearn.datasets import fetch_openml + from sklearn.pipeline import Pipeline + from sklearn.model_selection import train_test_split + from sklearn.svm import SVC + from sklearn import metrics - # Import cubical persistence computation scikit-learn interfaces + # Import TDA pipeline requirements from gudhi.sklearn.cubical_persistence import CubicalPersistence - # Import persistence representation from gudhi.representations import PersistenceImage, DiagramSelector - # Get the first 10 images from scikit-learn hand digits dataset - digits = datasets.load_digits().images[:10] - targets = datasets.load_digits().target[:10] + X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False) - # TDA pipeline - cub = CubicalPersistence(persistence_dim = 0, n_jobs=-2) - diags = cub.fit_transform(digits) + # Target is: "is an eight ?" 
+ y = (y == '8') * 1 + print('There are', np.sum(y), 'eights out of', len(y), 'numbers.') - finite = DiagramSelector(use=True, point_type="finite") - finite_diags = finite.fit_transform(diags) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) + pipe = Pipeline([('cub_pers', CubicalPersistence(persistence_dim = 0, dimensions=[28,28], n_jobs=-2)), + ('finite_diags', DiagramSelector(use=True, point_type="finite")), + ('pers_img', PersistenceImage(bandwidth=50, + weight=lambda x: x[1]**2, + im_range=[0,256,0,256], + resolution=[20,20])), + ('svc', SVC())]) - persim = PersistenceImage(im_range=[0,16,0,16], resolution=[16, 16]) - pers_images = persim.fit_transform(finite_diags) + predicted = pipe.predict(X_test) - # Display persistence images - _, axes = plt.subplots(nrows=1, ncols=10, figsize=(15, 3)) - for ax, image, label in zip(axes, pers_images, targets): - ax.set_axis_off() - ax.imshow(image.reshape(16, 16), cmap=plt.cm.gray_r, interpolation='nearest') - ax.set_title('Target: %i' % label) - plt.show() + print(f"Classification report for TDA pipeline {pipe}:\n" + f"{metrics.classification_report(y_test, predicted)}\n") + +.. code-block:: none + + There are 6825 eights out of 70000 numbers. 
+ Classification report for TDA pipeline Pipeline(steps=[('cub_pers', + CubicalPersistence(dimensions=[28, 28], n_jobs=-2)), + ('finite_diags', DiagramSelector(use=True)), + ('pers_img', + PersistenceImage(bandwidth=50, im_range=[0, 256, 0, 256], + weight=<function <lambda> at 0x7f3e54137ae8>)), + ('svc', SVC())]): + precision recall f1-score support + + 0 0.97 0.99 0.98 25284 + 1 0.92 0.68 0.78 2716 + + accuracy 0.96 28000 + macro avg 0.94 0.84 0.88 28000 + weighted avg 0.96 0.96 0.96 28000 \ No newline at end of file -- cgit v1.2.3 From b7de9c211e9cfe361aa7bba9be32b88570972c38 Mon Sep 17 00:00:00 2001 From: ROUVREAU Vincent Date: Mon, 7 Jun 2021 14:57:02 +0200 Subject: Improve documentation --- src/python/doc/cubical_complex_user.rst | 45 ++++++++++++++++++------- src/python/gudhi/sklearn/cubical_persistence.py | 18 ++++++++-- 2 files changed, 48 insertions(+), 15 deletions(-) (limited to 'src/python/doc/cubical_complex_user.rst') diff --git a/src/python/doc/cubical_complex_user.rst b/src/python/doc/cubical_complex_user.rst index ebecb592..3fd9fd84 100644 --- a/src/python/doc/cubical_complex_user.rst +++ b/src/python/doc/cubical_complex_user.rst @@ -173,10 +173,24 @@ explains how to represent sublevels sets of functions using cubical complexes. Scikit-learn like interface example ----------------------------------- +In this example, hand written digits are used as an input. +a TDA scikit-learn pipeline is constructed and is composed of: + +#. :class:`~gudhi.sklearn.cubical_persistence.CubicalPersistence` that builds a cubical complex from the inputs and + returns its persistence diagrams +#. :class:`~gudhi.representations.DiagramSelector` that removes non-finite persistence diagrams values +#. :class:`~gudhi.representations.PersistenceImage` that builds the persistence images from persistence diagrams +#. `SVC <https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_ which is a scikit-learn support + vector classifier. 
+ +This ML pipeline is trained to detect if the hand written digit is an '8' or not, thanks to the fact that an '8' has +two holes in :math:`\mathbf{H}_1`, or, like in this example, three connected components in :math:`\mathbf{H}_0`. + .. code-block:: python # Standard scientific Python imports import numpy as np + # Standard scikit-learn imports from sklearn.datasets import fetch_openml from sklearn.pipeline import Pipeline @@ -188,25 +202,32 @@ Scikit-learn like interface example from gudhi.sklearn.cubical_persistence import CubicalPersistence from gudhi.representations import PersistenceImage, DiagramSelector - X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False) + X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False) # Target is: "is an eight ?" - y = (y == '8') * 1 - print('There are', np.sum(y), 'eights out of', len(y), 'numbers.') + y = (y == "8") * 1 + print("There are", np.sum(y), "eights out of", len(y), "numbers.") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) - pipe = Pipeline([('cub_pers', CubicalPersistence(persistence_dim = 0, dimensions=[28,28], n_jobs=-2)), - ('finite_diags', DiagramSelector(use=True, point_type="finite")), - ('pers_img', PersistenceImage(bandwidth=50, - weight=lambda x: x[1]**2, - im_range=[0,256,0,256], - resolution=[20,20])), - ('svc', SVC())]) + pipe = Pipeline( + [ + ("cub_pers", CubicalPersistence(persistence_dim=0, dimensions=[28, 28], n_jobs=-2)), + ("finite_diags", DiagramSelector(use=True, point_type="finite")), + ( + "pers_img", + PersistenceImage(bandwidth=50, weight=lambda x: x[1] ** 2, im_range=[0, 256, 0, 256], resolution=[20, 20]), + ), + ("svc", SVC()), + ] + ) + # Learn from the train subset + pipe.fit(X_train, y_train) + # Predict from the test subset predicted = pipe.predict(X_test) - print(f"Classification report for TDA pipeline {pipe}:\n" - f"{metrics.classification_report(y_test, predicted)}\n") + print(f"Classification 
report for TDA pipeline {pipe}:\n" f"{metrics.classification_report(y_test, predicted)}\n") + .. code-block:: none diff --git a/src/python/gudhi/sklearn/cubical_persistence.py b/src/python/gudhi/sklearn/cubical_persistence.py index f4341bf6..251e240f 100644 --- a/src/python/gudhi/sklearn/cubical_persistence.py +++ b/src/python/gudhi/sklearn/cubical_persistence.py @@ -1,3 +1,12 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Vincent Rouvreau +# +# Copyright (C) 2021 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + from .. import CubicalComplex from sklearn.base import BaseEstimator, TransformerMixin @@ -17,7 +26,8 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): Constructor for the CubicalPersistence class. Parameters: - dimensions (list of int): A list of number of top dimensional cells. + dimensions (list of int): A list of number of top dimensional cells if cells filtration values will require + to be reshaped (cf. :func:`~gudhi.sklearn.cubical_persistence.CubicalPersistence.transform`) persistence_dim (int): The returned persistence diagrams dimension. Default value is `0`. min_persistence (float): The minimum persistence value to take into account (strictly greater than `min_persistence`). Default value is `0.0`. Sets `min_persistence` to `-1.0` to see all values. @@ -39,7 +49,7 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): def fit(self, X, Y=None): """ - Nothing to be done. + Nothing to be done, but useful when included in a scikit-learn Pipeline. """ return self @@ -56,7 +66,9 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): Compute all the cubical complexes and their associated persistence diagrams. Parameters: - X (list of list of double OR list of numpy.ndarray): List of cells filtration values. 
+ X (list of list of double OR list of numpy.ndarray): List of cells filtration values that can be flatten if + dimensions is set in the constructor, or already with the correct shape in a numpy.ndarray (and + dimensions must not be set). Returns: Persistence diagrams -- cgit v1.2.3 From 5c35605763273cb34efe4227b6d748992e99ab09 Mon Sep 17 00:00:00 2001 From: ROUVREAU Vincent Date: Mon, 9 Aug 2021 10:38:31 +0200 Subject: Make CubicalPersistence returns all dimensions. Post processing DimensionSelector can select the desired dimension --- src/python/CMakeLists.txt | 1 + src/python/doc/cubical_complex_user.rst | 2 +- src/python/gudhi/sklearn/cubical_persistence.py | 49 +++++++++++++---- src/python/gudhi/sklearn/post_processing.py | 61 ++++++++++++++++++++++ .../test/test_sklearn_cubical_persistence.py | 21 ++++++-- src/python/test/test_sklearn_post_processing.py | 48 +++++++++++++++++ 6 files changed, 167 insertions(+), 15 deletions(-) create mode 100644 src/python/gudhi/sklearn/post_processing.py create mode 100644 src/python/test/test_sklearn_post_processing.py (limited to 'src/python/doc/cubical_complex_user.rst') diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index a91aab37..b38bb9aa 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -546,6 +546,7 @@ if(PYTHONINTERP_FOUND) # sklearn if(SKLEARN_FOUND) add_gudhi_py_test(test_sklearn_cubical_persistence) + add_gudhi_py_test(test_sklearn_post_processing) endif() diff --git a/src/python/doc/cubical_complex_user.rst b/src/python/doc/cubical_complex_user.rst index 3fd9fd84..a140a279 100644 --- a/src/python/doc/cubical_complex_user.rst +++ b/src/python/doc/cubical_complex_user.rst @@ -211,7 +211,7 @@ two holes in :math:`\mathbf{H}_1`, or, like in this example, three connected com X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) pipe = Pipeline( [ - ("cub_pers", CubicalPersistence(persistence_dim=0, dimensions=[28, 28], n_jobs=-2)), + 
("cub_pers", CubicalPersistence(only_this_dim=0, dimensions=[28, 28], n_jobs=-2)), ("finite_diags", DiagramSelector(use=True, point_type="finite")), ( "pers_img", diff --git a/src/python/gudhi/sklearn/cubical_persistence.py b/src/python/gudhi/sklearn/cubical_persistence.py index 9af683d7..7b77000d 100644 --- a/src/python/gudhi/sklearn/cubical_persistence.py +++ b/src/python/gudhi/sklearn/cubical_persistence.py @@ -13,27 +13,44 @@ from sklearn.base import BaseEstimator, TransformerMixin # joblib is required by scikit-learn from joblib import Parallel, delayed +# Mermaid sequence diagram - https://mermaid-js.github.io/mermaid-live-editor/ +# sequenceDiagram +# USER->>CubicalPersistence: fit_transform(X) +# CubicalPersistence->>thread1: _tranform(X[0]) +# CubicalPersistence->>thread2: _tranform(X[1]) +# Note right of CubicalPersistence: ... +# thread1->>CubicalPersistence: [array( H0(X[0]) ), array( H1(X[0]) )] +# thread2->>CubicalPersistence: [array( H0(X[1]) ), array( H1(X[1]) )] +# Note right of CubicalPersistence: ... +# CubicalPersistence->>USER: [[array( H0(X[0]) ), array( H1(X[0]) )],
[array( H0(X[1]) ), array( H1(X[1]) )],
...] + class CubicalPersistence(BaseEstimator, TransformerMixin): """ This is a class for computing the persistence diagrams from a cubical complex. """ - def __init__(self, dimensions=None, persistence_dim=0, homology_coeff_field=11, min_persistence=0., n_jobs=None): + def __init__(self, dimensions=None, max_persistence_dimension=0, only_this_dim=-1, homology_coeff_field=11, min_persistence=0., n_jobs=None): """ Constructor for the CubicalPersistence class. Parameters: dimensions (list of int): A list of number of top dimensional cells if cells filtration values will require to be reshaped (cf. :func:`~gudhi.sklearn.cubical_persistence.CubicalPersistence.transform`) - persistence_dim (int): The returned persistence diagrams dimension. Default value is `0`. + max_persistence_dimension (int): The returned persistence diagrams maximal dimension. Default value is `0`. + Ignored if `only_this_dim` is set. + only_this_dim (int): The returned persistence diagrams dimension. If `only_this_dim` is set, + `max_persistence_dimension` will be ignored. + Short circuit the use of :class:`~gudhi.sklearn.post_processing.DimensionSelector` when only one + dimension matters. homology_coeff_field (int): The homology coefficient field. Must be a prime number. Default value is 11. min_persistence (float): The minimum persistence value to take into account (strictly greater than `min_persistence`). Default value is `0.0`. Sets `min_persistence` to `-1.0` to see all values. n_jobs (int): cf. 
https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html """ self.dimensions = dimensions - self.persistence_dim = persistence_dim + self.max_persistence_dimension = max_persistence_dimension + self.only_this_dim = only_this_dim self.homology_coeff_field = homology_coeff_field self.min_persistence = min_persistence self.n_jobs = n_jobs @@ -49,8 +66,14 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): cubical_complex.compute_persistence( homology_coeff_field=self.homology_coeff_field, min_persistence=self.min_persistence ) - diagrams = cubical_complex.persistence_intervals_in_dimension(self.persistence_dim) - return diagrams + return [cubical_complex.persistence_intervals_in_dimension(dim) for dim in range(self.max_persistence_dimension + 1)] + + def __transform_only_this_dim(self, cells): + cubical_complex = CubicalComplex(top_dimensional_cells=cells, dimensions=self.dimensions) + cubical_complex.compute_persistence( + homology_coeff_field=self.homology_coeff_field, min_persistence=self.min_persistence + ) + return cubical_complex.persistence_intervals_in_dimension(self.only_this_dim) def transform(self, X, Y=None): """ @@ -58,12 +81,18 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): Parameters: X (list of list of double OR list of numpy.ndarray): List of cells filtration values that can be flatten if - dimensions is set in the constructor, or already with the correct shape in a numpy.ndarray (and - dimensions must not be set). + `dimensions` is set in the constructor, or already with the correct shape in a numpy.ndarray (and + `dimensions` must not be set). 
Returns: - Persistence diagrams + Persistence diagrams in the format: + - If `only_this_dim` was set to `n`: `[array( Hn(X[0]) ), array( Hn(X[1]) ), ...]` + - else: `[[array( H0(X[0]) ), array( H1(X[0]) ), ...], [array( H0(X[1]) ), array( H1(X[1]) ), ...], ...]` """ - # threads is preferred as cubical construction and persistence computation releases the GIL - return Parallel(n_jobs=self.n_jobs, prefer="threads")(delayed(self.__transform)(cells) for cells in X) + if self.only_this_dim == -1: + # threads is preferred as cubical construction and persistence computation releases the GIL + return Parallel(n_jobs=self.n_jobs, prefer="threads")(delayed(self.__transform)(cells) for cells in X) + else: + # threads is preferred as cubical construction and persistence computation releases the GIL + return Parallel(n_jobs=self.n_jobs, prefer="threads")(delayed(self.__transform_only_this_dim)(cells) for cells in X) diff --git a/src/python/gudhi/sklearn/post_processing.py b/src/python/gudhi/sklearn/post_processing.py new file mode 100644 index 00000000..79276e1e --- /dev/null +++ b/src/python/gudhi/sklearn/post_processing.py @@ -0,0 +1,61 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Vincent Rouvreau +# +# Copyright (C) 2021 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + +from sklearn.base import BaseEstimator, TransformerMixin + +# joblib is required by scikit-learn +from joblib import Parallel, delayed + +# Mermaid sequence diagram - https://mermaid-js.github.io/mermaid-live-editor/ +# sequenceDiagram +# USER->>DimensionSelector: fit_transform(
[[array( H0(X0) ), array( H1(X0) ), ...],
[array( H0(X1) ), array( H1(X1) ), ...],
...]) +# DimensionSelector->>thread1: _transform([array( H0(X0) ), array( H1(X0) )], ...) +# DimensionSelector->>thread2: _transform([array( H0(X1) ), array( H1(X1) )], ...) +# Note right of DimensionSelector: ... +# thread1->>DimensionSelector: array( Hn(X0) ) +# thread2->>DimensionSelector: array( Hn(X1) ) +# Note right of DimensionSelector: ... +# DimensionSelector->>USER: [array( Hn(X0) ),
array( Hn(X1) ),
...] + + +class DimensionSelector(BaseEstimator, TransformerMixin): + """ + This is a class to select persistence diagrams in a specific dimension. + """ + + def __init__(self, persistence_dimension=0, n_jobs=None): + """ + Constructor for the DimensionSelector class. + + Parameters: + persistence_dimension (int): The returned persistence diagrams dimension. Default value is `0`. + """ + self.persistence_dimension = persistence_dimension + self.n_jobs = n_jobs + + def fit(self, X, Y=None): + """ + Nothing to be done, but useful when included in a scikit-learn Pipeline. + """ + return self + + def transform(self, X, Y=None): + """ + Select persistence diagrams from its dimension. + + Parameters: + X (list of list of pairs): List of list of persistence pairs, i.e. + `[[array( H0(X0) ), array( H1(X0) ), ...], [array( H0(X1) ), array( H1(X1) ), ...], ...]` + + Returns: + Persistence diagrams in a specific dimension, i.e. + `[array( Hn(X0) ), array( Hn(X1), ...]` + """ + + return [persistence[self.persistence_dimension] for persistence in X] diff --git a/src/python/test/test_sklearn_cubical_persistence.py b/src/python/test/test_sklearn_cubical_persistence.py index c0082547..506985f1 100644 --- a/src/python/test/test_sklearn_cubical_persistence.py +++ b/src/python/test/test_sklearn_cubical_persistence.py @@ -16,17 +16,30 @@ __author__ = "Vincent Rouvreau" __copyright__ = "Copyright (C) 2021 Inria" __license__ = "MIT" +CUBICAL_PERSISTENCE_H0_IMG0 = np.array([[0., 6.], [0., 8.], [ 0., np.inf]]) + def test_simple_constructor_from_top_cells(): cells = datasets.load_digits().images[0] - cp = CubicalPersistence(persistence_dim = 0) + cp = CubicalPersistence(only_this_dim = 0) np.testing.assert_array_equal(cp._CubicalPersistence__transform(cells), - np.array([[0., 6.], [0., 8.], [ 0., np.inf]])) + [CUBICAL_PERSISTENCE_H0_IMG0]) + cp = CubicalPersistence(max_persistence_dimension = 2) + diags = cp._CubicalPersistence__transform(cells) + assert len(diags) == 3 + 
np.testing.assert_array_equal(diags[0], + CUBICAL_PERSISTENCE_H0_IMG0) def test_simple_constructor_from_top_cells_list(): digits = datasets.load_digits().images[:10] - cp = CubicalPersistence(persistence_dim = 0, n_jobs=-2) + cp = CubicalPersistence(only_this_dim = 0, n_jobs=-2) diags = cp.fit_transform(digits) assert len(diags) == 10 np.testing.assert_array_equal(diags[0], - np.array([[0., 6.], [0., 8.], [ 0., np.inf]])) + CUBICAL_PERSISTENCE_H0_IMG0) + + cp = CubicalPersistence(max_persistence_dimension = 1, n_jobs=-1) + diagsH0H1 = cp.fit_transform(digits) + assert len(diagsH0H1) == 10 + for idx in range(10): + np.testing.assert_array_equal(diags[idx], diagsH0H1[idx][0]) diff --git a/src/python/test/test_sklearn_post_processing.py b/src/python/test/test_sklearn_post_processing.py new file mode 100644 index 00000000..3a251d34 --- /dev/null +++ b/src/python/test/test_sklearn_post_processing.py @@ -0,0 +1,48 @@ +""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. 
+ Author(s): Vincent Rouvreau + + Copyright (C) 2021 Inria + + Modification(s): + - YYYY/MM Author: Description of the modification +""" + +from gudhi.sklearn.post_processing import DimensionSelector +import numpy as np +import pytest + +__author__ = "Vincent Rouvreau" +__copyright__ = "Copyright (C) 2021 Inria" +__license__ = "MIT" + +H0_0 = np.array([0., 0.]) +H1_0 = np.array([1., 0.]) +H0_1 = np.array([0., 1.]) +H1_1 = np.array([1., 1.]) +H0_2 = np.array([0., 2.]) +H1_2 = np.array([1., 2.]) + +def test_dimension_selector(): + X = [[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]] + ds = DimensionSelector(persistence_dimension = 0, n_jobs=-2) + h0 = ds.fit_transform(X) + np.testing.assert_array_equal(h0[0], + H0_0) + np.testing.assert_array_equal(h0[1], + H0_1) + np.testing.assert_array_equal(h0[2], + H0_2) + + ds = DimensionSelector(persistence_dimension = 1, n_jobs=-1) + h1 = ds.fit_transform(X) + np.testing.assert_array_equal(h1[0], + H1_0) + np.testing.assert_array_equal(h1[1], + H1_1) + np.testing.assert_array_equal(h1[2], + H1_2) + + ds = DimensionSelector(persistence_dimension = 2, n_jobs=-2) + with pytest.raises(IndexError): + h2 = ds.fit_transform([[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]]) -- cgit v1.2.3 From f55ae9257a7006fd0906a21bd3033f47b2958c6b Mon Sep 17 00:00:00 2001 From: VincentRouvreau Date: Mon, 4 Oct 2021 16:46:01 +0200 Subject: review: modification proposed from EB + HM comments fix --- src/python/CMakeLists.txt | 7 ++- src/python/doc/cubical_complex_user.rst | 5 +- src/python/gudhi/representations/preprocessing.py | 51 ++++++++++++++++++- src/python/gudhi/sklearn/cubical_persistence.py | 40 ++++++++------- src/python/gudhi/sklearn/post_processing.py | 57 ---------------------- .../test/test_representations_preprocessing.py | 39 +++++++++++++++ .../test/test_sklearn_cubical_persistence.py | 16 +++--- src/python/test/test_sklearn_post_processing.py | 43 ---------------- 8 files changed, 123 insertions(+), 135 deletions(-) delete mode 100644 
src/python/gudhi/sklearn/post_processing.py create mode 100644 src/python/test/test_representations_preprocessing.py delete mode 100644 src/python/test/test_sklearn_post_processing.py (limited to 'src/python/doc/cubical_complex_user.rst') diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index b38bb9aa..2ff05384 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -519,6 +519,11 @@ if(PYTHONINTERP_FOUND) add_gudhi_py_test(test_representations) endif() + # Representations preprocessing + if(SKLEARN_FOUND) + add_gudhi_py_test(test_representations_preprocessing) + endif() + # Time Delay add_gudhi_py_test(test_time_delay) @@ -546,10 +551,8 @@ if(PYTHONINTERP_FOUND) # sklearn if(SKLEARN_FOUND) add_gudhi_py_test(test_sklearn_cubical_persistence) - add_gudhi_py_test(test_sklearn_post_processing) endif() - # Set missing or not modules set(GUDHI_MODULES ${GUDHI_MODULES} "python" CACHE INTERNAL "GUDHI_MODULES") else(CYTHON_FOUND) diff --git a/src/python/doc/cubical_complex_user.rst b/src/python/doc/cubical_complex_user.rst index a140a279..e62a4395 100644 --- a/src/python/doc/cubical_complex_user.rst +++ b/src/python/doc/cubical_complex_user.rst @@ -211,7 +211,10 @@ two holes in :math:`\mathbf{H}_1`, or, like in this example, three connected com X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) pipe = Pipeline( [ - ("cub_pers", CubicalPersistence(only_this_dim=0, dimensions=[28, 28], n_jobs=-2)), + ("cub_pers", CubicalPersistence(persistence_dimension=0, dimensions=[28, 28], n_jobs=-2)), + # Or for multiple persistence dimension computation + # ("cub_pers", CubicalPersistence(persistence_dimension=[0, 1], dimensions=[28, 28], n_jobs=-2)), + # ("H0_diags", DimensionSelector(index=0), # where index is the index in persistence_dimension array ("finite_diags", DiagramSelector(use=True, point_type="finite")), ( "pers_img", diff --git a/src/python/gudhi/representations/preprocessing.py 
b/src/python/gudhi/representations/preprocessing.py index a8545349..823e3954 100644 --- a/src/python/gudhi/representations/preprocessing.py +++ b/src/python/gudhi/representations/preprocessing.py @@ -1,10 +1,11 @@ # This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. # See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. -# Author(s): Mathieu Carrière +# Author(s): Mathieu Carrière, Vincent Rouvreau # # Copyright (C) 2018-2019 Inria # # Modification(s): +# - 2021/10 Vincent Rouvreau: Add DimensionSelector # - YYYY/MM Author: Description of the modification import numpy as np @@ -363,3 +364,51 @@ class DiagramSelector(BaseEstimator, TransformerMixin): n x 2 numpy array: extracted persistence diagram. """ return self.fit_transform([diag])[0] + + +# Mermaid sequence diagram - https://mermaid-js.github.io/mermaid-live-editor/ +# sequenceDiagram +# USER->>DimensionSelector: fit_transform(
[[array( Hi(X0) ), array( Hj(X0) ), ...],
[array( Hi(X1) ), array( Hj(X1) ), ...],
...]) +# DimensionSelector->>thread1: _transform([array( Hi(X0) ), array( Hj(X0) )], ...) +# DimensionSelector->>thread2: _transform([array( Hi(X1) ), array( Hj(X1) )], ...) +# Note right of DimensionSelector: ... +# thread1->>DimensionSelector: array( Hn(X0) ) +# thread2->>DimensionSelector: array( Hn(X1) ) +# Note right of DimensionSelector: ... +# DimensionSelector->>USER: [array( Hn(X0) ),
array( Hn(X1) ),
...] + +class DimensionSelector(BaseEstimator, TransformerMixin): + """ + This is a class to select persistence diagrams in a specific dimension from its index. + """ + + def __init__(self, index=0): + """ + Constructor for the DimensionSelector class. + + Parameters: + index (int): The returned persistence diagrams dimension index. Default value is `0`. + """ + self.index = index + + def fit(self, X, Y=None): + """ + Nothing to be done, but useful when included in a scikit-learn Pipeline. + """ + return self + + def transform(self, X, Y=None): + """ + Select persistence diagrams from its dimension. + + Parameters: + X (list of list of pairs): List of list of persistence pairs, i.e. + `[[array( Hi(X0) ), array( Hj(X0) ), ...], [array( Hi(X1) ), array( Hj(X1) ), ...], ...]` + + Returns: + list of pairs: + Persistence diagrams in a specific dimension. i.e. if `index` was set to `m` and `Hn` is at index `n` of + the input, it returns `[array( Hn(X0) ), array( Hn(X1), ...]` + """ + + return [persistence[self.index] for persistence in X] diff --git a/src/python/gudhi/sklearn/cubical_persistence.py b/src/python/gudhi/sklearn/cubical_persistence.py index 329c9435..454cdd07 100644 --- a/src/python/gudhi/sklearn/cubical_persistence.py +++ b/src/python/gudhi/sklearn/cubical_persistence.py @@ -33,8 +33,7 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): def __init__( self, dimensions=None, - max_persistence_dimension=0, - only_this_dim=-1, + persistence_dimension=-1, homology_coeff_field=11, min_persistence=0.0, n_jobs=None, @@ -45,20 +44,16 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): Parameters: dimensions (list of int): A list of number of top dimensional cells if cells filtration values will require to be reshaped (cf. :func:`~gudhi.sklearn.cubical_persistence.CubicalPersistence.transform`) - max_persistence_dimension (int): The returned persistence diagrams maximal dimension. Default value is `0`. - Ignored if `only_this_dim` is set. 
- only_this_dim (int): The returned persistence diagrams dimension. If `only_this_dim` is set, - `max_persistence_dimension` will be ignored. - Short circuit the use of :class:`~gudhi.sklearn.post_processing.DimensionSelector` when only one - dimension matters. + persistence_dimension (int or list of int): The returned persistence diagrams dimension(s). + Short circuit the use of :class:`~gudhi.representations.preprocessing.DimensionSelector` when only one + dimension matters (in other words, when `persistence_dimension` is an int). homology_coeff_field (int): The homology coefficient field. Must be a prime number. Default value is 11. min_persistence (float): The minimum persistence value to take into account (strictly greater than - `min_persistence`). Default value is `0.0`. Sets `min_persistence` to `-1.0` to see all values. + `min_persistence`). Default value is `0.0`. Set `min_persistence` to `-1.0` to see all values. n_jobs (int): cf. https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html """ self.dimensions = dimensions - self.max_persistence_dimension = max_persistence_dimension - self.only_this_dim = only_this_dim + self.persistence_dimension = persistence_dimension self.homology_coeff_field = homology_coeff_field self.min_persistence = min_persistence self.n_jobs = n_jobs @@ -75,7 +70,7 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): homology_coeff_field=self.homology_coeff_field, min_persistence=self.min_persistence ) return [ - cubical_complex.persistence_intervals_in_dimension(dim) for dim in range(self.max_persistence_dimension + 1) + cubical_complex.persistence_intervals_in_dimension(dim) for dim in self.persistence_dimension ] def __transform_only_this_dim(self, cells): @@ -83,28 +78,31 @@ class CubicalPersistence(BaseEstimator, TransformerMixin): cubical_complex.compute_persistence( homology_coeff_field=self.homology_coeff_field, min_persistence=self.min_persistence ) - return 
cubical_complex.persistence_intervals_in_dimension(self.only_this_dim) + return cubical_complex.persistence_intervals_in_dimension(self.persistence_dimension) def transform(self, X, Y=None): """ Compute all the cubical complexes and their associated persistence diagrams. Parameters: - X (list of list of double OR list of numpy.ndarray): List of cells filtration values that can be flatten if - `dimensions` is set in the constructor, or already with the correct shape in a numpy.ndarray (and + X (list of list of double OR list of numpy.ndarray): List of cells filtration values that should be flatten + if `dimensions` is set in the constructor, or already with the correct shape in a numpy.ndarray (and `dimensions` must not be set). Returns: + list of pairs or list of list of pairs: Persistence diagrams in the format: - - If `only_this_dim` was set to `n`: `[array( Hn(X[0]) ), array( Hn(X[1]) ), ...]` - - else: `[[array( H0(X[0]) ), array( H1(X[0]) ), ...], [array( H0(X[1]) ), array( H1(X[1]) ), ...], ...]` + - If `persistence_dimension` was set to `n`: `[array( Hn(X[0]) ), array( Hn(X[1]) ), ...]` + - If `persistence_dimension` was set to `[i, j]`: `[[array( Hi(X[0]) ), array( Hj(X[0]) )], [array( Hi(X[1]) ), array( Hj(X[1]) )], ...]` """ - if self.only_this_dim == -1: - # threads is preferred as cubical construction and persistence computation releases the GIL - return Parallel(n_jobs=self.n_jobs, prefer="threads")(delayed(self.__transform)(cells) for cells in X) - else: + # Depends on persistence_dimension is an integer or a list of integer (else case) + if isinstance(self.persistence_dimension, int): # threads is preferred as cubical construction and persistence computation releases the GIL return Parallel(n_jobs=self.n_jobs, prefer="threads")( delayed(self.__transform_only_this_dim)(cells) for cells in X ) + else: + # threads is preferred as cubical construction and persistence computation releases the GIL + return Parallel(n_jobs=self.n_jobs, 
prefer="threads")(delayed(self.__transform)(cells) for cells in X) + diff --git a/src/python/gudhi/sklearn/post_processing.py b/src/python/gudhi/sklearn/post_processing.py deleted file mode 100644 index 3b12466b..00000000 --- a/src/python/gudhi/sklearn/post_processing.py +++ /dev/null @@ -1,57 +0,0 @@ -# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. -# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. -# Author(s): Vincent Rouvreau -# -# Copyright (C) 2021 Inria -# -# Modification(s): -# - YYYY/MM Author: Description of the modification - -from sklearn.base import BaseEstimator, TransformerMixin - -# Mermaid sequence diagram - https://mermaid-js.github.io/mermaid-live-editor/ -# sequenceDiagram -# USER->>DimensionSelector: fit_transform(
[[array( H0(X0) ), array( H1(X0) ), ...],
[array( H0(X1) ), array( H1(X1) ), ...],
...]) -# DimensionSelector->>thread1: _transform([array( H0(X0) ), array( H1(X0) )], ...) -# DimensionSelector->>thread2: _transform([array( H0(X1) ), array( H1(X1) )], ...) -# Note right of DimensionSelector: ... -# thread1->>DimensionSelector: array( Hn(X0) ) -# thread2->>DimensionSelector: array( Hn(X1) ) -# Note right of DimensionSelector: ... -# DimensionSelector->>USER: [array( Hn(X0) ),
array( Hn(X1) ),
...] - - -class DimensionSelector(BaseEstimator, TransformerMixin): - """ - This is a class to select persistence diagrams in a specific dimension. - """ - - def __init__(self, persistence_dimension=0): - """ - Constructor for the DimensionSelector class. - - Parameters: - persistence_dimension (int): The returned persistence diagrams dimension. Default value is `0`. - """ - self.persistence_dimension = persistence_dimension - - def fit(self, X, Y=None): - """ - Nothing to be done, but useful when included in a scikit-learn Pipeline. - """ - return self - - def transform(self, X, Y=None): - """ - Select persistence diagrams from its dimension. - - Parameters: - X (list of list of pairs): List of list of persistence pairs, i.e. - `[[array( H0(X0) ), array( H1(X0) ), ...], [array( H0(X1) ), array( H1(X1) ), ...], ...]` - - Returns: - Persistence diagrams in a specific dimension, i.e. - `[array( Hn(X0) ), array( Hn(X1), ...]` - """ - - return [persistence[self.persistence_dimension] for persistence in X] diff --git a/src/python/test/test_representations_preprocessing.py b/src/python/test/test_representations_preprocessing.py new file mode 100644 index 00000000..838cf30c --- /dev/null +++ b/src/python/test/test_representations_preprocessing.py @@ -0,0 +1,39 @@ +""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. 
+ Author(s): Vincent Rouvreau + + Copyright (C) 2021 Inria + + Modification(s): + - YYYY/MM Author: Description of the modification +""" + +from gudhi.representations.preprocessing import DimensionSelector +import numpy as np +import pytest + +H0_0 = np.array([0.0, 0.0]) +H1_0 = np.array([1.0, 0.0]) +H0_1 = np.array([0.0, 1.0]) +H1_1 = np.array([1.0, 1.0]) +H0_2 = np.array([0.0, 2.0]) +H1_2 = np.array([1.0, 2.0]) + + +def test_dimension_selector(): + X = [[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]] + ds = DimensionSelector(index=0) + h0 = ds.fit_transform(X) + np.testing.assert_array_equal(h0[0], H0_0) + np.testing.assert_array_equal(h0[1], H0_1) + np.testing.assert_array_equal(h0[2], H0_2) + + ds = DimensionSelector(index=1) + h1 = ds.fit_transform(X) + np.testing.assert_array_equal(h1[0], H1_0) + np.testing.assert_array_equal(h1[1], H1_1) + np.testing.assert_array_equal(h1[2], H1_2) + + ds = DimensionSelector(index=2) + with pytest.raises(IndexError): + h2 = ds.fit_transform([[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]]) diff --git a/src/python/test/test_sklearn_cubical_persistence.py b/src/python/test/test_sklearn_cubical_persistence.py index 488495d1..bd728a29 100644 --- a/src/python/test/test_sklearn_cubical_persistence.py +++ b/src/python/test/test_sklearn_cubical_persistence.py @@ -12,32 +12,28 @@ from gudhi.sklearn.cubical_persistence import CubicalPersistence import numpy as np from sklearn import datasets -__author__ = "Vincent Rouvreau" -__copyright__ = "Copyright (C) 2021 Inria" -__license__ = "MIT" - CUBICAL_PERSISTENCE_H0_IMG0 = np.array([[0.0, 6.0], [0.0, 8.0], [0.0, np.inf]]) def test_simple_constructor_from_top_cells(): cells = datasets.load_digits().images[0] - cp = CubicalPersistence(only_this_dim=0) - np.testing.assert_array_equal(cp._CubicalPersistence__transform(cells), [CUBICAL_PERSISTENCE_H0_IMG0]) - cp = CubicalPersistence(max_persistence_dimension=2) + cp = CubicalPersistence(persistence_dimension=0) + 
np.testing.assert_array_equal(cp._CubicalPersistence__transform_only_this_dim(cells), CUBICAL_PERSISTENCE_H0_IMG0) + cp = CubicalPersistence(persistence_dimension=[0, 2]) diags = cp._CubicalPersistence__transform(cells) - assert len(diags) == 3 + assert len(diags) == 2 np.testing.assert_array_equal(diags[0], CUBICAL_PERSISTENCE_H0_IMG0) def test_simple_constructor_from_top_cells_list(): digits = datasets.load_digits().images[:10] - cp = CubicalPersistence(only_this_dim=0, n_jobs=-2) + cp = CubicalPersistence(persistence_dimension=0, n_jobs=-2) diags = cp.fit_transform(digits) assert len(diags) == 10 np.testing.assert_array_equal(diags[0], CUBICAL_PERSISTENCE_H0_IMG0) - cp = CubicalPersistence(max_persistence_dimension=1, n_jobs=-1) + cp = CubicalPersistence(persistence_dimension=[0, 1], n_jobs=-1) diagsH0H1 = cp.fit_transform(digits) assert len(diagsH0H1) == 10 for idx in range(10): diff --git a/src/python/test/test_sklearn_post_processing.py b/src/python/test/test_sklearn_post_processing.py deleted file mode 100644 index e60eadc6..00000000 --- a/src/python/test/test_sklearn_post_processing.py +++ /dev/null @@ -1,43 +0,0 @@ -""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. - See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. 
- Author(s): Vincent Rouvreau - - Copyright (C) 2021 Inria - - Modification(s): - - YYYY/MM Author: Description of the modification -""" - -from gudhi.sklearn.post_processing import DimensionSelector -import numpy as np -import pytest - -__author__ = "Vincent Rouvreau" -__copyright__ = "Copyright (C) 2021 Inria" -__license__ = "MIT" - -H0_0 = np.array([0.0, 0.0]) -H1_0 = np.array([1.0, 0.0]) -H0_1 = np.array([0.0, 1.0]) -H1_1 = np.array([1.0, 1.0]) -H0_2 = np.array([0.0, 2.0]) -H1_2 = np.array([1.0, 2.0]) - - -def test_dimension_selector(): - X = [[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]] - ds = DimensionSelector(persistence_dimension=0) - h0 = ds.fit_transform(X) - np.testing.assert_array_equal(h0[0], H0_0) - np.testing.assert_array_equal(h0[1], H0_1) - np.testing.assert_array_equal(h0[2], H0_2) - - ds = DimensionSelector(persistence_dimension=1) - h1 = ds.fit_transform(X) - np.testing.assert_array_equal(h1[0], H1_0) - np.testing.assert_array_equal(h1[1], H1_1) - np.testing.assert_array_equal(h1[2], H1_2) - - ds = DimensionSelector(persistence_dimension=2) - with pytest.raises(IndexError): - h2 = ds.fit_transform([[H0_0, H1_0], [H0_1, H1_1], [H0_2, H1_2]]) -- cgit v1.2.3 From 8f14977760d05f8f08d2a7babdc197da27a6c53a Mon Sep 17 00:00:00 2001 From: Vincent Rouvreau Date: Fri, 5 Nov 2021 11:28:42 +0100 Subject: change doc according to proposal --- src/python/doc/cubical_complex_sklearn_itf_ref.rst | 88 +++++++++++++++++++- src/python/doc/cubical_complex_sum.inc | 24 +++--- src/python/doc/cubical_complex_user.rst | 95 +--------------------- 3 files changed, 100 insertions(+), 107 deletions(-) (limited to 'src/python/doc/cubical_complex_user.rst') diff --git a/src/python/doc/cubical_complex_sklearn_itf_ref.rst b/src/python/doc/cubical_complex_sklearn_itf_ref.rst index b5c7a2e5..c585f9ab 100644 --- a/src/python/doc/cubical_complex_sklearn_itf_ref.rst +++ b/src/python/doc/cubical_complex_sklearn_itf_ref.rst @@ -2,8 +2,8 @@ .. 
To get rid of WARNING: document isn't included in any toctree -Cubical complex persistence scikit-learn like interfaces reference manual -######################################################################### +Cubical complex persistence scikit-learn like interface +####################################################### .. list-table:: :widths: 40 30 30 @@ -13,8 +13,90 @@ Cubical complex persistence scikit-learn like interfaces reference manual - :License: MIT - :Requires: `Scikit-learn `_ +Cubical complex persistence scikit-learn like interface example +--------------------------------------------------------------- + +In this example, hand written digits are used as an input. +a TDA scikit-learn pipeline is constructed and is composed of: + +#. :class:`~gudhi.sklearn.cubical_persistence.CubicalPersistence` that builds a cubical complex from the inputs and + returns its persistence diagrams +#. :class:`~gudhi.representations.DiagramSelector` that removes non-finite persistence diagrams values +#. :class:`~gudhi.representations.PersistenceImage` that builds the persistence images from persistence diagrams +#. `SVC `_ which is a scikit-learn support + vector classifier. + +This ML pipeline is trained to detect if the hand written digit is an '8' or not, thanks to the fact that an '8' has +two holes in :math:`\mathbf{H}_1`, or, like in this example, three connected components in :math:`\mathbf{H}_0`. + +.. 
code-block:: python + + # Standard scientific Python imports + import numpy as np + + # Standard scikit-learn imports + from sklearn.datasets import fetch_openml + from sklearn.pipeline import Pipeline + from sklearn.model_selection import train_test_split + from sklearn.svm import SVC + from sklearn import metrics + + # Import TDA pipeline requirements + from gudhi.sklearn.cubical_persistence import CubicalPersistence + from gudhi.representations import PersistenceImage, DiagramSelector + + X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False) + + # Target is: "is an eight ?" + y = (y == "8") * 1 + print("There are", np.sum(y), "eights out of", len(y), "numbers.") + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) + pipe = Pipeline( + [ + ("cub_pers", CubicalPersistence(persistence_dimension=0, dimensions=[28, 28], n_jobs=-2)), + # Or for multiple persistence dimension computation + # ("cub_pers", CubicalPersistence(persistence_dimension=[0, 1], dimensions=[28, 28], n_jobs=-2)), + # ("H0_diags", DimensionSelector(index=0), # where index is the index in persistence_dimension array + ("finite_diags", DiagramSelector(use=True, point_type="finite")), + ( + "pers_img", + PersistenceImage(bandwidth=50, weight=lambda x: x[1] ** 2, im_range=[0, 256, 0, 256], resolution=[20, 20]), + ), + ("svc", SVC()), + ] + ) + + # Learn from the train subset + pipe.fit(X_train, y_train) + # Predict from the test subset + predicted = pipe.predict(X_test) + + print(f"Classification report for TDA pipeline {pipe}:\n" f"{metrics.classification_report(y_test, predicted)}\n") + +.. code-block:: none + + There are 6825 eights out of 70000 numbers. 
+ Classification report for TDA pipeline Pipeline(steps=[('cub_pers', + CubicalPersistence(dimensions=[28, 28], n_jobs=-2)), + ('finite_diags', DiagramSelector(use=True)), + ('pers_img', + PersistenceImage(bandwidth=50, im_range=[0, 256, 0, 256], + weight= at 0x7f3e54137ae8>)), + ('svc', SVC())]): + precision recall f1-score support + + 0 0.97 0.99 0.98 25284 + 1 0.92 0.68 0.78 2716 + + accuracy 0.96 28000 + macro avg 0.94 0.84 0.88 28000 + weighted avg 0.96 0.96 0.96 28000 + +Cubical complex persistence scikit-learn like interface reference +----------------------------------------------------------------- .. autoclass:: gudhi.sklearn.cubical_persistence.CubicalPersistence :members: :special-members: __init__ - :show-inheritance: + :show-inheritance: \ No newline at end of file diff --git a/src/python/doc/cubical_complex_sum.inc b/src/python/doc/cubical_complex_sum.inc index 2a1bde8d..e2fd55bb 100644 --- a/src/python/doc/cubical_complex_sum.inc +++ b/src/python/doc/cubical_complex_sum.inc @@ -1,13 +1,17 @@ .. table:: :widths: 30 40 30 - +--------------------------------------------------------------------------+----------------------------------------------------------------------+-----------------------------+ - | .. figure:: | The cubical complex represents a grid as a cell complex with | :Author: Pawel Dlotko | - | ../../doc/Bitmap_cubical_complex/Cubical_complex_representation.png | cells of all dimensions. 
| :Since: GUDHI 2.0.0 | - | :alt: Cubical complex representation | | :License: MIT | - | :figclass: align-center | | | - +--------------------------------------------------------------------------+----------------------------------------------------------------------+-----------------------------+ - | * :doc:`cubical_complex_user` | * :doc:`cubical_complex_ref` | - | | * :doc:`periodic_cubical_complex_ref` | - | | * :doc:`cubical_complex_sklearn_itf_ref` | - +--------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+ + +--------------------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------------+ + | .. figure:: | The cubical complex represents a grid as a cell complex with | :Author: Pawel Dlotko | + | ../../doc/Bitmap_cubical_complex/Cubical_complex_representation.png | cells of all dimensions. | :Since: GUDHI 2.0.0 | + | :alt: Cubical complex representation | | :License: MIT | + | :figclass: align-center | | | + +--------------------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------------+ + | * :doc:`cubical_complex_user` | * :doc:`cubical_complex_ref` | + | | * :doc:`periodic_cubical_complex_ref` | + +--------------------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------------+ + | .. 
image:: | * :doc:`cubical_complex_sklearn_itf_ref` | :Requires: `Scikit-learn `_ | + | img/sklearn.png | | | + | :target: https://scikit-learn.org | | | + | :height: 30 | | | + +--------------------------------------------------------------------------+--------------------------------------------------------------+-------------------------------------------------------------+ diff --git a/src/python/doc/cubical_complex_user.rst b/src/python/doc/cubical_complex_user.rst index e62a4395..42a23875 100644 --- a/src/python/doc/cubical_complex_user.rst +++ b/src/python/doc/cubical_complex_user.rst @@ -7,19 +7,7 @@ Cubical complex user manual Definition ---------- -.. list-table:: - :widths: 25 50 25 - :header-rows: 0 - - * - :Author: Pawel Dlotko - - :Since: GUDHI 2.0.0 - - :License: MIT - * - :doc:`cubical_complex_user` - - * :doc:`cubical_complex_ref` - * :doc:`periodic_cubical_complex_ref` - * :doc:`cubical_complex_sklearn_itf_ref` - - - +.. include:: cubical_complex_sum.inc The cubical complex is an example of a structured complex useful in computational mathematics (specially rigorous numerics) and image analysis. @@ -169,84 +157,3 @@ Tutorial This `notebook `_ explains how to represent sublevels sets of functions using cubical complexes. - -Scikit-learn like interface example ------------------------------------ - -In this example, hand written digits are used as an input. -a TDA scikit-learn pipeline is constructed and is composed of: - -#. :class:`~gudhi.sklearn.cubical_persistence.CubicalPersistence` that builds a cubical complex from the inputs and - returns its persistence diagrams -#. :class:`~gudhi.representations.DiagramSelector` that removes non-finite persistence diagrams values -#. :class:`~gudhi.representations.PersistenceImage` that builds the persistence images from persistence diagrams -#. `SVC `_ which is a scikit-learn support - vector classifier. 
- -This ML pipeline is trained to detect if the hand written digit is an '8' or not, thanks to the fact that an '8' has -two holes in :math:`\mathbf{H}_1`, or, like in this example, three connected components in :math:`\mathbf{H}_0`. - -.. code-block:: python - - # Standard scientific Python imports - import numpy as np - - # Standard scikit-learn imports - from sklearn.datasets import fetch_openml - from sklearn.pipeline import Pipeline - from sklearn.model_selection import train_test_split - from sklearn.svm import SVC - from sklearn import metrics - - # Import TDA pipeline requirements - from gudhi.sklearn.cubical_persistence import CubicalPersistence - from gudhi.representations import PersistenceImage, DiagramSelector - - X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False) - - # Target is: "is an eight ?" - y = (y == "8") * 1 - print("There are", np.sum(y), "eights out of", len(y), "numbers.") - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0) - pipe = Pipeline( - [ - ("cub_pers", CubicalPersistence(persistence_dimension=0, dimensions=[28, 28], n_jobs=-2)), - # Or for multiple persistence dimension computation - # ("cub_pers", CubicalPersistence(persistence_dimension=[0, 1], dimensions=[28, 28], n_jobs=-2)), - # ("H0_diags", DimensionSelector(index=0), # where index is the index in persistence_dimension array - ("finite_diags", DiagramSelector(use=True, point_type="finite")), - ( - "pers_img", - PersistenceImage(bandwidth=50, weight=lambda x: x[1] ** 2, im_range=[0, 256, 0, 256], resolution=[20, 20]), - ), - ("svc", SVC()), - ] - ) - - # Learn from the train subset - pipe.fit(X_train, y_train) - # Predict from the test subset - predicted = pipe.predict(X_test) - - print(f"Classification report for TDA pipeline {pipe}:\n" f"{metrics.classification_report(y_test, predicted)}\n") - - -.. code-block:: none - - There are 6825 eights out of 70000 numbers. 
- Classification report for TDA pipeline Pipeline(steps=[('cub_pers', - CubicalPersistence(dimensions=[28, 28], n_jobs=-2)), - ('finite_diags', DiagramSelector(use=True)), - ('pers_img', - PersistenceImage(bandwidth=50, im_range=[0, 256, 0, 256], - weight= at 0x7f3e54137ae8>)), - ('svc', SVC())]): - precision recall f1-score support - - 0 0.97 0.99 0.98 25284 - 1 0.92 0.68 0.78 2716 - - accuracy 0.96 28000 - macro avg 0.94 0.84 0.88 28000 - weighted avg 0.96 0.96 0.96 28000 \ No newline at end of file -- cgit v1.2.3