summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorVincent Rouvreau <10407034+VincentRouvreau@users.noreply.github.com>2019-11-04 17:41:48 +0100
committerGitHub <noreply@github.com>2019-11-04 17:41:48 +0100
commit6e5f3f2c5ed908774c9005fa3ba07694bb2c6b0c (patch)
tree3e9242cf413e1ca63c258dd704ca04049fccf7a8 /src
parent8e7fabec7a8b79b8f0248ec580e4cd7950f9cec1 (diff)
parentee4934750e8c9dbdee4874d56921aeb9bf7b7bb7 (diff)
Merge pull request #95 from tlacombe/wdist-theo
wasserstein distance added on fork
Diffstat (limited to 'src')
-rw-r--r--src/cmake/modules/GUDHI_third_party_libraries.cmake1
-rw-r--r--src/python/CMakeLists.txt65
-rw-r--r--src/python/doc/index.rst5
-rw-r--r--src/python/doc/installation.rst14
-rw-r--r--src/python/doc/wasserstein_distance_sum.inc14
-rw-r--r--src/python/doc/wasserstein_distance_user.rst40
-rw-r--r--src/python/gudhi/wasserstein.py99
-rwxr-xr-xsrc/python/test/test_wasserstein_distance.py50
8 files changed, 260 insertions, 28 deletions
diff --git a/src/cmake/modules/GUDHI_third_party_libraries.cmake b/src/cmake/modules/GUDHI_third_party_libraries.cmake
index 360a230b..5c76eeae 100644
--- a/src/cmake/modules/GUDHI_third_party_libraries.cmake
+++ b/src/cmake/modules/GUDHI_third_party_libraries.cmake
@@ -125,6 +125,7 @@ if( PYTHONINTERP_FOUND )
find_python_module("numpy")
find_python_module("scipy")
find_python_module("sphinx")
+ find_python_module("ot")
endif()
if(NOT GUDHI_PYTHON_PATH)
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 5508cbc7..1b1684e1 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -49,6 +49,7 @@ if(PYTHONINTERP_FOUND)
set(GUDHI_PYTHON_MODULES "${GUDHI_PYTHON_MODULES}'alpha_complex', ")
set(GUDHI_PYTHON_MODULES "${GUDHI_PYTHON_MODULES}'euclidean_witness_complex', ")
set(GUDHI_PYTHON_MODULES "${GUDHI_PYTHON_MODULES}'euclidean_strong_witness_complex', ")
+ set(GUDHI_PYTHON_MODULES "${GUDHI_PYTHON_MODULES}'wasserstein', ")
add_gudhi_debug_info("Python version ${PYTHON_VERSION_STRING}")
add_gudhi_debug_info("Cython version ${CYTHON_VERSION}")
@@ -64,6 +65,9 @@ if(PYTHONINTERP_FOUND)
if(SCIPY_FOUND)
add_gudhi_debug_info("Scipy version ${SCIPY_VERSION}")
endif()
+ if(OT_FOUND)
+ add_gudhi_debug_info("POT version ${OT_VERSION}")
+ endif()
set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_RESULT_OF_USE_DECLTYPE', ")
set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_ALL_NO_LIB', ")
@@ -199,6 +203,7 @@ if(PYTHONINTERP_FOUND)
# Other .py files
file(COPY "gudhi/persistence_graphical_tools.py" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/gudhi")
+ file(COPY "gudhi/wasserstein.py" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/gudhi")
add_custom_command(
OUTPUT gudhi.so
@@ -371,37 +376,47 @@ if(PYTHONINTERP_FOUND)
# Reader utils
add_gudhi_py_test(test_reader_utils)
+ # Wasserstein
+ if(OT_FOUND)
+ add_gudhi_py_test(test_wasserstein_distance)
+ endif(OT_FOUND)
+
# Documentation generation is available through sphinx - requires all modules
if(SPHINX_PATH)
if(MATPLOTLIB_FOUND)
if(NUMPY_FOUND)
if(SCIPY_FOUND)
- if(NOT CGAL_WITH_EIGEN3_VERSION VERSION_LESS 4.11.0)
- set (GUDHI_SPHINX_MESSAGE "Generating API documentation with Sphinx in ${CMAKE_CURRENT_BINARY_DIR}/sphinx/")
- # User warning - Sphinx is a static pages generator, and configured to work fine with user_version
- # Images and biblio warnings because not found on developper version
- if (GUDHI_PYTHON_PATH STREQUAL "src/python")
- set (GUDHI_SPHINX_MESSAGE "${GUDHI_SPHINX_MESSAGE} \n WARNING : Sphinx is configured for user version, you run it on developper version. Images and biblio will miss")
- endif()
- # sphinx target requires gudhi.so, because conf.py reads gudhi version from it
- add_custom_target(sphinx
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doc
- COMMAND ${CMAKE_COMMAND} -E env "PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}"
- ${SPHINX_PATH} -b html ${CMAKE_CURRENT_SOURCE_DIR}/doc ${CMAKE_CURRENT_BINARY_DIR}/sphinx
- DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/gudhi.so"
- COMMENT "${GUDHI_SPHINX_MESSAGE}" VERBATIM)
-
- add_test(NAME sphinx_py_test
- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
- COMMAND ${CMAKE_COMMAND} -E env "PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}"
- ${SPHINX_PATH} -b doctest ${CMAKE_CURRENT_SOURCE_DIR}/doc ${CMAKE_CURRENT_BINARY_DIR}/doctest)
-
- # Set missing or not modules
- set(GUDHI_MODULES ${GUDHI_MODULES} "python-documentation" CACHE INTERNAL "GUDHI_MODULES")
- else(NOT CGAL_WITH_EIGEN3_VERSION VERSION_LESS 4.11.0)
- message("++ Python documentation module will not be compiled because it requires a Eigen3 and CGAL version >= 4.11.0")
+ if(OT_FOUND)
+ if(NOT CGAL_WITH_EIGEN3_VERSION VERSION_LESS 4.11.0)
+ set (GUDHI_SPHINX_MESSAGE "Generating API documentation with Sphinx in ${CMAKE_CURRENT_BINARY_DIR}/sphinx/")
+ # User warning - Sphinx is a static pages generator, and configured to work fine with user_version
+ # Images and biblio warnings because not found on developper version
+ if (GUDHI_PYTHON_PATH STREQUAL "src/python")
+ set (GUDHI_SPHINX_MESSAGE "${GUDHI_SPHINX_MESSAGE} \n WARNING : Sphinx is configured for user version, you run it on developper version. Images and biblio will miss")
+ endif()
+ # sphinx target requires gudhi.so, because conf.py reads gudhi version from it
+ add_custom_target(sphinx
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doc
+ COMMAND ${CMAKE_COMMAND} -E env "PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}"
+ ${SPHINX_PATH} -b html ${CMAKE_CURRENT_SOURCE_DIR}/doc ${CMAKE_CURRENT_BINARY_DIR}/sphinx
+ DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/gudhi.so"
+ COMMENT "${GUDHI_SPHINX_MESSAGE}" VERBATIM)
+
+ add_test(NAME sphinx_py_test
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ COMMAND ${CMAKE_COMMAND} -E env "PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}"
+ ${SPHINX_PATH} -b doctest ${CMAKE_CURRENT_SOURCE_DIR}/doc ${CMAKE_CURRENT_BINARY_DIR}/doctest)
+
+ # Set missing or not modules
+ set(GUDHI_MODULES ${GUDHI_MODULES} "python-documentation" CACHE INTERNAL "GUDHI_MODULES")
+ else(NOT CGAL_WITH_EIGEN3_VERSION VERSION_LESS 4.11.0)
+ message("++ Python documentation module will not be compiled because it requires a Eigen3 and CGAL version >= 4.11.0")
+ set(GUDHI_MISSING_MODULES ${GUDHI_MISSING_MODULES} "python-documentation" CACHE INTERNAL "GUDHI_MISSING_MODULES")
+ endif(NOT CGAL_WITH_EIGEN3_VERSION VERSION_LESS 4.11.0)
+ else(OT_FOUND)
+ message("++ Python documentation module will not be compiled because POT was not found")
set(GUDHI_MISSING_MODULES ${GUDHI_MISSING_MODULES} "python-documentation" CACHE INTERNAL "GUDHI_MISSING_MODULES")
- endif(NOT CGAL_WITH_EIGEN3_VERSION VERSION_LESS 4.11.0)
+ endif(OT_FOUND)
else(SCIPY_FOUND)
message("++ Python documentation module will not be compiled because scipy was not found")
set(GUDHI_MISSING_MODULES ${GUDHI_MISSING_MODULES} "python-documentation" CACHE INTERNAL "GUDHI_MISSING_MODULES")
diff --git a/src/python/doc/index.rst b/src/python/doc/index.rst
index e379bc23..16d918bc 100644
--- a/src/python/doc/index.rst
+++ b/src/python/doc/index.rst
@@ -73,6 +73,11 @@ Bottleneck distance
.. include:: bottleneck_distance_sum.inc
+Wasserstein distance
+===================
+
+.. include:: wasserstein_distance_sum.inc
+
Persistence graphical tools
===========================
diff --git a/src/python/doc/installation.rst b/src/python/doc/installation.rst
index 5369efb0..7699a5bb 100644
--- a/src/python/doc/installation.rst
+++ b/src/python/doc/installation.rst
@@ -215,12 +215,20 @@ The following examples require the `Matplotlib <http://matplotlib.org>`_:
* :download:`euclidean_strong_witness_complex_diagram_persistence_from_off_file_example.py <../example/euclidean_strong_witness_complex_diagram_persistence_from_off_file_example.py>`
* :download:`euclidean_witness_complex_diagram_persistence_from_off_file_example.py <../example/euclidean_witness_complex_diagram_persistence_from_off_file_example.py>`
+Python Optimal Transport
+========================
+
+The :doc:`Wasserstein distance </wasserstein_distance_user>`
+module requires `POT <https://pot.readthedocs.io/>`_, a library that provides
+several solvers for optimization problems related to Optimal Transport.
+
SciPy
=====
-The :doc:`persistence graphical tools </persistence_graphical_tools_user>`
-module requires `SciPy <http://scipy.org>`_, a Python-based ecosystem of
-open-source software for mathematics, science, and engineering.
+The :doc:`persistence graphical tools </persistence_graphical_tools_user>` and
+:doc:`Wasserstein distance </wasserstein_distance_user>` modules require `SciPy
+<http://scipy.org>`_, a Python-based ecosystem of open-source software for
+mathematics, science, and engineering.
Threading Building Blocks
=========================
diff --git a/src/python/doc/wasserstein_distance_sum.inc b/src/python/doc/wasserstein_distance_sum.inc
new file mode 100644
index 00000000..ffd4d312
--- /dev/null
+++ b/src/python/doc/wasserstein_distance_sum.inc
@@ -0,0 +1,14 @@
+.. table::
+ :widths: 30 50 20
+
+ +-----------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------+
+ | .. figure:: | The p-Wasserstein distance measures the similarity between two | :Author: Theo Lacombe |
+ | ../../doc/Bottleneck_distance/perturb_pd.png | persistence diagrams. It's the minimum value c that can be achieved | |
+ | :figclass: align-center | by a perfect matching between the points of the two diagrams (+ all | :Introduced in: GUDHI 3.1.0 |
+ | | diagonal points), where the value of a matching is defined as the | |
+ | Wasserstein distance is the p-th root of the sum of the | p-th root of the sum of all edge lengths to the power p. Edge lengths| :Copyright: MIT |
+ | edge lengths to the power p. | are measured in norm q, for :math:`1 \leq q \leq \infty`. | |
+ | | | :Requires: Python Optimal Transport (POT) :math:`\geq` 0.5.1 |
+ +-----------------------------------------------------------------+----------------------------------------------------------------------+------------------------------------------------------------------+
+ | * :doc:`wasserstein_distance_user` | |
+ +-----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
diff --git a/src/python/doc/wasserstein_distance_user.rst b/src/python/doc/wasserstein_distance_user.rst
new file mode 100644
index 00000000..bcb7f19d
--- /dev/null
+++ b/src/python/doc/wasserstein_distance_user.rst
@@ -0,0 +1,40 @@
+:orphan:
+
+.. To get rid of WARNING: document isn't included in any toctree
+
+Wasserstein distance user manual
+===============================
+Definition
+----------
+
+.. include:: wasserstein_distance_sum.inc
+
+This implementation is based on ideas from "Large Scale Computation of Means and Cluster for Persistence Diagrams via Optimal Transport".
+
+Function
+--------
+.. autofunction:: gudhi.wasserstein_distance
+
+
+Basic example
+-------------
+
+This example computes the 1-Wasserstein distance from 2 persistence diagrams with euclidean ground metric.
+Note that persistence diagrams must be submitted as (n x 2) numpy arrays and must not contain inf values.
+
+.. testcode::
+
+ import gudhi
+ import numpy as np
+
+ diag1 = np.array([[2.7, 3.7],[9.6, 14.],[34.2, 34.974]])
+ diag2 = np.array([[2.8, 4.45],[9.5, 14.1]])
+
+ message = "Wasserstein distance value = " + '%.2f' % gudhi.wasserstein_distance(diag1, diag2, q=2., p=1.)
+ print(message)
+
+The output is:
+
+.. testoutput::
+
+ Wasserstein distance value = 1.45
diff --git a/src/python/gudhi/wasserstein.py b/src/python/gudhi/wasserstein.py
new file mode 100644
index 00000000..eba7c6d5
--- /dev/null
+++ b/src/python/gudhi/wasserstein.py
@@ -0,0 +1,99 @@
+import numpy as np
+import scipy.spatial.distance as sc
+try:
+ import ot
+except ImportError:
+ print("POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT")
+
+""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+ See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+ Author(s): Theo Lacombe
+
+ Copyright (C) 2019 Inria
+
+ Modification(s):
+ - YYYY/MM Author: Description of the modification
+"""
+
+def _proj_on_diag(X):
+ '''
+ :param X: (n x 2) array encoding the points of a persistent diagram.
+ :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal
+ '''
+ Z = (X[:,0] + X[:,1]) / 2.
+ return np.array([Z , Z]).T
+
+
+def _build_dist_matrix(X, Y, p=2., q=2.):
+ '''
+ :param X: (n x 2) numpy.array encoding the (points of the) first diagram.
+ :param Y: (m x 2) numpy.array encoding the second diagram.
+ :param q: Ground metric (i.e. norm l_q).
+ :param p: exponent for the Wasserstein metric.
+ :returns: (n+1) x (m+1) np.array encoding the cost matrix C.
+ For 1 <= i <= n, 1 <= j <= m, C[i,j] encodes the distance between X[i] and Y[j], while C[i, m+1] (resp. C[n+1, j]) encodes the distance (to the p) between X[i] (resp Y[j]) and its orthogonal proj onto the diagonal.
+ note also that C[n+1, m+1] = 0 (it costs nothing to move from the diagonal to the diagonal).
+ '''
+ Xdiag = _proj_on_diag(X)
+ Ydiag = _proj_on_diag(Y)
+ if np.isinf(q):
+ C = sc.cdist(X,Y, metric='chebyshev')**p
+ Cxd = np.linalg.norm(X - Xdiag, ord=q, axis=1)**p
+ Cdy = np.linalg.norm(Y - Ydiag, ord=q, axis=1)**p
+ else:
+ C = sc.cdist(X,Y, metric='minkowski', p=q)**p
+ Cxd = np.linalg.norm(X - Xdiag, ord=q, axis=1)**p
+ Cdy = np.linalg.norm(Y - Ydiag, ord=q, axis=1)**p
+ Cf = np.hstack((C, Cxd[:,None]))
+ Cdy = np.append(Cdy, 0)
+
+ Cf = np.vstack((Cf, Cdy[None,:]))
+
+ return Cf
+
+
+def _perstot(X, p, q):
+ '''
+ :param X: (n x 2) numpy.array (points of a given diagram).
+ :param q: Ground metric on the (upper-half) plane (i.e. norm l_q in R^2); Default value is 2 (Euclidean norm).
+ :param p: exponent for Wasserstein; Default value is 2.
+ :returns: float, the total persistence of the diagram (that is, its distance to the empty diagram).
+ '''
+ Xdiag = _proj_on_diag(X)
+ return (np.sum(np.linalg.norm(X - Xdiag, ord=q, axis=1)**p))**(1./p)
+
+
+def wasserstein_distance(X, Y, p=2., q=2.):
+ '''
+ :param X: (n x 2) numpy.array encoding the (finite points of the) first diagram. Must not contain essential points (i.e. with infinite coordinate).
+ :param Y: (m x 2) numpy.array encoding the second diagram.
+ :param q: Ground metric on the (upper-half) plane (i.e. norm l_q in R^2); Default value is 2 (euclidean norm).
+ :param p: exponent for Wasserstein; Default value is 2.
+ :returns: the p-Wasserstein distance (1 <= p < infinity) with respect to the q-norm as ground metric.
+ :rtype: float
+ '''
+ n = len(X)
+ m = len(Y)
+
+ # handle empty diagrams
+ if X.size == 0:
+ if Y.size == 0:
+ return 0.
+ else:
+ return _perstot(Y, p, q)
+ elif Y.size == 0:
+ return _perstot(X, p, q)
+
+ M = _build_dist_matrix(X, Y, p=p, q=q)
+ a = np.full(n+1, 1. / (n + m) ) # weight vector of the input diagram. Uniform here.
+ a[-1] = a[-1] * m # normalized so that we have a probability measure, required by POT
+ b = np.full(m+1, 1. / (n + m) ) # weight vector of the input diagram. Uniform here.
+ b[-1] = b[-1] * n # so that we have a probability measure, required by POT
+
+ # Comptuation of the otcost using the ot.emd2 library.
+ # Note: it is the squared Wasserstein distance.
+ # The default numItermax=100000 is not sufficient for some examples with 5000 points, what is a good value?
+ ot_cost = (n+m) * ot.emd2(a, b, M, numItermax=2000000)
+
+ return ot_cost ** (1./p)
+
diff --git a/src/python/test/test_wasserstein_distance.py b/src/python/test/test_wasserstein_distance.py
new file mode 100755
index 00000000..c1b568e2
--- /dev/null
+++ b/src/python/test/test_wasserstein_distance.py
@@ -0,0 +1,50 @@
+import gudhi
+import numpy as np
+
+""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+ See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+ Author(s): Theo Lacombe
+
+ Copyright (C) 2019 Inria
+
+ Modification(s):
+ - YYYY/MM Author: Description of the modification
+"""
+
+__author__ = "Theo Lacombe"
+__copyright__ = "Copyright (C) 2019 Inria"
+__license__ = "MIT"
+
+
+def test_basic_wasserstein():
+ diag1 = np.array([[2.7, 3.7], [9.6, 14.0], [34.2, 34.974]])
+ diag2 = np.array([[2.8, 4.45], [9.5, 14.1]])
+ diag3 = np.array([[0, 2], [4, 6]])
+ diag4 = np.array([[0, 3], [4, 8]])
+ emptydiag = np.array([[]])
+
+ assert gudhi.wasserstein_distance(emptydiag, emptydiag, q=2., p=1.) == 0.
+ assert gudhi.wasserstein_distance(emptydiag, emptydiag, q=np.inf, p=1.) == 0.
+ assert gudhi.wasserstein_distance(emptydiag, emptydiag, q=np.inf, p=2.) == 0.
+ assert gudhi.wasserstein_distance(emptydiag, emptydiag, q=2., p=2.) == 0.
+
+ assert gudhi.wasserstein_distance(diag3, emptydiag, q=np.inf, p=1.) == 2.
+ assert gudhi.wasserstein_distance(diag3, emptydiag, q=1., p=1.) == 4.
+
+ assert gudhi.wasserstein_distance(diag4, emptydiag, q=1., p=2.) == 5. # thank you Pythagorician triplets
+ assert gudhi.wasserstein_distance(diag4, emptydiag, q=np.inf, p=2.) == 2.5
+ assert gudhi.wasserstein_distance(diag4, emptydiag, q=2., p=2.) == 3.5355339059327378
+
+ assert gudhi.wasserstein_distance(diag1, diag2, q=2., p=1.) == 1.4453593023967701
+ assert gudhi.wasserstein_distance(diag1, diag2, q=2.35, p=1.74) == 0.9772734057168739
+
+ assert gudhi.wasserstein_distance(diag1, emptydiag, q=2.35, p=1.7863) == 3.141592214572228
+
+ assert gudhi.wasserstein_distance(diag3, diag4, q=1., p=1.) == 3.
+ assert gudhi.wasserstein_distance(diag3, diag4, q=np.inf, p=1.) == 3. # no diag matching here
+ assert gudhi.wasserstein_distance(diag3, diag4, q=np.inf, p=2.) == np.sqrt(5)
+ assert gudhi.wasserstein_distance(diag3, diag4, q=1., p=2.) == np.sqrt(5)
+ assert gudhi.wasserstein_distance(diag3, diag4, q=4.5, p=2.) == np.sqrt(5)
+
+
+