From c8c942c43643131a7ef9899826a7095e497150fe Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 22:10:26 +0100 Subject: cmake --- .../modules/GUDHI_third_party_libraries.cmake | 3 + src/python/CMakeLists.txt | 14 ++ src/python/gudhi/point_cloud/dtm.py | 40 +++++ src/python/gudhi/point_cloud/knn.py | 193 +++++++++++++++++++++ src/python/test/test_dtm.py | 32 ++++ 5 files changed, 282 insertions(+) create mode 100644 src/python/gudhi/point_cloud/dtm.py create mode 100644 src/python/gudhi/point_cloud/knn.py create mode 100755 src/python/test/test_dtm.py diff --git a/src/cmake/modules/GUDHI_third_party_libraries.cmake b/src/cmake/modules/GUDHI_third_party_libraries.cmake index 2d010483..c2039674 100644 --- a/src/cmake/modules/GUDHI_third_party_libraries.cmake +++ b/src/cmake/modules/GUDHI_third_party_libraries.cmake @@ -160,6 +160,9 @@ if( PYTHONINTERP_FOUND ) find_python_module("sklearn") find_python_module("ot") find_python_module("pybind11") + find_python_module("torch") + find_python_module("hnswlib") + find_python_module("pykeops") endif() if(NOT GUDHI_PYTHON_PATH) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index f00966a5..d26d3e6e 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -78,6 +78,15 @@ if(PYTHONINTERP_FOUND) if(OT_FOUND) add_gudhi_debug_info("POT version ${OT_VERSION}") endif() + if(HNSWLIB_FOUND) + add_gudhi_debug_info("HNSWlib version ${OT_VERSION}") + endif() + if(TORCH_FOUND) + add_gudhi_debug_info("PyTorch version ${OT_VERSION}") + endif() + if(PYKEOPS_FOUND) + add_gudhi_debug_info("PyKeOps version ${OT_VERSION}") + endif() set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_RESULT_OF_USE_DECLTYPE', ") set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_ALL_NO_LIB', ") @@ -399,6 +408,11 @@ if(PYTHONINTERP_FOUND) # Time Delay add_gudhi_py_test(test_time_delay) + # DTM + if(SCIPY_FOUND AND SKLEARN_FOUND AND TORCH_FOUND AND HNSWLIB_FOUND AND PYKEOPS_FOUND) + add_gudhi_py_test(test_dtm) + endif() + # Documentation generation is available through sphinx - requires all modules if(SPHINX_PATH) if(MATPLOTLIB_FOUND) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py new file mode 100644 index 00000000..08f9ea60 --- /dev/null +++ b/src/python/gudhi/point_cloud/dtm.py @@ -0,0 +1,40 @@ +from .knn import KNN + + +class DTM: + def __init__(self, k, q=2, **kwargs): + """ + Args: + q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. + kwargs: Same parameters as KNN, except that metric="neighbors" means that transform() expects an array with the distances to the k nearest neighbors. + """ + self.k = k + self.q = q + self.params = kwargs + + def fit_transform(self, X, y=None): + return self.fit(X).transform(X) + + def fit(self, X, y=None): + """ + Args: + X (numpy.array): coordinates for mass points + """ + if self.params.setdefault("metric", "euclidean") != "neighbors": + self.knn = KNN(self.k, return_index=False, return_distance=True, **self.params) + self.knn.fit(X) + return self + + def transform(self, X): + """ + Args: + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). + """ + if self.params["metric"] == "neighbors": + distances = X[:, : self.k] + else: + distances = self.knn.transform(X) + distances = distances ** self.q + dtm = distances.sum(-1) / self.k + dtm = dtm ** (1.0 / self.q) + return dtm diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py new file mode 100644 index 00000000..57078f1e --- /dev/null +++ b/src/python/gudhi/point_cloud/knn.py @@ -0,0 +1,193 @@ +import numpy + + +class KNN: + def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs): + """ + Args: + k (int): number of neighbors (including the point itself). + return_index (bool): if True, return the index of each neighbor. + return_distance (bool): if True, return the distance to each neighbor. + implementation (str): Choice of the library that does the real work. + + * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes + large (10+) but the number of points remains low (less than a million). + Only "minkowski" and its aliases are supported. + * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported. + * 'sklearn' for scikit-learn's NearestNeighbors. + Note that this provides in particular an option algorithm="brute". + * 'hnsw' for hnswlib.Index. It is very fast but does not provide guarantees. + Only supports "euclidean" for now. + * None will try to select a sensible one (scipy if possible, scikit-learn otherwise). + metric (str): see `sklearn.neighbors.NearestNeighbors`. + eps (float): relative error when computing nearest neighbors with the cKDTree. + p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. + n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. + If -1 is given all processors are used. Default: 1. + + Additional parameters are forwarded to the backends. + """ + self.k = k + self.return_index = return_index + self.return_distance = return_distance + self.metric = metric + self.params = kwargs + # canonicalize + if metric == "euclidean": + self.params["p"] = 2 + self.metric = "minkowski" + elif metric == "manhattan": + self.params["p"] = 1 + self.metric = "minkowski" + elif metric == "chebyshev": + self.params["p"] = numpy.inf + self.metric = "minkowski" + elif metric == "minkowski": + self.params["p"] = kwargs.get("p", 2) + if self.params.get("implementation") in {"keops", "ckdtree"}: + assert self.metric == "minkowski" + if self.params.get("implementation") == "hnsw": + assert self.metric == "minkowski" and self.params["p"] == 2 + if not self.params.get("implementation"): + if self.metric == "minkowski": + self.params["implementation"] = "ckdtree" + else: + self.params["implementation"] = "sklearn" + + def fit_transform(self, X, y=None): + return self.fit(X).transform(X) + + def fit(self, X, y=None): + """ + Args: + X (numpy.array): coordinates for reference points + """ + self.ref_points = X + if self.params.get("implementation") == "ckdtree": + # sklearn could handle this, but it is much slower + from scipy.spatial import cKDTree + self.kdtree = cKDTree(X) + + if self.params.get("implementation") == "sklearn" and self.metric != "precomputed": + # FIXME: sklearn badly handles "precomputed" + from sklearn.neighbors import NearestNeighbors + + nargs = {k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}} + self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs) + self.nn.fit(X) + + if self.params.get("implementation") == "hnsw": + import hnswlib + self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances + self.graph.init_index(len(X), **{k:v for k,v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}) + n = self.params.get("num_threads") + if n is None: + n = self.params.get("n_jobs", 1) + self.params["num_threads"] = n + self.graph.add_items(X, num_threads=n) + + return self + + def transform(self, X): + """ + Args: + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed" + """ + metric = self.metric + k = self.k + + if metric == "precomputed": + # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural. + X = numpy.array(X) + if self.return_index: + neighbors = numpy.argpartition(X, k - 1)[:, 0:k] + distances = numpy.take_along_axis(X, neighbors, axis=-1) + ngb_order = numpy.argsort(distances, axis=-1) + neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + if self.return_distance: + distances = numpy.take_along_axis(distances, ngb_order, axis=-1) + return neighbors, distances + else: + return neighbors + if self.return_distance: + distances = numpy.partition(X, k - 1)[:, 0:k] + # partition is not guaranteed to sort the lower half, although it often does + distances.sort(axis=-1) + return distances + return None + + if self.params.get("implementation") == "hnsw": + ef = self.params.get("ef") + if ef is not None: + self.graph.set_ef(ef) + neighbors, distances = self.graph.knn_query(X, k, num_threads=self.params["num_threads"]) + # The k nearest neighbors are always sorted. I couldn't find it in the doc, but the code calls searchKnn, + # which returns a priority_queue, and then fills the return array backwards with top/pop on the queue. + if self.return_index: + if self.return_distance: + return neighbors, numpy.sqrt(distances) + else: + return neighbors + if self.return_distance: + return numpy.sqrt(distances) + return None + + if self.params.get("implementation") == "keops": + import torch + from pykeops.torch import LazyTensor + + # 'float64' is slow except on super expensive GPUs. Allow it with some param? + XX = torch.tensor(X, dtype=torch.float32) + if X is self.ref_points: + YY = XX + else: + YY = torch.tensor(self.ref_points, dtype=torch.float32) + + p = self.params["p"] + if p == numpy.inf: + # Requires a version of pykeops strictly more recent than 1.3 + mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1) + elif p == 2: # Any even integer? + mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1) + else: + mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1) + + if self.return_index: + if self.return_distance: + distances, neighbors = mat.Kmin_argKmin(k, dim=1) + if p != numpy.inf: + distances = distances ** (1.0 / p) + return neighbors, distances + else: + neighbors = mat.argKmin(k, dim=1) + return neighbors + if self.return_distance: + distances = mat.Kmin(k, dim=1) + if p != numpy.inf: + distances = distances ** (1.0 / p) + return distances + return None + # FIXME: convert everything back to numpy arrays or not? + + if hasattr(self, "kdtree"): + qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}} + distances, neighbors = self.kdtree.query(X, k=self.k, **qargs) + if self.return_index: + if self.return_distance: + return neighbors, distances + else: + return neighbors + if self.return_distance: + return distances + return None + + if self.return_distance: + distances, neighbors = self.nn.kneighbors(X, return_distance=True) + if self.return_index: + return neighbors, distances + else: + return distances + if self.return_index: + neighbors = self.nn.kneighbors(X, return_distance=False) + return neighbors + return None diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py new file mode 100755 index 00000000..57fdd131 --- /dev/null +++ b/src/python/test/test_dtm.py @@ -0,0 +1,32 @@ +""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. + Author(s): Marc Glisse + + Copyright (C) 2020 Inria + + Modification(s): + - YYYY/MM Author: Description of the modification +""" + +from gudhi.point_cloud.dtm import DTM +import numpy + + +def test_dtm_euclidean(): + pts = numpy.random.rand(1000,4) + k = 3 + dtm = DTM(k,implementation="ckdtree") + print(dtm.fit_transform(pts)) + dtm = DTM(k,implementation="sklearn") + print(dtm.fit_transform(pts)) + dtm = DTM(k,implementation="sklearn",algorithm="brute") + print(dtm.fit_transform(pts)) + dtm = DTM(k,implementation="hnsw") + print(dtm.fit_transform(pts)) + from scipy.spatial.distance import cdist + d = cdist(pts,pts) + dtm = DTM(k,metric="precomputed") + print(dtm.fit_transform(d)) + dtm = DTM(k,implementation="keops") + print(dtm.fit_transform(pts)) + -- cgit v1.2.3 From 5c4c398b99fe1b157d64cd43a4977ce1504ca795 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 22:25:28 +0100 Subject: HNSWlib doesn't define __version__ --- src/cmake/modules/GUDHI_third_party_libraries.cmake | 21 ++++++++++++++++++++- src/python/CMakeLists.txt | 7 ++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/cmake/modules/GUDHI_third_party_libraries.cmake b/src/cmake/modules/GUDHI_third_party_libraries.cmake index c2039674..a931b3a1 100644 --- a/src/cmake/modules/GUDHI_third_party_libraries.cmake +++ b/src/cmake/modules/GUDHI_third_party_libraries.cmake @@ -150,6 +150,25 @@ function( find_python_module PYTHON_MODULE_NAME ) endif() endfunction( find_python_module ) +# For modules that do not define module.__version__ +function( find_python_module_no_version PYTHON_MODULE_NAME ) + string(TOUPPER ${PYTHON_MODULE_NAME} PYTHON_MODULE_NAME_UP) + execute_process( + COMMAND ${PYTHON_EXECUTABLE} -c "import ${PYTHON_MODULE_NAME}" + RESULT_VARIABLE PYTHON_MODULE_RESULT + ERROR_VARIABLE PYTHON_MODULE_ERROR) + if(PYTHON_MODULE_RESULT EQUAL 0) + # Remove carriage return + message ("++ Python module ${PYTHON_MODULE_NAME} found") + set(${PYTHON_MODULE_NAME_UP}_FOUND TRUE PARENT_SCOPE) + else() + message ("PYTHON_MODULE_NAME = ${PYTHON_MODULE_NAME} + - PYTHON_MODULE_RESULT = ${PYTHON_MODULE_RESULT} + - PYTHON_MODULE_ERROR = ${PYTHON_MODULE_ERROR}") + set(${PYTHON_MODULE_NAME_UP}_FOUND FALSE PARENT_SCOPE) + endif() +endfunction( find_python_module_no_version ) + if( PYTHONINTERP_FOUND ) find_python_module("cython") find_python_module("pytest") @@ -161,8 +180,8 @@ if( PYTHONINTERP_FOUND ) find_python_module("ot") find_python_module("pybind11") find_python_module("torch") - find_python_module("hnswlib") find_python_module("pykeops") + find_python_module_no_version("hnswlib") endif() if(NOT GUDHI_PYTHON_PATH) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index d26d3e6e..ec0ab1ca 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -79,13 +79,14 @@ if(PYTHONINTERP_FOUND) add_gudhi_debug_info("POT version ${OT_VERSION}") endif() if(HNSWLIB_FOUND) - add_gudhi_debug_info("HNSWlib version ${OT_VERSION}") + # Does not have a version number... + add_gudhi_debug_info("HNSWlib found") endif() if(TORCH_FOUND) - add_gudhi_debug_info("PyTorch version ${OT_VERSION}") + add_gudhi_debug_info("PyTorch version ${TORCH_VERSION}") endif() if(PYKEOPS_FOUND) - add_gudhi_debug_info("PyKeOps version ${OT_VERSION}") + add_gudhi_debug_info("PyKeOps version ${PYKEOPS_VERSION}") endif() set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_RESULT_OF_USE_DECLTYPE', ") -- cgit v1.2.3 From 7ddad8220fdd34fd3ed91e16882feaa3961b2d67 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 22:59:20 +0100 Subject: license --- src/python/gudhi/point_cloud/dtm.py | 9 +++++++++ src/python/gudhi/point_cloud/knn.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 08f9ea60..839e7452 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -1,3 +1,12 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Marc Glisse +# +# Copyright (C) 2020 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + from .knn import KNN diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 57078f1e..943d4e9f 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -1,3 +1,12 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Marc Glisse +# +# Copyright (C) 2020 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + import numpy -- cgit v1.2.3 From 7120b186471828a9570fdeef37900bd8b98d0d31 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 23:06:06 +0100 Subject: license --- src/python/doc/point_cloud_sum.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/doc/point_cloud_sum.inc b/src/python/doc/point_cloud_sum.inc index 0a159680..ecc18951 100644 --- a/src/python/doc/point_cloud_sum.inc +++ b/src/python/doc/point_cloud_sum.inc @@ -6,7 +6,7 @@ | | :math:`(y_1, y_2, \ldots, y_d)` | | | | | | :Since: GUDHI 2.0.0 | | | | | - | | | :License: MIT (`GPL v3 `_) | + | | | :License: MIT (`GPL v3 `_, BSD-3-Clause, Apache-2.0) | | | Parts of this package require CGAL. | | | | | :Requires: `Eigen `__ :math:`\geq` 3.1.0 and `CGAL `__ :math:`\geq` 4.11.0 | | | | | -- cgit v1.2.3 From af35ea5b4ce631ae826f1db1940798f254aba658 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 23:39:59 +0100 Subject: clean-up use of "implementation" --- src/python/gudhi/point_cloud/knn.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 943d4e9f..a4ea3acd 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -72,12 +72,12 @@ class KNN: X (numpy.array): coordinates for reference points """ self.ref_points = X - if self.params.get("implementation") == "ckdtree": + if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree self.kdtree = cKDTree(X) - if self.params.get("implementation") == "sklearn" and self.metric != "precomputed": + if self.params["implementation"] == "sklearn" and self.metric != "precomputed": # FIXME: sklearn badly handles "precomputed" from sklearn.neighbors import NearestNeighbors @@ -85,7 +85,7 @@ class KNN: self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs) self.nn.fit(X) - if self.params.get("implementation") == "hnsw": + if self.params["implementation"] == "hnsw": import hnswlib self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances self.graph.init_index(len(X), **{k:v for k,v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}) @@ -125,7 +125,7 @@ class KNN: return distances return None - if self.params.get("implementation") == "hnsw": + if self.params["implementation"] == "hnsw": ef = self.params.get("ef") if ef is not None: self.graph.set_ef(ef) @@ -141,7 +141,7 @@ class KNN: return numpy.sqrt(distances) return None - if self.params.get("implementation") == "keops": + if self.params["implementation"] == "keops": import torch from pykeops.torch import LazyTensor @@ -178,7 +178,7 @@ class KNN: return None # FIXME: convert everything back to numpy arrays or not? - if hasattr(self, "kdtree"): + if self.params["implementation"] == "ckdtree": qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}} distances, neighbors = self.kdtree.query(X, k=self.k, **qargs) if self.return_index: @@ -190,6 +190,7 @@ class KNN: return distances return None + assert self.params["implementation"] == "sklearn" if self.return_distance: distances, neighbors = self.nn.kneighbors(X, return_distance=True) if self.return_index: -- cgit v1.2.3 From f74c71ca8e474ff927cae029ea63329d30293582 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Fri, 27 Mar 2020 13:43:58 +0100 Subject: Improve coverage --- src/python/gudhi/point_cloud/dtm.py | 2 ++ src/python/test/test_dtm.py | 48 +++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 839e7452..541b74a6 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -30,6 +30,8 @@ class DTM: X (numpy.array): coordinates for mass points """ if self.params.setdefault("metric", "euclidean") != "neighbors": + # KNN gives sorted distances, which is unnecessary here. + # Maybe add a parameter to say we don't need sorting? self.knn = KNN(self.k, return_index=False, return_distance=True, **self.params) self.knn.fit(X) return self diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 57fdd131..841f8c3c 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -10,23 +10,41 @@ from gudhi.point_cloud.dtm import DTM import numpy +import pytest -def test_dtm_euclidean(): - pts = numpy.random.rand(1000,4) +def test_dtm_compare_euclidean(): + pts = numpy.random.rand(1000, 4) k = 3 - dtm = DTM(k,implementation="ckdtree") - print(dtm.fit_transform(pts)) - dtm = DTM(k,implementation="sklearn") - print(dtm.fit_transform(pts)) - dtm = DTM(k,implementation="sklearn",algorithm="brute") - print(dtm.fit_transform(pts)) - dtm = DTM(k,implementation="hnsw") - print(dtm.fit_transform(pts)) + dtm = DTM(k, implementation="ckdtree") + r0 = dtm.fit_transform(pts) + dtm = DTM(k, implementation="sklearn") + r1 = dtm.fit_transform(pts) + assert r1 == pytest.approx(r0) + dtm = DTM(k, implementation="sklearn", algorithm="brute") + r2 = dtm.fit_transform(pts) + assert r2 == pytest.approx(r0) + dtm = DTM(k, implementation="hnsw") + r3 = dtm.fit_transform(pts) + assert r3 == pytest.approx(r0) from scipy.spatial.distance import cdist - d = cdist(pts,pts) - dtm = DTM(k,metric="precomputed") - print(dtm.fit_transform(d)) - dtm = DTM(k,implementation="keops") - print(dtm.fit_transform(pts)) + d = cdist(pts, pts) + dtm = DTM(k, metric="precomputed") + r4 = dtm.fit_transform(d) + assert r4 == pytest.approx(r0) + dtm = DTM(k, implementation="keops") + r5 = dtm.fit_transform(pts) + assert r5 == pytest.approx(r0) + + +def test_dtm_precomputed(): + dist = numpy.array([[1.0, 3, 8], [1, 5, 5], [0, 2, 3]]) + dtm = DTM(2, q=1, metric="neighbors") + r = dtm.fit_transform(dist) + assert r == pytest.approx([2.0, 3, 1]) + + dist = numpy.array([[2.0, 2], [0, 1], [3, 4]]) + dtm = DTM(2, q=2, metric="neighbors") + r = dtm.fit_transform(dist) + assert r == pytest.approx([2.0, .707, 3.5355], rel=.01) -- cgit v1.2.3 From 03376ffe0f6060864ee8908893297f8800b7b8d1 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Fri, 27 Mar 2020 20:27:10 +0100 Subject: doc --- src/python/doc/point_cloud.rst | 17 +++++++++++++++-- src/python/gudhi/point_cloud/dtm.py | 6 +++++- src/python/gudhi/point_cloud/knn.py | 31 ++++++++++++++++++------------- src/python/test/test_dtm.py | 2 +- 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/python/doc/point_cloud.rst b/src/python/doc/point_cloud.rst index c0d4b303..351b0786 100644 --- a/src/python/doc/point_cloud.rst +++ b/src/python/doc/point_cloud.rst @@ -21,10 +21,23 @@ Subsampling :special-members: :show-inheritance: -TimeDelayEmbedding ------------------- +Time Delay Embedding +-------------------- .. autoclass:: gudhi.point_cloud.timedelay.TimeDelayEmbedding :members: :special-members: __call__ +Nearest neighbors +----------------- + +.. automodule:: gudhi.point_cloud.knn + :members: + :special-members: __init__ + +Distance to measure +------------------- + +.. automodule:: gudhi.point_cloud.dtm + :members: + :special-members: __init__ diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 541b74a6..e4096c5e 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -11,11 +11,15 @@ from .knn import KNN class DTM: + """ + Class to compute the distance to the empirical measure defined by a point set. + """ + def __init__(self, k, q=2, **kwargs): """ Args: q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. - kwargs: Same parameters as KNN, except that metric="neighbors" means that transform() expects an array with the distances to the k nearest neighbors. + kwargs: Same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k self.q = q diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index a4ea3acd..02448530 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -11,6 +11,10 @@ import numpy class KNN: + """ + Class wrapping several implementations for computing the k nearest neighbors in a point set. + """ + def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs): """ Args: @@ -19,22 +23,17 @@ class KNN: return_distance (bool): if True, return the distance to each neighbor. implementation (str): Choice of the library that does the real work. - * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes - large (10+) but the number of points remains low (less than a million). - Only "minkowski" and its aliases are supported. + * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported. * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported. - * 'sklearn' for scikit-learn's NearestNeighbors. - Note that this provides in particular an option algorithm="brute". - * 'hnsw' for hnswlib.Index. It is very fast but does not provide guarantees. - Only supports "euclidean" for now. + * 'sklearn' for scikit-learn's NearestNeighbors. Note that this provides in particular an option algorithm="brute". + * 'hnsw' for hnswlib.Index. It can be very fast but does not provide guarantees. Only supports "euclidean" for now. * None will try to select a sensible one (scipy if possible, scikit-learn otherwise). metric (str): see `sklearn.neighbors.NearestNeighbors`. eps (float): relative error when computing nearest neighbors with the cKDTree. p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. - - Additional parameters are forwarded to the backends. + kwargs: additional parameters are forwarded to the backends. """ self.k = k self.return_index = return_index @@ -75,20 +74,26 @@ class KNN: if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree + self.kdtree = cKDTree(X) if self.params["implementation"] == "sklearn" and self.metric != "precomputed": # FIXME: sklearn badly handles "precomputed" from sklearn.neighbors import NearestNeighbors - nargs = {k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}} + nargs = { + k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"} + } self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs) self.nn.fit(X) if self.params["implementation"] == "hnsw": import hnswlib - self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances - self.graph.init_index(len(X), **{k:v for k,v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}) + + self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances + self.graph.init_index( + len(X), **{k: v for k, v in self.params.items() if k in {"ef_construction", "M", "random_seed"}} + ) n = self.params.get("num_threads") if n is None: n = self.params.get("n_jobs", 1) @@ -154,7 +159,7 @@ class KNN: p = self.params["p"] if p == numpy.inf: - # Requires a version of pykeops strictly more recent than 1.3 + # Requires pykeops 1.4 or later mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1) elif p == 2: # Any even integer? mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1) diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 841f8c3c..93b13e1a 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -47,4 +47,4 @@ def test_dtm_precomputed(): dist = numpy.array([[2.0, 2], [0, 1], [3, 4]]) dtm = DTM(2, q=2, metric="neighbors") r = dtm.fit_transform(dist) - assert r == pytest.approx([2.0, .707, 3.5355], rel=.01) + assert r == pytest.approx([2.0, 0.707, 3.5355], rel=0.01) -- cgit v1.2.3 From 68839b95e7751afd04155cd2565cc53362f01fa2 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 10:41:50 +0100 Subject: Missing test --- src/python/CMakeLists.txt | 1 + src/python/test/test_knn.py | 82 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100755 src/python/test/test_knn.py diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index ec0ab1ca..d7a6a4db 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -411,6 +411,7 @@ if(PYTHONINTERP_FOUND) # DTM if(SCIPY_FOUND AND SKLEARN_FOUND AND TORCH_FOUND AND HNSWLIB_FOUND AND PYKEOPS_FOUND) + add_gudhi_py_test(test_knn) add_gudhi_py_test(test_dtm) endif() diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py new file mode 100755 index 00000000..e455fb48 --- /dev/null +++ b/src/python/test/test_knn.py @@ -0,0 +1,82 @@ +""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. + Author(s): Marc Glisse + + Copyright (C) 2020 Inria + + Modification(s): + - YYYY/MM Author: Description of the modification +""" + +from gudhi.point_cloud.knn import KNN +import numpy as np +import pytest + + +def test_knn_explicit(): + base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]]) + query = np.array([[1.0, 1], [2, 2], [4, 4]]) + knn = KNN(2, metric="manhattan", return_distance=True, return_index=True) + knn.fit(base) + r = knn.transform(query) + assert r[0] == pytest.approx(np.array([[0, 1], [1, 0], [3, 2]])) + assert r[1] == pytest.approx(np.array([[0.0, 1], [1, 2], [1, 2]])) + + knn = KNN(2, metric="chebyshev", return_distance=True, return_index=False) + knn.fit(base) + r = knn.transform(query) + assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) + r = ( + KNN(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops") + .fit(base) + .transform(query) + ) + assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) + + knn = KNN(2, metric="minkowski", p=3, return_distance=False, return_index=True) + knn.fit(base) + r = knn.transform(query) + assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]]) + r = ( + KNN(2, metric="minkowski", p=3, return_distance=False, return_index=True, implementation="keops") + .fit(base) + .transform(query) + ) + assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]]) + + dist = np.array([[0.0, 3, 8], [1, 0, 5], [1, 2, 0]]) + knn = KNN(2, metric="precomputed", return_index=True, return_distance=False) + r = knn.fit_transform(dist) + assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]]) + knn = KNN(2, metric="precomputed", return_index=True, return_distance=True) + r = knn.fit_transform(dist) + assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) + assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) + + +def test_knn_compare(): + base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]]) + query = np.array([[1.0, 1], [2, 2], [4, 4]]) + r0 = KNN(2, implementation="ckdtree", return_index=True, return_distance=False).fit(base).transform(query) + r1 = KNN(2, implementation="sklearn", return_index=True, return_distance=False).fit(base).transform(query) + r2 = KNN(2, implementation="hnsw", return_index=True, return_distance=False).fit(base).transform(query) + r3 = KNN(2, implementation="keops", return_index=True, return_distance=False).fit(base).transform(query) + assert np.array_equal(r0, r1) and np.array_equal(r0, r2) and np.array_equal(r0, r3) + + r0 = KNN(2, implementation="ckdtree", return_index=True, return_distance=True).fit(base).transform(query) + r1 = KNN(2, implementation="sklearn", return_index=True, return_distance=True).fit(base).transform(query) + r2 = KNN(2, implementation="hnsw", return_index=True, return_distance=True).fit(base).transform(query) + r3 = KNN(2, implementation="keops", return_index=True, return_distance=True).fit(base).transform(query) + assert np.array_equal(r0[0], r1[0]) and np.array_equal(r0[0], r2[0]) and np.array_equal(r0[0], r3[0]) + d0 = pytest.approx(r0[1]) + assert r1[1] == d0 and r2[1] == d0 and r3[1] == d0 + + +def test_knn_nop(): + # This doesn't look super useful... + p = np.array([[0.0]]) + assert None is KNN(k=1, return_index=False, return_distance=False, implementation="sklearn").fit_transform(p) + assert None is KNN(k=1, return_index=False, return_distance=False, implementation="ckdtree").fit_transform(p) + assert None is KNN(k=1, return_index=False, return_distance=False, implementation="hnsw", ef=5).fit_transform(p) + assert None is KNN(k=1, return_index=False, return_distance=False, implementation="keops").fit_transform(p) + assert None is KNN(k=1, return_index=False, return_distance=False, metric="precomputed").fit_transform(p) -- cgit v1.2.3 From 35a12b553c85af8ce31629b90a27a7071b0cc379 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 11:48:43 +0100 Subject: Doc tweaks, default DTM exponent --- src/python/doc/point_cloud.rst | 6 ++++-- src/python/doc/point_cloud_sum.inc | 4 ++-- src/python/gudhi/point_cloud/dtm.py | 17 ++++++++++++----- src/python/gudhi/point_cloud/knn.py | 6 +++--- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/python/doc/point_cloud.rst b/src/python/doc/point_cloud.rst index 351b0786..192f70db 100644 --- a/src/python/doc/point_cloud.rst +++ b/src/python/doc/point_cloud.rst @@ -28,11 +28,12 @@ Time Delay Embedding :members: :special-members: __call__ -Nearest neighbors ------------------ +K nearest neighbors +------------------- .. automodule:: gudhi.point_cloud.knn :members: + :undoc-members: :special-members: __init__ Distance to measure @@ -40,4 +41,5 @@ Distance to measure .. automodule:: gudhi.point_cloud.dtm :members: + :undoc-members: :special-members: __init__ diff --git a/src/python/doc/point_cloud_sum.inc b/src/python/doc/point_cloud_sum.inc index ecc18951..d4761aba 100644 --- a/src/python/doc/point_cloud_sum.inc +++ b/src/python/doc/point_cloud_sum.inc @@ -2,8 +2,8 @@ :widths: 30 40 30 +----------------------------------------------------------------+------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ - | | :math:`(x_1, x_2, \ldots, x_d)` | Utilities to process point clouds: read from file, subsample, etc. | :Author: Vincent Rouvreau | - | | :math:`(y_1, y_2, \ldots, y_d)` | | | + | | :math:`(x_1, x_2, \ldots, x_d)` | Utilities to process point clouds: read from file, subsample, | :Authors: Vincent Rouvreau, Marc Glisse, Masatoshi Takenouchi | + | | :math:`(y_1, y_2, \ldots, y_d)` | find neighbors, embed time series in higher dimension, etc. | | | | | :Since: GUDHI 2.0.0 | | | | | | | | :License: MIT (`GPL v3 `_, BSD-3-Clause, Apache-2.0) | diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index e4096c5e..520cbea8 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -15,10 +15,11 @@ class DTM: Class to compute the distance to the empirical measure defined by a point set. """ - def __init__(self, k, q=2, **kwargs): + def __init__(self, k, q=None, **kwargs): """ Args: - q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. + k (int): number of neighbors (possibly including the point itself). + q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if metric is "neighbors" or "distance_matrix". kwargs: Same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k @@ -31,7 +32,7 @@ class DTM: def fit(self, X, y=None): """ Args: - X (numpy.array): coordinates for mass points + X (numpy.array): coordinates for mass points. """ if self.params.setdefault("metric", "euclidean") != "neighbors": # KNN gives sorted distances, which is unnecessary here. @@ -45,11 +46,17 @@ class DTM: Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). """ + q = self.q + if q is None: + if self.params["metric"] in {"neighbors", "precomputed"}: + q = 2 + else: + q = len(X[0]) if self.params["metric"] == "neighbors": distances = X[:, : self.k] else: distances = self.knn.transform(X) - distances = distances ** self.q + distances = distances ** q dtm = distances.sum(-1) / self.k - dtm = dtm ** (1.0 / self.q) + dtm = dtm ** (1.0 / q) return dtm diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 02448530..31e4fc9f 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -18,7 +18,7 @@ class KNN: def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs): """ Args: - k (int): number of neighbors (including the point itself). + k (int): number of neighbors (possibly including the point itself). return_index (bool): if True, return the index of each neighbor. return_distance (bool): if True, return the distance to each neighbor. implementation (str): Choice of the library that does the real work. @@ -68,7 +68,7 @@ class KNN: def fit(self, X, y=None): """ Args: - X (numpy.array): coordinates for reference points + X (numpy.array): coordinates for reference points. """ self.ref_points = X if self.params["implementation"] == "ckdtree": @@ -105,7 +105,7 @@ class KNN: def transform(self, X): """ Args: - X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed" + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed". """ metric = self.metric k = self.k -- cgit v1.2.3 From a911f9707d44259a38ae3dbb6fbcec75779fc639 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:17:29 +0100 Subject: doc --- src/python/gudhi/point_cloud/dtm.py | 2 +- src/python/gudhi/point_cloud/knn.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 520cbea8..3ac69f31 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -20,7 +20,7 @@ class DTM: Args: k (int): number of neighbors (possibly including the point itself). q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if metric is "neighbors" or "distance_matrix". - kwargs: Same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k self.q = q diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 31e4fc9f..bb7757f2 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -21,7 +21,7 @@ class KNN: k (int): number of neighbors (possibly including the point itself). return_index (bool): if True, return the index of each neighbor. return_distance (bool): if True, return the distance to each neighbor. - implementation (str): Choice of the library that does the real work. + implementation (str): choice of the library that does the real work. * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported. * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported. @@ -31,7 +31,7 @@ class KNN: metric (str): see `sklearn.neighbors.NearestNeighbors`. eps (float): relative error when computing nearest neighbors with the cKDTree. p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. - n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. + n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. kwargs: additional parameters are forwarded to the backends. """ -- cgit v1.2.3 From 990d54f2f13e116f97c1d0f35cbb751015d863fe Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:20:57 +0100 Subject: Fix test --- src/python/test/test_dtm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 93b13e1a..1d080ab4 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -30,7 +30,7 @@ def test_dtm_compare_euclidean(): from scipy.spatial.distance import cdist d = cdist(pts, pts) - dtm = DTM(k, metric="precomputed") + dtm = DTM(k, q=2, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) dtm = DTM(k, implementation="keops") -- cgit v1.2.3 From 40f4b6fb1fe20c3843b1fd80f99996e6d25c9426 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:26:36 +0100 Subject: Comment --- src/python/gudhi/point_cloud/dtm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 3ac69f31..ba011eaf 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -59,4 +59,6 @@ class DTM: distances = distances ** q dtm = distances.sum(-1) / self.k dtm = dtm ** (1.0 / q) + # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller. + # Add option to skip the final root? return dtm -- cgit v1.2.3 From 7f323484acdeafca93efdd9bdd20ed428f8fb95b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:45:00 +0100 Subject: Optional sort_results --- src/python/gudhi/point_cloud/dtm.py | 4 +--- src/python/gudhi/point_cloud/knn.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index ba011eaf..678524f2 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -35,9 +35,7 @@ class DTM: X (numpy.array): coordinates for mass points. """ if self.params.setdefault("metric", "euclidean") != "neighbors": - # KNN gives sorted distances, which is unnecessary here. - # Maybe add a parameter to say we don't need sorting? - self.knn = KNN(self.k, return_index=False, return_distance=True, **self.params) + self.knn = KNN(self.k, return_index=False, return_distance=True, sort_results=False, **self.params) self.knn.fit(X) return self diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index bb7757f2..8369f1f8 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -33,6 +33,9 @@ class KNN: p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. + sort_results (bool): if True, then distances and indices of each point are + sorted on return, so that the first column contains the closest points. + Otherwise, neighbors are returned in an arbitrary order. Defaults to True. kwargs: additional parameters are forwarded to the backends. """ self.k = k @@ -115,18 +118,22 @@ class KNN: X = numpy.array(X) if self.return_index: neighbors = numpy.argpartition(X, k - 1)[:, 0:k] - distances = numpy.take_along_axis(X, neighbors, axis=-1) - ngb_order = numpy.argsort(distances, axis=-1) - neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + if self.params.get("sort_results", True): + X = numpy.take_along_axis(X, neighbors, axis=-1) + ngb_order = numpy.argsort(X, axis=-1) + neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + else: + ngb_order = neighbors if self.return_distance: - distances = numpy.take_along_axis(distances, ngb_order, axis=-1) + distances = numpy.take_along_axis(X, ngb_order, axis=-1) return neighbors, distances else: return neighbors if self.return_distance: distances = numpy.partition(X, k - 1)[:, 0:k] - # partition is not guaranteed to sort the lower half, although it often does - distances.sort(axis=-1) + if self.params.get("sort_results"): + # partition is not guaranteed to sort the lower half, although it often does + distances.sort(axis=-1) return distances return None -- cgit v1.2.3 From 75286efcf311f0c7c46a7039970d663f60953e14 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:59:01 +0100 Subject: Fix test --- src/python/test/test_dtm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 1d080ab4..33b2f3a2 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -30,7 +30,7 @@ def test_dtm_compare_euclidean(): from scipy.spatial.distance import cdist d = cdist(pts, pts) - dtm = DTM(k, q=2, metric="precomputed") + dtm = DTM(k, q=4, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) dtm = DTM(k, implementation="keops") -- cgit v1.2.3 From dd9457649d0d197bbed6402200e0f2f55655680e Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 15:39:15 +0100 Subject: Default param of 2 for DTM --- src/python/gudhi/point_cloud/dtm.py | 14 ++++---------- src/python/test/test_dtm.py | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 678524f2..c26ba844 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -15,11 +15,11 @@ class DTM: Class to compute the distance to the empirical measure defined by a point set. """ - def __init__(self, k, q=None, **kwargs): + def __init__(self, k, q=2, **kwargs): """ Args: k (int): number of neighbors (possibly including the point itself). - q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if metric is "neighbors" or "distance_matrix". + q (float): order used to compute the distance to measure. Defaults to 2. kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k @@ -44,19 +44,13 @@ class DTM: Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). """ - q = self.q - if q is None: - if self.params["metric"] in {"neighbors", "precomputed"}: - q = 2 - else: - q = len(X[0]) if self.params["metric"] == "neighbors": distances = X[:, : self.k] else: distances = self.knn.transform(X) - distances = distances ** q + distances = distances ** self.q dtm = distances.sum(-1) / self.k - dtm = dtm ** (1.0 / q) + dtm = dtm ** (1.0 / self.q) # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller. # Add option to skip the final root? return dtm diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 33b2f3a2..93b13e1a 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -30,7 +30,7 @@ def test_dtm_compare_euclidean(): from scipy.spatial.distance import cdist d = cdist(pts, pts) - dtm = DTM(k, q=4, metric="precomputed") + dtm = DTM(k, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) dtm = DTM(k, implementation="keops") -- cgit v1.2.3 From 8d06fbeae596a0372bf9a921de7d04cc734eaa3b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 30 Mar 2020 08:14:46 +0200 Subject: Biblio --- biblio/bibliography.bib | 15 +++++++++++++++ src/python/gudhi/point_cloud/dtm.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/biblio/bibliography.bib b/biblio/bibliography.bib index 3bbe7960..f9d43638 100644 --- a/biblio/bibliography.bib +++ b/biblio/bibliography.bib @@ -1192,3 +1192,18 @@ numpages = {11}, location = {Montr\'{e}al, Canada}, series = {NIPS’18} } +@Article{dtm, +author={Chazal, Fr{\'e}d{\'e}ric +and Cohen-Steiner, David +and M{\'e}rigot, Quentin}, +title={Geometric Inference for Probability Measures}, +journal={Foundations of Computational Mathematics}, +year={2011}, +volume={11}, +number={6}, +pages={733-751}, +abstract={Data often comes in the form of a point cloud sampled from an unknown compact subset of Euclidean space. The general goal of geometric inference is then to recover geometric and topological features (e.g., Betti numbers, normals) of this subset from the approximating point cloud data. It appears that the study of distance functions allows one to address many of these questions successfully. However, one of the main limitations of this framework is that it does not cope well with outliers or with background noise. In this paper, we show how to extend the framework of distance functions to overcome this problem. Replacing compact subsets by measures, we introduce a notion of distance function to a probability distribution in Rd. These functions share many properties with classical distance functions, which make them suitable for inference purposes. In particular, by considering appropriate level sets of these distance functions, we show that it is possible to reconstruct offsets of sampled shapes with topological guarantees even in the presence of outliers. Moreover, in settings where empirical measures are considered, these functions can be easily evaluated, making them of particular practical interest.}, +issn={1615-3383}, +doi={10.1007/s10208-011-9098-0}, +url={https://doi.org/10.1007/s10208-011-9098-0} +} diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index c26ba844..23c36b88 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -12,7 +12,7 @@ from .knn import KNN class DTM: """ - Class to compute the distance to the empirical measure defined by a point set. + Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`. """ def __init__(self, k, q=2, **kwargs): -- cgit v1.2.3 From 0a404547afec2e43dd5edf9410ff079d156d691a Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 30 Mar 2020 08:18:38 +0200 Subject: One more ref, to be cited later --- biblio/bibliography.bib | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/biblio/bibliography.bib b/biblio/bibliography.bib index f9d43638..056ccd72 100644 --- a/biblio/bibliography.bib +++ b/biblio/bibliography.bib @@ -1207,3 +1207,15 @@ issn={1615-3383}, doi={10.1007/s10208-011-9098-0}, url={https://doi.org/10.1007/s10208-011-9098-0} } +@article{dtmdensity, +author = "Biau, Gérard and Chazal, Frédéric and Cohen-Steiner, David and Devroye, Luc and Rodríguez, Carlos", +doi = "10.1214/11-EJS606", +fjournal = "Electronic Journal of Statistics", +journal = "Electron. J. Statist.", +pages = "204--237", +publisher = "The Institute of Mathematical Statistics and the Bernoulli Society", +title = "A weighted k-nearest neighbor density estimate for geometric inference", +url = "https://doi.org/10.1214/11-EJS606", +volume = "5", +year = "2011" +} -- cgit v1.2.3 From f9a933862050ca95b3a96d7a8572d62f7f2205a9 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 11 Apr 2020 18:18:14 +0200 Subject: Use longer names --- src/python/gudhi/point_cloud/dtm.py | 10 +++-- src/python/gudhi/point_cloud/knn.py | 2 +- src/python/test/test_dtm.py | 18 ++++----- src/python/test/test_knn.py | 76 +++++++++++++++++++++++++++---------- 4 files changed, 71 insertions(+), 35 deletions(-) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 23c36b88..38368f29 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -7,10 +7,10 @@ # Modification(s): # - YYYY/MM Author: Description of the modification -from .knn import KNN +from .knn import KNearestNeighbors -class DTM: +class DistanceToMeasure: """ Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`. """ @@ -20,7 +20,7 @@ class DTM: Args: k (int): number of neighbors (possibly including the point itself). q (float): order used to compute the distance to measure. Defaults to 2. - kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k self.q = q @@ -35,7 +35,9 @@ class DTM: X (numpy.array): coordinates for mass points. """ if self.params.setdefault("metric", "euclidean") != "neighbors": - self.knn = KNN(self.k, return_index=False, return_distance=True, sort_results=False, **self.params) + self.knn = KNearestNeighbors( + self.k, return_index=False, return_distance=True, sort_results=False, **self.params + ) self.knn.fit(X) return self diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 8369f1f8..6642a3c2 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -10,7 +10,7 @@ import numpy -class KNN: +class KNearestNeighbors: """ Class wrapping several implementations for computing the k nearest neighbors in a point set. """ diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 93b13e1a..37934fdb 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -8,7 +8,7 @@ - YYYY/MM Author: Description of the modification """ -from gudhi.point_cloud.dtm import DTM +from gudhi.point_cloud.dtm import DistanceToMeasure import numpy import pytest @@ -16,35 +16,35 @@ import pytest def test_dtm_compare_euclidean(): pts = numpy.random.rand(1000, 4) k = 3 - dtm = DTM(k, implementation="ckdtree") + dtm = DistanceToMeasure(k, implementation="ckdtree") r0 = dtm.fit_transform(pts) - dtm = DTM(k, implementation="sklearn") + dtm = DistanceToMeasure(k, implementation="sklearn") r1 = dtm.fit_transform(pts) assert r1 == pytest.approx(r0) - dtm = DTM(k, implementation="sklearn", algorithm="brute") + dtm = DistanceToMeasure(k, implementation="sklearn", algorithm="brute") r2 = dtm.fit_transform(pts) assert r2 == pytest.approx(r0) - dtm = DTM(k, implementation="hnsw") + dtm = DistanceToMeasure(k, implementation="hnsw") r3 = dtm.fit_transform(pts) assert r3 == pytest.approx(r0) from scipy.spatial.distance import cdist d = cdist(pts, pts) - dtm = DTM(k, metric="precomputed") + dtm = DistanceToMeasure(k, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) - dtm = DTM(k, implementation="keops") + dtm = DistanceToMeasure(k, implementation="keops") r5 = dtm.fit_transform(pts) assert r5 == pytest.approx(r0) def test_dtm_precomputed(): dist = numpy.array([[1.0, 3, 8], [1, 5, 5], [0, 2, 3]]) - dtm = DTM(2, q=1, metric="neighbors") + dtm = DistanceToMeasure(2, q=1, metric="neighbors") r = dtm.fit_transform(dist) assert r == pytest.approx([2.0, 3, 1]) dist = numpy.array([[2.0, 2], [0, 1], [3, 4]]) - dtm = DTM(2, q=2, metric="neighbors") + dtm = DistanceToMeasure(2, q=2, metric="neighbors") r = dtm.fit_transform(dist) assert r == pytest.approx([2.0, 0.707, 3.5355], rel=0.01) diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py index e455fb48..6aac2006 100755 --- a/src/python/test/test_knn.py +++ b/src/python/test/test_knn.py @@ -8,7 +8,7 @@ - YYYY/MM Author: Description of the modification """ -from gudhi.point_cloud.knn import KNN +from gudhi.point_cloud.knn import KNearestNeighbors import numpy as np import pytest @@ -16,39 +16,39 @@ import pytest def test_knn_explicit(): base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]]) query = np.array([[1.0, 1], [2, 2], [4, 4]]) - knn = KNN(2, metric="manhattan", return_distance=True, return_index=True) + knn = KNearestNeighbors(2, metric="manhattan", return_distance=True, return_index=True) knn.fit(base) r = knn.transform(query) assert r[0] == pytest.approx(np.array([[0, 1], [1, 0], [3, 2]])) assert r[1] == pytest.approx(np.array([[0.0, 1], [1, 2], [1, 2]])) - knn = KNN(2, metric="chebyshev", return_distance=True, return_index=False) + knn = KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False) knn.fit(base) r = knn.transform(query) assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) r = ( - KNN(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops") + KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops") .fit(base) .transform(query) ) assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) - knn = KNN(2, metric="minkowski", p=3, return_distance=False, return_index=True) + knn = KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True) knn.fit(base) r = knn.transform(query) assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]]) r = ( - KNN(2, metric="minkowski", p=3, return_distance=False, return_index=True, implementation="keops") + KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True, implementation="keops") .fit(base) .transform(query) ) assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]]) dist = np.array([[0.0, 3, 8], [1, 0, 5], [1, 2, 0]]) - knn = KNN(2, metric="precomputed", return_index=True, return_distance=False) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False) r = knn.fit_transform(dist) assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]]) - knn = KNN(2, metric="precomputed", return_index=True, return_distance=True) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True) r = knn.fit_transform(dist) assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) @@ -57,16 +57,40 @@ def test_knn_explicit(): def test_knn_compare(): base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]]) query = np.array([[1.0, 1], [2, 2], [4, 4]]) - r0 = KNN(2, implementation="ckdtree", return_index=True, return_distance=False).fit(base).transform(query) - r1 = KNN(2, implementation="sklearn", return_index=True, return_distance=False).fit(base).transform(query) - r2 = KNN(2, implementation="hnsw", return_index=True, return_distance=False).fit(base).transform(query) - r3 = KNN(2, implementation="keops", return_index=True, return_distance=False).fit(base).transform(query) + r0 = ( + KNearestNeighbors(2, implementation="ckdtree", return_index=True, return_distance=False) + .fit(base) + .transform(query) + ) + r1 = ( + KNearestNeighbors(2, implementation="sklearn", return_index=True, return_distance=False) + .fit(base) + .transform(query) + ) + r2 = ( + KNearestNeighbors(2, implementation="hnsw", return_index=True, return_distance=False).fit(base).transform(query) + ) + r3 = ( + KNearestNeighbors(2, implementation="keops", return_index=True, return_distance=False) + .fit(base) + .transform(query) + ) assert np.array_equal(r0, r1) and np.array_equal(r0, r2) and np.array_equal(r0, r3) - r0 = KNN(2, implementation="ckdtree", return_index=True, return_distance=True).fit(base).transform(query) - r1 = KNN(2, implementation="sklearn", return_index=True, return_distance=True).fit(base).transform(query) - r2 = KNN(2, implementation="hnsw", return_index=True, return_distance=True).fit(base).transform(query) - r3 = KNN(2, implementation="keops", return_index=True, return_distance=True).fit(base).transform(query) + r0 = ( + KNearestNeighbors(2, implementation="ckdtree", return_index=True, return_distance=True) + .fit(base) + .transform(query) + ) + r1 = ( + KNearestNeighbors(2, implementation="sklearn", return_index=True, return_distance=True) + .fit(base) + .transform(query) + ) + r2 = KNearestNeighbors(2, implementation="hnsw", return_index=True, return_distance=True).fit(base).transform(query) + r3 = ( + KNearestNeighbors(2, implementation="keops", return_index=True, return_distance=True).fit(base).transform(query) + ) assert np.array_equal(r0[0], r1[0]) and np.array_equal(r0[0], r2[0]) and np.array_equal(r0[0], r3[0]) d0 = pytest.approx(r0[1]) assert r1[1] == d0 and r2[1] == d0 and r3[1] == d0 @@ -75,8 +99,18 @@ def test_knn_compare(): def test_knn_nop(): # This doesn't look super useful... p = np.array([[0.0]]) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="sklearn").fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="ckdtree").fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="hnsw", ef=5).fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="keops").fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, metric="precomputed").fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="sklearn" + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="ckdtree" + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="hnsw", ef=5 + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="keops" + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, metric="precomputed" + ).fit_transform(p) -- cgit v1.2.3 From 83a1bc1fb6124a35d515f4836d2e830f3dbdf0e7 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sun, 12 Apr 2020 21:57:51 +0200 Subject: Parallelize the "precomputed" case of knn It is supposed to be possible to compile numpy with openmp, but it looks like it isn't done in any of the usual packages. It may be possible to refactor that code so there is less redundancy. --- src/python/gudhi/point_cloud/knn.py | 78 +++++++++++++++++++++++++++++-------- src/python/test/test_dtm.py | 3 ++ src/python/test/test_knn.py | 8 ++++ 3 files changed, 73 insertions(+), 16 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 6642a3c2..f6870517 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -115,25 +115,71 @@ class KNearestNeighbors: if metric == "precomputed": # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural. - X = numpy.array(X) if self.return_index: - neighbors = numpy.argpartition(X, k - 1)[:, 0:k] - if self.params.get("sort_results", True): - X = numpy.take_along_axis(X, neighbors, axis=-1) - ngb_order = numpy.argsort(X, axis=-1) - neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + n_jobs = self.params.get("n_jobs", 1) + # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?! + if n_jobs == 1: + neighbors = numpy.argpartition(X, k - 1)[:, 0:k] + if self.params.get("sort_results", True): + X = numpy.take_along_axis(X, neighbors, axis=-1) + ngb_order = numpy.argsort(X, axis=-1) + neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + else: + ngb_order = neighbors + if self.return_distance: + distances = numpy.take_along_axis(X, ngb_order, axis=-1) + return neighbors, distances + else: + return neighbors else: - ngb_order = neighbors - if self.return_distance: - distances = numpy.take_along_axis(X, ngb_order, axis=-1) - return neighbors, distances - else: - return neighbors + from joblib import Parallel, delayed, effective_n_jobs + from sklearn.utils import gen_even_slices + + slices = gen_even_slices(len(X), effective_n_jobs(-1)) + parallel = Parallel(backend="threading", n_jobs=-1) + if self.params.get("sort_results", True): + + def func(M): + neighbors = numpy.argpartition(M, k - 1)[:, 0:k] + Y = numpy.take_along_axis(M, neighbors, axis=-1) + ngb_order = numpy.argsort(Y, axis=-1) + return numpy.take_along_axis(neighbors, ngb_order, axis=-1) + + else: + + def func(M): + return numpy.argpartition(M, k - 1)[:, 0:k] + + neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices)) + if self.return_distance: + distances = numpy.take_along_axis(X, neighbors, axis=-1) + return neighbors, distances + else: + return neighbors if self.return_distance: - distances = numpy.partition(X, k - 1)[:, 0:k] - if self.params.get("sort_results"): - # partition is not guaranteed to sort the lower half, although it often does - distances.sort(axis=-1) + n_jobs = self.params.get("n_jobs", 1) + if n_jobs == 1: + distances = numpy.partition(X, k - 1)[:, 0:k] + if self.params.get("sort_results"): + # partition is not guaranteed to sort the lower half, although it often does + distances.sort(axis=-1) + else: + from joblib import Parallel, delayed, effective_n_jobs + from sklearn.utils import gen_even_slices + + if self.params.get("sort_results"): + + def func(M): + # Not partitioning in place, because we should not modify the user's array? + r = numpy.partition(M, k - 1)[:, 0:k] + r.sort(axis=-1) + return r + + else: + func = lambda M: numpy.partition(M, k - 1)[:, 0:k] + slices = gen_even_slices(len(X), effective_n_jobs(-1)) + parallel = Parallel(backend="threading", n_jobs=-1) + distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices)) return distances return None diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 37934fdb..bc0d3698 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -33,6 +33,9 @@ def test_dtm_compare_euclidean(): dtm = DistanceToMeasure(k, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) + dtm = DistanceToMeasure(k, metric="precomputed", n_jobs=2) + r4b = dtm.fit_transform(d) + assert r4b == pytest.approx(r0) dtm = DistanceToMeasure(k, implementation="keops") r5 = dtm.fit_transform(pts) assert r5 == pytest.approx(r0) diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py index 6aac2006..6269df54 100755 --- a/src/python/test/test_knn.py +++ b/src/python/test/test_knn.py @@ -52,6 +52,14 @@ def test_knn_explicit(): r = knn.fit_transform(dist) assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) + # Second time in parallel + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False, n_jobs=2) + r = knn.fit_transform(dist) + assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]]) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, n_jobs=2) + r = knn.fit_transform(dist) + assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) + assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) def test_knn_compare(): -- cgit v1.2.3 From 280eb9d2323837619db1ae013b929adb9b45013b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 01:09:45 +0200 Subject: enable_autodiff with keops This doesn't seem like the best way to handle it, we may want to handle it like a wrapper that gets the indices from knn (whatever backend) and then computes the distances. --- src/python/gudhi/point_cloud/knn.py | 33 +++++++++++++++++++++++++++++---- src/python/test/test_dtm.py | 8 ++++++++ src/python/test/test_knn.py | 6 ++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index f6870517..79362c09 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -36,6 +36,9 @@ class KNearestNeighbors: sort_results (bool): if True, then distances and indices of each point are sorted on return, so that the first column contains the closest points. Otherwise, neighbors are returned in an arbitrary order. Defaults to True. + enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.array or similar, this instructs + the function to compute distances in a way that works with automatic differentiation. + This is experimental and not supported for all implementations. kwargs: additional parameters are forwarded to the backends. """ self.k = k @@ -202,13 +205,18 @@ class KNearestNeighbors: if self.params["implementation"] == "keops": import torch from pykeops.torch import LazyTensor + import eagerpy as ep # 'float64' is slow except on super expensive GPUs. Allow it with some param? - XX = torch.tensor(X, dtype=torch.float32) - if X is self.ref_points: + queries = X + X = ep.astensor(X) + XX = torch.as_tensor(X.numpy(), dtype=torch.float32) + if queries is self.ref_points: + Y = X YY = XX else: - YY = torch.tensor(self.ref_points, dtype=torch.float32) + Y = ep.astensor(self.ref_points) + YY = torch.as_tensor(Y.numpy(), dtype=torch.float32) p = self.params["p"] if p == numpy.inf: @@ -219,6 +227,24 @@ class KNearestNeighbors: else: mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1) + # pykeops does not support autodiff for kmin yet :-( + if self.params.get("enable_autodiff", False) and self.return_distance: + # Compute the indices of the neighbors, and recompute the relevant distances autodiff-friendly. + # Another strategy would be to compute the whole distance matrix with torch.cdist + # and use neighbors as indices into it. + neighbors = ep.astensor(mat.argKmin(k, dim=1)).numpy() + neighbor_pts = Y[neighbors] + diff = neighbor_pts - X[:, None, :] + if p == numpy.inf: + distances = diff.abs().max(-1) + elif p == 2: + distances = (diff ** 2).sum(-1) ** 0.5 + else: + distances = (diff.abs() ** p).sum(-1) ** (1.0 / p) + if self.return_index: + return neighbors.raw, distances.raw + else: + return distances.raw if self.return_index: if self.return_distance: distances, neighbors = mat.Kmin_argKmin(k, dim=1) @@ -234,7 +260,6 @@ class KNearestNeighbors: distances = distances ** (1.0 / p) return distances return None - # FIXME: convert everything back to numpy arrays or not? if self.params["implementation"] == "ckdtree": qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}} diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index bc0d3698..8709dd07 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -11,6 +11,7 @@ from gudhi.point_cloud.dtm import DistanceToMeasure import numpy import pytest +import torch def test_dtm_compare_euclidean(): @@ -39,6 +40,13 @@ def test_dtm_compare_euclidean(): dtm = DistanceToMeasure(k, implementation="keops") r5 = dtm.fit_transform(pts) assert r5 == pytest.approx(r0) + pts2 = torch.tensor(pts, requires_grad=True) + assert pts2.grad is None + dtm = DistanceToMeasure(k, implementation="keops", enable_autodiff=True) + r6 = dtm.fit_transform(pts2) + assert r6.detach().numpy() == pytest.approx(r0) + r6.sum().backward() + assert pts2.grad is not None def test_dtm_precomputed(): diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py index 6269df54..415c9d48 100755 --- a/src/python/test/test_knn.py +++ b/src/python/test/test_knn.py @@ -32,6 +32,12 @@ def test_knn_explicit(): .transform(query) ) assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) + r = ( + KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops", enable_autodiff=True) + .fit(base) + .transform(query) + ) + assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) knn = KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True) knn.fit(base) -- cgit v1.2.3 From 2f1576a23cf4ac055565875d384ca604c0ff6844 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 15:01:51 +0200 Subject: Small autodiff tweaks --- src/python/gudhi/point_cloud/knn.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 79362c09..ab3447d4 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -233,16 +233,17 @@ class KNearestNeighbors: # Another strategy would be to compute the whole distance matrix with torch.cdist # and use neighbors as indices into it. neighbors = ep.astensor(mat.argKmin(k, dim=1)).numpy() - neighbor_pts = Y[neighbors] + # Work around https://github.com/pytorch/pytorch/issues/34452 + neighbor_pts = Y[neighbors,] diff = neighbor_pts - X[:, None, :] if p == numpy.inf: distances = diff.abs().max(-1) elif p == 2: - distances = (diff ** 2).sum(-1) ** 0.5 + distances = (diff ** 2).sum(-1).sqrt() else: distances = (diff.abs() ** p).sum(-1) ** (1.0 / p) if self.return_index: - return neighbors.raw, distances.raw + return neighbors, distances.raw else: return distances.raw if self.return_index: -- cgit v1.2.3 From 3a86402b733a48d9c25a4995325e72c7438c06c0 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 15:21:06 +0200 Subject: Fix NaN gradient with pytorch --- src/python/gudhi/point_cloud/knn.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index ab3447d4..185a7764 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -236,12 +236,11 @@ class KNearestNeighbors: # Work around https://github.com/pytorch/pytorch/issues/34452 neighbor_pts = Y[neighbors,] diff = neighbor_pts - X[:, None, :] - if p == numpy.inf: - distances = diff.abs().max(-1) - elif p == 2: - distances = (diff ** 2).sum(-1).sqrt() + if isinstance(diff, ep.PyTorchTensor): + # https://github.com/jonasrauber/eagerpy/issues/6 + distances = ep.astensor(diff.raw.norm(p, -1)) else: - distances = (diff.abs() ** p).sum(-1) ** (1.0 / p) + distances = diff.norms.lp(p, -1) if self.return_index: return neighbors, distances.raw else: -- cgit v1.2.3 From 3afce326428dddd638e22ab37ee4b2afe52eba75 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 20:32:39 +0200 Subject: Generalize enable_autodiff to more implementations Still limited to L^p --- src/python/gudhi/point_cloud/knn.py | 76 +++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 185a7764..87b2798e 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -9,6 +9,7 @@ import numpy +# TODO: https://github.com/facebookresearch/faiss class KNearestNeighbors: """ @@ -67,6 +68,8 @@ class KNearestNeighbors: self.params["implementation"] = "ckdtree" else: self.params["implementation"] = "sklearn" + if not return_distance: + self.params["enable_autodiff"] = False def fit_transform(self, X, y=None): return self.fit(X).transform(X) @@ -77,6 +80,10 @@ class KNearestNeighbors: X (numpy.array): coordinates for reference points. """ self.ref_points = X + if self.params.get("enable_autodiff", False): + import eagerpy as ep + if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor): + X = ep.astensor(X).numpy() if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree @@ -113,6 +120,41 @@ class KNearestNeighbors: Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed". """ + if self.params.get("enable_autodiff", False): + # pykeops does not support autodiff for kmin yet, but when it does in the future, + # we may want a special path. + import eagerpy as ep + save_return_index = self.return_index + self.return_index = True + self.return_distance = False + self.params["enable_autodiff"] = False + try: + # FIXME: how do we test "X is ref_points" then? + newX = ep.astensor(X) + if self.params["implementation"] != "keops" or not isinstance(newX, ep.PyTorchTensor): + newX = newX.numpy() + neighbors = self.transform(newX) + finally: + self.return_index = save_return_index + self.return_distance = True + self.params["enable_autodiff"] = True + # We can implement more later as needed + assert self.metric == "minkowski" + p = self.params["p"] + Y = ep.astensor(self.ref_points) + neighbor_pts = Y[neighbors,] + diff = neighbor_pts - X[:, None, :] + if isinstance(diff, ep.PyTorchTensor): + # https://github.com/jonasrauber/eagerpy/issues/6 + distances = ep.astensor(diff.raw.norm(p, -1)) + else: + distances = diff.norms.lp(p, -1) + if self.return_index: + return neighbors, distances.raw + else: + return distances.raw + + metric = self.metric k = self.k @@ -207,16 +249,26 @@ class KNearestNeighbors: from pykeops.torch import LazyTensor import eagerpy as ep - # 'float64' is slow except on super expensive GPUs. Allow it with some param? queries = X X = ep.astensor(X) - XX = torch.as_tensor(X.numpy(), dtype=torch.float32) + if isinstance(X, ep.PyTorchTensor): + XX = X.raw + else: + # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch + # without copying to/from the CPU. + XX = X.numpy() + # 'float64' is slow except on super expensive GPUs. Allow it with some param? + XX = torch.as_tensor(XX, dtype=torch.float32) if queries is self.ref_points: Y = X YY = XX else: Y = ep.astensor(self.ref_points) - YY = torch.as_tensor(Y.numpy(), dtype=torch.float32) + if isinstance(Y, ep.PyTorchTensor): + YY = Y.raw + else: + YY = Y.numpy() + YY = torch.as_tensor(YY, dtype=torch.float32) p = self.params["p"] if p == numpy.inf: @@ -227,24 +279,6 @@ class KNearestNeighbors: else: mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1) - # pykeops does not support autodiff for kmin yet :-( - if self.params.get("enable_autodiff", False) and self.return_distance: - # Compute the indices of the neighbors, and recompute the relevant distances autodiff-friendly. - # Another strategy would be to compute the whole distance matrix with torch.cdist - # and use neighbors as indices into it. - neighbors = ep.astensor(mat.argKmin(k, dim=1)).numpy() - # Work around https://github.com/pytorch/pytorch/issues/34452 - neighbor_pts = Y[neighbors,] - diff = neighbor_pts - X[:, None, :] - if isinstance(diff, ep.PyTorchTensor): - # https://github.com/jonasrauber/eagerpy/issues/6 - distances = ep.astensor(diff.raw.norm(p, -1)) - else: - distances = diff.norms.lp(p, -1) - if self.return_index: - return neighbors, distances.raw - else: - return distances.raw if self.return_index: if self.return_distance: distances, neighbors = mat.Kmin_argKmin(k, dim=1) -- cgit v1.2.3 From 521d8c17c2b7d71c46a51f0490ff2c13c809fc87 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 21:13:19 +0200 Subject: Remove left-over code eagerpy is only used with enable_autodiff --- src/python/gudhi/point_cloud/knn.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 87b2798e..f2cddb38 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -82,8 +82,11 @@ class KNearestNeighbors: self.ref_points = X if self.params.get("enable_autodiff", False): import eagerpy as ep + X = ep.astensor(X) if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor): - X = ep.astensor(X).numpy() + # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch + # without copying to/from the CPU. + X = X.numpy() if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree @@ -133,6 +136,8 @@ class KNearestNeighbors: newX = ep.astensor(X) if self.params["implementation"] != "keops" or not isinstance(newX, ep.PyTorchTensor): newX = newX.numpy() + else: + newX = X neighbors = self.transform(newX) finally: self.return_index = save_return_index @@ -247,29 +252,13 @@ class KNearestNeighbors: if self.params["implementation"] == "keops": import torch from pykeops.torch import LazyTensor - import eagerpy as ep - queries = X - X = ep.astensor(X) - if isinstance(X, ep.PyTorchTensor): - XX = X.raw - else: - # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch - # without copying to/from the CPU. - XX = X.numpy() # 'float64' is slow except on super expensive GPUs. Allow it with some param? - XX = torch.as_tensor(XX, dtype=torch.float32) - if queries is self.ref_points: - Y = X + XX = torch.as_tensor(X, dtype=torch.float32) + if X is self.ref_points: YY = XX else: - Y = ep.astensor(self.ref_points) - if isinstance(Y, ep.PyTorchTensor): - YY = Y.raw - else: - YY = Y.numpy() - YY = torch.as_tensor(YY, dtype=torch.float32) - + YY = torch.as_tensor(self.ref_points, dtype=torch.float32) p = self.params["p"] if p == numpy.inf: # Requires pykeops 1.4 or later -- cgit v1.2.3 From ce75f66da5a2d7ad2c479355112d48817c5ba68b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 21:38:24 +0200 Subject: Tweak to detect fit_transform --- src/python/gudhi/point_cloud/knn.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index f2cddb38..8b3cdb46 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -11,6 +11,7 @@ import numpy # TODO: https://github.com/facebookresearch/faiss + class KNearestNeighbors: """ Class wrapping several implementations for computing the k nearest neighbors in a point set. @@ -82,6 +83,7 @@ class KNearestNeighbors: self.ref_points = X if self.params.get("enable_autodiff", False): import eagerpy as ep + X = ep.astensor(X) if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor): # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch @@ -127,17 +129,19 @@ class KNearestNeighbors: # pykeops does not support autodiff for kmin yet, but when it does in the future, # we may want a special path. import eagerpy as ep + save_return_index = self.return_index self.return_index = True self.return_distance = False self.params["enable_autodiff"] = False try: - # FIXME: how do we test "X is ref_points" then? newX = ep.astensor(X) - if self.params["implementation"] != "keops" or not isinstance(newX, ep.PyTorchTensor): + if self.params["implementation"] != "keops" or ( + not isinstance(newX, ep.PyTorchTensor) and not isinstance(newX, ep.NumPyTensor) + ): newX = newX.numpy() else: - newX = X + newX = newX.raw neighbors = self.transform(newX) finally: self.return_index = save_return_index @@ -159,7 +163,6 @@ class KNearestNeighbors: else: return distances.raw - metric = self.metric k = self.k -- cgit v1.2.3 From f0c5aab988ee966510503a30b0591105594ac67d Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 14 Apr 2020 15:37:31 +0200 Subject: More testing --- src/python/test/test_dtm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 8709dd07..db3e5df5 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -47,6 +47,13 @@ def test_dtm_compare_euclidean(): assert r6.detach().numpy() == pytest.approx(r0) r6.sum().backward() assert pts2.grad is not None + pts2 = torch.tensor(pts, requires_grad=True) + assert pts2.grad is None + dtm = DistanceToMeasure(k, implementation="ckdtree", enable_autodiff=True) + r7 = dtm.fit_transform(pts2) + assert r7.detach().numpy() == pytest.approx(r0) + r7.sum().backward() + assert pts2.grad is not None def test_dtm_precomputed(): -- cgit v1.2.3 From b908205e85bbe29c8d18ad1f38e783a1327434d7 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 14 Apr 2020 17:00:27 +0200 Subject: EagerPy in cmake --- src/cmake/modules/GUDHI_third_party_libraries.cmake | 1 + src/python/CMakeLists.txt | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cmake/modules/GUDHI_third_party_libraries.cmake b/src/cmake/modules/GUDHI_third_party_libraries.cmake index a931b3a1..0abe66b7 100644 --- a/src/cmake/modules/GUDHI_third_party_libraries.cmake +++ b/src/cmake/modules/GUDHI_third_party_libraries.cmake @@ -181,6 +181,7 @@ if( PYTHONINTERP_FOUND ) find_python_module("pybind11") find_python_module("torch") find_python_module("pykeops") + find_python_module("eagerpy") find_python_module_no_version("hnswlib") endif() diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index d7a6a4db..99e8b57c 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -88,6 +88,9 @@ if(PYTHONINTERP_FOUND) if(PYKEOPS_FOUND) add_gudhi_debug_info("PyKeOps version ${PYKEOPS_VERSION}") endif() + if(EAGERPY_FOUND) + add_gudhi_debug_info("EagerPy version ${EAGERPY_VERSION}") + endif() set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_RESULT_OF_USE_DECLTYPE', ") set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_ALL_NO_LIB', ") @@ -410,7 +413,7 @@ if(PYTHONINTERP_FOUND) add_gudhi_py_test(test_time_delay) # DTM - if(SCIPY_FOUND AND SKLEARN_FOUND AND TORCH_FOUND AND HNSWLIB_FOUND AND PYKEOPS_FOUND) + if(SCIPY_FOUND AND SKLEARN_FOUND AND TORCH_FOUND AND HNSWLIB_FOUND AND PYKEOPS_FOUND AND EAGERPY_FOUND) add_gudhi_py_test(test_knn) add_gudhi_py_test(test_dtm) endif() -- cgit v1.2.3 From 9518287cfa2a62948ede2e7d17d5c9f29092e0f4 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 14 Apr 2020 18:27:19 +0200 Subject: Doc improvements --- src/python/gudhi/point_cloud/dtm.py | 12 ++++++++++-- src/python/gudhi/point_cloud/knn.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 38368f29..58dec536 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -20,7 +20,9 @@ class DistanceToMeasure: Args: k (int): number of neighbors (possibly including the point itself). q (float): order used to compute the distance to measure. Defaults to 2. - kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that + metric="neighbors" means that :func:`transform` expects an array with the distances + to the k nearest neighbors. """ self.k = k self.q = q @@ -44,7 +46,13 @@ class DistanceToMeasure: def transform(self, X): """ Args: - X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", + or distances to the k nearest neighbors if metric is "neighbors" (if the array has more + than k columns, the remaining ones are ignored). + + Returns: + numpy.array: a 1-d array with, for each point of X, its distance to the measure defined + by the argument of :func:`fit`. """ if self.params["metric"] == "neighbors": distances = X[:, : self.k] diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 8b3cdb46..d7cf0b2a 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -38,9 +38,9 @@ class KNearestNeighbors: sort_results (bool): if True, then distances and indices of each point are sorted on return, so that the first column contains the closest points. Otherwise, neighbors are returned in an arbitrary order. Defaults to True. - enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.array or similar, this instructs - the function to compute distances in a way that works with automatic differentiation. - This is experimental and not supported for all implementations. + enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.ndarray or tensorflow.Tensor, this + instructs the function to compute distances in a way that works with automatic differentiation. + This is experimental and not supported for all metrics. Defaults to False. kwargs: additional parameters are forwarded to the backends. """ self.k = k @@ -124,6 +124,11 @@ class KNearestNeighbors: """ Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed". + + Returns: + numpy.array: if return_index, an array of shape (len(X), k) with the indices (in the argument + of :func:`fit`) of the k nearest neighbors to the points of X. If return_distance, an array of the + same shape with the distances to those neighbors. If both, a tuple with the two arrays, in this order. """ if self.params.get("enable_autodiff", False): # pykeops does not support autodiff for kmin yet, but when it does in the future, -- cgit v1.2.3 From acb9d5b9d1317d3d8168bc3ac46860d078abba84 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 14 Apr 2020 20:30:29 +0200 Subject: Check that the gradient is not NaN This can easily happen with pytorch, and there is special code to avoid it. --- src/python/test/test_dtm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index db3e5df5..de74c42b 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -46,14 +46,14 @@ def test_dtm_compare_euclidean(): r6 = dtm.fit_transform(pts2) assert r6.detach().numpy() == pytest.approx(r0) r6.sum().backward() - assert pts2.grad is not None + assert pts2.grad is not None and not torch.isnan(pts2.grad).any() pts2 = torch.tensor(pts, requires_grad=True) assert pts2.grad is None dtm = DistanceToMeasure(k, implementation="ckdtree", enable_autodiff=True) r7 = dtm.fit_transform(pts2) assert r7.detach().numpy() == pytest.approx(r0) r7.sum().backward() - assert pts2.grad is not None + assert pts2.grad is not None and not torch.isnan(pts2.grad).any() def test_dtm_precomputed(): -- cgit v1.2.3 From 8d9611206603f4f7506fe77a0273c73c9d67716b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sun, 19 Apr 2020 12:30:35 +0200 Subject: Drop redundant test torch.isnan(None) raises an exception anyway --- src/python/test/test_dtm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index de74c42b..859189fa 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -46,14 +46,14 @@ def test_dtm_compare_euclidean(): r6 = dtm.fit_transform(pts2) assert r6.detach().numpy() == pytest.approx(r0) r6.sum().backward() - assert pts2.grad is not None and not torch.isnan(pts2.grad).any() + assert not torch.isnan(pts2.grad).any() pts2 = torch.tensor(pts, requires_grad=True) assert pts2.grad is None dtm = DistanceToMeasure(k, implementation="ckdtree", enable_autodiff=True) r7 = dtm.fit_transform(pts2) assert r7.detach().numpy() == pytest.approx(r0) r7.sum().backward() - assert pts2.grad is not None and not torch.isnan(pts2.grad).any() + assert not torch.isnan(pts2.grad).any() def test_dtm_precomputed(): -- cgit v1.2.3 From 1c1a99074049e4ff04fa28e7d6e1b6fc2067397a Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 20 Apr 2020 10:38:41 +0200 Subject: Add __license__ --- src/python/gudhi/point_cloud/dtm.py | 4 ++++ src/python/gudhi/point_cloud/knn.py | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 58dec536..13e16d24 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -9,6 +9,10 @@ from .knn import KNearestNeighbors +__author__ = "Marc Glisse" +__copyright__ = "Copyright (C) 2020 Inria" +__license__ = "MIT" + class DistanceToMeasure: """ diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index d7cf0b2a..4017e498 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -11,6 +11,10 @@ import numpy # TODO: https://github.com/facebookresearch/faiss +__author__ = "Marc Glisse" +__copyright__ = "Copyright (C) 2020 Inria" +__license__ = "MIT" + class KNearestNeighbors: """ @@ -156,7 +160,9 @@ class KNearestNeighbors: assert self.metric == "minkowski" p = self.params["p"] Y = ep.astensor(self.ref_points) - neighbor_pts = Y[neighbors,] + neighbor_pts = Y[ + neighbors, + ] diff = neighbor_pts - X[:, None, :] if isinstance(diff, ep.PyTorchTensor): # https://github.com/jonasrauber/eagerpy/issues/6 -- cgit v1.2.3 From 3a9105e0d3bea5cc64610b7c0c3fb15f0e00bb9d Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 20 Apr 2020 11:37:44 +0200 Subject: Reintroduce _proj_on_diag, with a unit test --- src/python/gudhi/wasserstein/wasserstein.py | 11 +++++++++++ src/python/test/test_wasserstein_distance.py | 7 +++++++ 2 files changed, 18 insertions(+) diff --git a/src/python/gudhi/wasserstein/wasserstein.py b/src/python/gudhi/wasserstein/wasserstein.py index 5df66cf9..efc851a0 100644 --- a/src/python/gudhi/wasserstein/wasserstein.py +++ b/src/python/gudhi/wasserstein/wasserstein.py @@ -15,6 +15,17 @@ try: except ImportError: print("POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT") + +# Currently unused, but Théo says it is likely to be used again. +def _proj_on_diag(X): + ''' + :param X: (n x 2) array encoding the points of a persistent diagram. + :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal + ''' + Z = (X[:,0] + X[:,1]) / 2. + return np.array([Z , Z]).T + + def _dist_to_diag(X, internal_p): ''' :param X: (n x 2) array encoding the points of a persistent diagram. diff --git a/src/python/test/test_wasserstein_distance.py b/src/python/test/test_wasserstein_distance.py index 7e0d0f5f..1a4acc1d 100755 --- a/src/python/test/test_wasserstein_distance.py +++ b/src/python/test/test_wasserstein_distance.py @@ -8,6 +8,7 @@ - YYYY/MM Author: Description of the modification """ +from gudhi.wasserstein.wasserstein import _proj_on_diag from gudhi.wasserstein import wasserstein_distance as pot from gudhi.hera import wasserstein_distance as hera import numpy as np @@ -17,6 +18,12 @@ __author__ = "Theo Lacombe" __copyright__ = "Copyright (C) 2019 Inria" __license__ = "MIT" +def test_proj_on_diag(): + dgm = np.array([[1., 1.], [1., 2.], [3., 5.]]) + assert np.array_equal(_proj_on_diag(dgm), [[1., 1.], [1.5, 1.5], [4., 4.]]) + empty = np.empty((0, 2)) + assert np.array_equal(_proj_on_diag(empty), empty) + def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_matching=True): diag1 = np.array([[2.7, 3.7], [9.6, 14.0], [34.2, 34.974]]) diag2 = np.array([[2.8, 4.45], [9.5, 14.1]]) -- cgit v1.2.3 From 9ef7ba65367ab2ff92bf66b1b8166c5990530b76 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 20 Apr 2020 12:16:15 +0200 Subject: Explicitly pass sort_results=True on some tests --- src/python/test/test_knn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py index 415c9d48..a87ec212 100755 --- a/src/python/test/test_knn.py +++ b/src/python/test/test_knn.py @@ -54,12 +54,12 @@ def test_knn_explicit(): knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False) r = knn.fit_transform(dist) assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]]) - knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, sort_results=True) r = knn.fit_transform(dist) assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) # Second time in parallel - knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False, n_jobs=2) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False, n_jobs=2, sort_results=True) r = knn.fit_transform(dist) assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]]) knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, n_jobs=2) -- cgit v1.2.3