From c8c942c43643131a7ef9899826a7095e497150fe Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 22:10:26 +0100 Subject: cmake --- src/python/gudhi/point_cloud/dtm.py | 40 ++++++++ src/python/gudhi/point_cloud/knn.py | 193 ++++++++++++++++++++++++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 src/python/gudhi/point_cloud/dtm.py create mode 100644 src/python/gudhi/point_cloud/knn.py (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py new file mode 100644 index 00000000..08f9ea60 --- /dev/null +++ b/src/python/gudhi/point_cloud/dtm.py @@ -0,0 +1,40 @@ +from .knn import KNN + + +class DTM: + def __init__(self, k, q=2, **kwargs): + """ + Args: + q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. + kwargs: Same parameters as KNN, except that metric="neighbors" means that transform() expects an array with the distances to the k nearest neighbors. + """ + self.k = k + self.q = q + self.params = kwargs + + def fit_transform(self, X, y=None): + return self.fit(X).transform(X) + + def fit(self, X, y=None): + """ + Args: + X (numpy.array): coordinates for mass points + """ + if self.params.setdefault("metric", "euclidean") != "neighbors": + self.knn = KNN(self.k, return_index=False, return_distance=True, **self.params) + self.knn.fit(X) + return self + + def transform(self, X): + """ + Args: + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). + """ + if self.params["metric"] == "neighbors": + distances = X[:, : self.k] + else: + distances = self.knn.transform(X) + distances = distances ** self.q + dtm = distances.sum(-1) / self.k + dtm = dtm ** (1.0 / self.q) + return dtm diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py new file mode 100644 index 00000000..57078f1e --- /dev/null +++ b/src/python/gudhi/point_cloud/knn.py @@ -0,0 +1,193 @@ +import numpy + + +class KNN: + def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs): + """ + Args: + k (int): number of neighbors (including the point itself). + return_index (bool): if True, return the index of each neighbor. + return_distance (bool): if True, return the distance to each neighbor. + implementation (str): Choice of the library that does the real work. + + * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes + large (10+) but the number of points remains low (less than a million). + Only "minkowski" and its aliases are supported. + * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported. + * 'sklearn' for scikit-learn's NearestNeighbors. + Note that this provides in particular an option algorithm="brute". + * 'hnsw' for hnswlib.Index. It is very fast but does not provide guarantees. + Only supports "euclidean" for now. + * None will try to select a sensible one (scipy if possible, scikit-learn otherwise). + metric (str): see `sklearn.neighbors.NearestNeighbors`. + eps (float): relative error when computing nearest neighbors with the cKDTree. + p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. + n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. + If -1 is given all processors are used. Default: 1. + + Additional parameters are forwarded to the backends. + """ + self.k = k + self.return_index = return_index + self.return_distance = return_distance + self.metric = metric + self.params = kwargs + # canonicalize + if metric == "euclidean": + self.params["p"] = 2 + self.metric = "minkowski" + elif metric == "manhattan": + self.params["p"] = 1 + self.metric = "minkowski" + elif metric == "chebyshev": + self.params["p"] = numpy.inf + self.metric = "minkowski" + elif metric == "minkowski": + self.params["p"] = kwargs.get("p", 2) + if self.params.get("implementation") in {"keops", "ckdtree"}: + assert self.metric == "minkowski" + if self.params.get("implementation") == "hnsw": + assert self.metric == "minkowski" and self.params["p"] == 2 + if not self.params.get("implementation"): + if self.metric == "minkowski": + self.params["implementation"] = "ckdtree" + else: + self.params["implementation"] = "sklearn" + + def fit_transform(self, X, y=None): + return self.fit(X).transform(X) + + def fit(self, X, y=None): + """ + Args: + X (numpy.array): coordinates for reference points + """ + self.ref_points = X + if self.params.get("implementation") == "ckdtree": + # sklearn could handle this, but it is much slower + from scipy.spatial import cKDTree + self.kdtree = cKDTree(X) + + if self.params.get("implementation") == "sklearn" and self.metric != "precomputed": + # FIXME: sklearn badly handles "precomputed" + from sklearn.neighbors import NearestNeighbors + + nargs = {k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}} + self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs) + self.nn.fit(X) + + if self.params.get("implementation") == "hnsw": + import hnswlib + self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances + self.graph.init_index(len(X), **{k:v for k,v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}) + n = self.params.get("num_threads") + if n is None: + n = self.params.get("n_jobs", 1) + self.params["num_threads"] = n + self.graph.add_items(X, num_threads=n) + + return self + + def transform(self, X): + """ + Args: + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed" + """ + metric = self.metric + k = self.k + + if metric == "precomputed": + # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural. + X = numpy.array(X) + if self.return_index: + neighbors = numpy.argpartition(X, k - 1)[:, 0:k] + distances = numpy.take_along_axis(X, neighbors, axis=-1) + ngb_order = numpy.argsort(distances, axis=-1) + neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + if self.return_distance: + distances = numpy.take_along_axis(distances, ngb_order, axis=-1) + return neighbors, distances + else: + return neighbors + if self.return_distance: + distances = numpy.partition(X, k - 1)[:, 0:k] + # partition is not guaranteed to sort the lower half, although it often does + distances.sort(axis=-1) + return distances + return None + + if self.params.get("implementation") == "hnsw": + ef = self.params.get("ef") + if ef is not None: + self.graph.set_ef(ef) + neighbors, distances = self.graph.knn_query(X, k, num_threads=self.params["num_threads"]) + # The k nearest neighbors are always sorted. I couldn't find it in the doc, but the code calls searchKnn, + # which returns a priority_queue, and then fills the return array backwards with top/pop on the queue. + if self.return_index: + if self.return_distance: + return neighbors, numpy.sqrt(distances) + else: + return neighbors + if self.return_distance: + return numpy.sqrt(distances) + return None + + if self.params.get("implementation") == "keops": + import torch + from pykeops.torch import LazyTensor + + # 'float64' is slow except on super expensive GPUs. Allow it with some param? + XX = torch.tensor(X, dtype=torch.float32) + if X is self.ref_points: + YY = XX + else: + YY = torch.tensor(self.ref_points, dtype=torch.float32) + + p = self.params["p"] + if p == numpy.inf: + # Requires a version of pykeops strictly more recent than 1.3 + mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1) + elif p == 2: # Any even integer? + mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1) + else: + mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1) + + if self.return_index: + if self.return_distance: + distances, neighbors = mat.Kmin_argKmin(k, dim=1) + if p != numpy.inf: + distances = distances ** (1.0 / p) + return neighbors, distances + else: + neighbors = mat.argKmin(k, dim=1) + return neighbors + if self.return_distance: + distances = mat.Kmin(k, dim=1) + if p != numpy.inf: + distances = distances ** (1.0 / p) + return distances + return None + # FIXME: convert everything back to numpy arrays or not? + + if hasattr(self, "kdtree"): + qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}} + distances, neighbors = self.kdtree.query(X, k=self.k, **qargs) + if self.return_index: + if self.return_distance: + return neighbors, distances + else: + return neighbors + if self.return_distance: + return distances + return None + + if self.return_distance: + distances, neighbors = self.nn.kneighbors(X, return_distance=True) + if self.return_index: + return neighbors, distances + else: + return distances + if self.return_index: + neighbors = self.nn.kneighbors(X, return_distance=False) + return neighbors + return None -- cgit v1.2.3 From 7ddad8220fdd34fd3ed91e16882feaa3961b2d67 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 22:59:20 +0100 Subject: license --- src/python/gudhi/point_cloud/dtm.py | 9 +++++++++ src/python/gudhi/point_cloud/knn.py | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 08f9ea60..839e7452 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -1,3 +1,12 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Marc Glisse +# +# Copyright (C) 2020 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + from .knn import KNN diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 57078f1e..943d4e9f 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -1,3 +1,12 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Marc Glisse +# +# Copyright (C) 2020 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + import numpy -- cgit v1.2.3 From af35ea5b4ce631ae826f1db1940798f254aba658 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 26 Mar 2020 23:39:59 +0100 Subject: clean-up use of "implementation" --- src/python/gudhi/point_cloud/knn.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 943d4e9f..a4ea3acd 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -72,12 +72,12 @@ class KNN: X (numpy.array): coordinates for reference points """ self.ref_points = X - if self.params.get("implementation") == "ckdtree": + if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree self.kdtree = cKDTree(X) - if self.params.get("implementation") == "sklearn" and self.metric != "precomputed": + if self.params["implementation"] == "sklearn" and self.metric != "precomputed": # FIXME: sklearn badly handles "precomputed" from sklearn.neighbors import NearestNeighbors @@ -85,7 +85,7 @@ class KNN: self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs) self.nn.fit(X) - if self.params.get("implementation") == "hnsw": + if self.params["implementation"] == "hnsw": import hnswlib self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances self.graph.init_index(len(X), **{k:v for k,v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}) @@ -125,7 +125,7 @@ class KNN: return distances return None - if self.params.get("implementation") == "hnsw": + if self.params["implementation"] == "hnsw": ef = self.params.get("ef") if ef is not None: self.graph.set_ef(ef) @@ -141,7 +141,7 @@ class KNN: return numpy.sqrt(distances) return None - if self.params.get("implementation") == "keops": + if self.params["implementation"] == "keops": import torch from pykeops.torch import LazyTensor @@ -178,7 +178,7 @@ class KNN: return None # FIXME: convert everything back to numpy arrays or not? - if hasattr(self, "kdtree"): + if self.params["implementation"] == "ckdtree": qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}} distances, neighbors = self.kdtree.query(X, k=self.k, **qargs) if self.return_index: @@ -190,6 +190,7 @@ class KNN: return distances return None + assert self.params["implementation"] == "sklearn" if self.return_distance: distances, neighbors = self.nn.kneighbors(X, return_distance=True) if self.return_index: -- cgit v1.2.3 From f74c71ca8e474ff927cae029ea63329d30293582 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Fri, 27 Mar 2020 13:43:58 +0100 Subject: Improve coverage --- src/python/gudhi/point_cloud/dtm.py | 2 ++ src/python/test/test_dtm.py | 48 +++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 15 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 839e7452..541b74a6 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -30,6 +30,8 @@ class DTM: X (numpy.array): coordinates for mass points """ if self.params.setdefault("metric", "euclidean") != "neighbors": + # KNN gives sorted distances, which is unnecessary here. + # Maybe add a parameter to say we don't need sorting? self.knn = KNN(self.k, return_index=False, return_distance=True, **self.params) self.knn.fit(X) return self diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 57fdd131..841f8c3c 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -10,23 +10,41 @@ from gudhi.point_cloud.dtm import DTM import numpy +import pytest -def test_dtm_euclidean(): - pts = numpy.random.rand(1000,4) +def test_dtm_compare_euclidean(): + pts = numpy.random.rand(1000, 4) k = 3 - dtm = DTM(k,implementation="ckdtree") - print(dtm.fit_transform(pts)) - dtm = DTM(k,implementation="sklearn") - print(dtm.fit_transform(pts)) - dtm = DTM(k,implementation="sklearn",algorithm="brute") - print(dtm.fit_transform(pts)) - dtm = DTM(k,implementation="hnsw") - print(dtm.fit_transform(pts)) + dtm = DTM(k, implementation="ckdtree") + r0 = dtm.fit_transform(pts) + dtm = DTM(k, implementation="sklearn") + r1 = dtm.fit_transform(pts) + assert r1 == pytest.approx(r0) + dtm = DTM(k, implementation="sklearn", algorithm="brute") + r2 = dtm.fit_transform(pts) + assert r2 == pytest.approx(r0) + dtm = DTM(k, implementation="hnsw") + r3 = dtm.fit_transform(pts) + assert r3 == pytest.approx(r0) from scipy.spatial.distance import cdist - d = cdist(pts,pts) - dtm = DTM(k,metric="precomputed") - print(dtm.fit_transform(d)) - dtm = DTM(k,implementation="keops") - print(dtm.fit_transform(pts)) + d = cdist(pts, pts) + dtm = DTM(k, metric="precomputed") + r4 = dtm.fit_transform(d) + assert r4 == pytest.approx(r0) + dtm = DTM(k, implementation="keops") + r5 = dtm.fit_transform(pts) + assert r5 == pytest.approx(r0) + + +def test_dtm_precomputed(): + dist = numpy.array([[1.0, 3, 8], [1, 5, 5], [0, 2, 3]]) + dtm = DTM(2, q=1, metric="neighbors") + r = dtm.fit_transform(dist) + assert r == pytest.approx([2.0, 3, 1]) + + dist = numpy.array([[2.0, 2], [0, 1], [3, 4]]) + dtm = DTM(2, q=2, metric="neighbors") + r = dtm.fit_transform(dist) + assert r == pytest.approx([2.0, .707, 3.5355], rel=.01) -- cgit v1.2.3 From 03376ffe0f6060864ee8908893297f8800b7b8d1 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Fri, 27 Mar 2020 20:27:10 +0100 Subject: doc --- src/python/doc/point_cloud.rst | 17 +++++++++++++++-- src/python/gudhi/point_cloud/dtm.py | 6 +++++- src/python/gudhi/point_cloud/knn.py | 31 ++++++++++++++++++------------- src/python/test/test_dtm.py | 2 +- 4 files changed, 39 insertions(+), 17 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/doc/point_cloud.rst b/src/python/doc/point_cloud.rst index c0d4b303..351b0786 100644 --- a/src/python/doc/point_cloud.rst +++ b/src/python/doc/point_cloud.rst @@ -21,10 +21,23 @@ Subsampling :special-members: :show-inheritance: -TimeDelayEmbedding ------------------- +Time Delay Embedding +-------------------- .. autoclass:: gudhi.point_cloud.timedelay.TimeDelayEmbedding :members: :special-members: __call__ +Nearest neighbors +----------------- + +.. automodule:: gudhi.point_cloud.knn + :members: + :special-members: __init__ + +Distance to measure +------------------- + +.. automodule:: gudhi.point_cloud.dtm + :members: + :special-members: __init__ diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 541b74a6..e4096c5e 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -11,11 +11,15 @@ from .knn import KNN class DTM: + """ + Class to compute the distance to the empirical measure defined by a point set. + """ + def __init__(self, k, q=2, **kwargs): """ Args: q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. - kwargs: Same parameters as KNN, except that metric="neighbors" means that transform() expects an array with the distances to the k nearest neighbors. + kwargs: Same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k self.q = q diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index a4ea3acd..02448530 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -11,6 +11,10 @@ import numpy class KNN: + """ + Class wrapping several implementations for computing the k nearest neighbors in a point set. + """ + def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs): """ Args: @@ -19,22 +23,17 @@ class KNN: return_distance (bool): if True, return the distance to each neighbor. implementation (str): Choice of the library that does the real work. - * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes - large (10+) but the number of points remains low (less than a million). - Only "minkowski" and its aliases are supported. + * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported. * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported. - * 'sklearn' for scikit-learn's NearestNeighbors. - Note that this provides in particular an option algorithm="brute". - * 'hnsw' for hnswlib.Index. It is very fast but does not provide guarantees. - Only supports "euclidean" for now. + * 'sklearn' for scikit-learn's NearestNeighbors. Note that this provides in particular an option algorithm="brute". + * 'hnsw' for hnswlib.Index. It can be very fast but does not provide guarantees. Only supports "euclidean" for now. * None will try to select a sensible one (scipy if possible, scikit-learn otherwise). metric (str): see `sklearn.neighbors.NearestNeighbors`. eps (float): relative error when computing nearest neighbors with the cKDTree. p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. - - Additional parameters are forwarded to the backends. + kwargs: additional parameters are forwarded to the backends. """ self.k = k self.return_index = return_index @@ -75,20 +74,26 @@ class KNN: if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree + self.kdtree = cKDTree(X) if self.params["implementation"] == "sklearn" and self.metric != "precomputed": # FIXME: sklearn badly handles "precomputed" from sklearn.neighbors import NearestNeighbors - nargs = {k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}} + nargs = { + k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"} + } self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs) self.nn.fit(X) if self.params["implementation"] == "hnsw": import hnswlib - self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances - self.graph.init_index(len(X), **{k:v for k,v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}) + + self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances + self.graph.init_index( + len(X), **{k: v for k, v in self.params.items() if k in {"ef_construction", "M", "random_seed"}} + ) n = self.params.get("num_threads") if n is None: n = self.params.get("n_jobs", 1) @@ -154,7 +159,7 @@ class KNN: p = self.params["p"] if p == numpy.inf: - # Requires a version of pykeops strictly more recent than 1.3 + # Requires pykeops 1.4 or later mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1) elif p == 2: # Any even integer? mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1) diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 841f8c3c..93b13e1a 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -47,4 +47,4 @@ def test_dtm_precomputed(): dist = numpy.array([[2.0, 2], [0, 1], [3, 4]]) dtm = DTM(2, q=2, metric="neighbors") r = dtm.fit_transform(dist) - assert r == pytest.approx([2.0, .707, 3.5355], rel=.01) + assert r == pytest.approx([2.0, 0.707, 3.5355], rel=0.01) -- cgit v1.2.3 From 35a12b553c85af8ce31629b90a27a7071b0cc379 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 11:48:43 +0100 Subject: Doc tweaks, default DTM exponent --- src/python/doc/point_cloud.rst | 6 ++++-- src/python/doc/point_cloud_sum.inc | 4 ++-- src/python/gudhi/point_cloud/dtm.py | 17 ++++++++++++----- src/python/gudhi/point_cloud/knn.py | 6 +++--- 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/doc/point_cloud.rst b/src/python/doc/point_cloud.rst index 351b0786..192f70db 100644 --- a/src/python/doc/point_cloud.rst +++ b/src/python/doc/point_cloud.rst @@ -28,11 +28,12 @@ Time Delay Embedding :members: :special-members: __call__ -Nearest neighbors ------------------ +K nearest neighbors +------------------- .. automodule:: gudhi.point_cloud.knn :members: + :undoc-members: :special-members: __init__ Distance to measure @@ -40,4 +41,5 @@ Distance to measure .. automodule:: gudhi.point_cloud.dtm :members: + :undoc-members: :special-members: __init__ diff --git a/src/python/doc/point_cloud_sum.inc b/src/python/doc/point_cloud_sum.inc index ecc18951..d4761aba 100644 --- a/src/python/doc/point_cloud_sum.inc +++ b/src/python/doc/point_cloud_sum.inc @@ -2,8 +2,8 @@ :widths: 30 40 30 +----------------------------------------------------------------+------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+ - | | :math:`(x_1, x_2, \ldots, x_d)` | Utilities to process point clouds: read from file, subsample, etc. | :Author: Vincent Rouvreau | - | | :math:`(y_1, y_2, \ldots, y_d)` | | | + | | :math:`(x_1, x_2, \ldots, x_d)` | Utilities to process point clouds: read from file, subsample, | :Authors: Vincent Rouvreau, Marc Glisse, Masatoshi Takenouchi | + | | :math:`(y_1, y_2, \ldots, y_d)` | find neighbors, embed time series in higher dimension, etc. | | | | | :Since: GUDHI 2.0.0 | | | | | | | | :License: MIT (`GPL v3 `_, BSD-3-Clause, Apache-2.0) | diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index e4096c5e..520cbea8 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -15,10 +15,11 @@ class DTM: Class to compute the distance to the empirical measure defined by a point set. """ - def __init__(self, k, q=2, **kwargs): + def __init__(self, k, q=None, **kwargs): """ Args: - q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. + k (int): number of neighbors (possibly including the point itself). + q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if metric is "neighbors" or "distance_matrix". kwargs: Same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k @@ -31,7 +32,7 @@ class DTM: def fit(self, X, y=None): """ Args: - X (numpy.array): coordinates for mass points + X (numpy.array): coordinates for mass points. """ if self.params.setdefault("metric", "euclidean") != "neighbors": # KNN gives sorted distances, which is unnecessary here. @@ -45,11 +46,17 @@ class DTM: Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). """ + q = self.q + if q is None: + if self.params["metric"] in {"neighbors", "precomputed"}: + q = 2 + else: + q = len(X[0]) if self.params["metric"] == "neighbors": distances = X[:, : self.k] else: distances = self.knn.transform(X) - distances = distances ** self.q + distances = distances ** q dtm = distances.sum(-1) / self.k - dtm = dtm ** (1.0 / self.q) + dtm = dtm ** (1.0 / q) return dtm diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 02448530..31e4fc9f 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -18,7 +18,7 @@ class KNN: def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs): """ Args: - k (int): number of neighbors (including the point itself). + k (int): number of neighbors (possibly including the point itself). return_index (bool): if True, return the index of each neighbor. return_distance (bool): if True, return the distance to each neighbor. implementation (str): Choice of the library that does the real work. @@ -68,7 +68,7 @@ class KNN: def fit(self, X, y=None): """ Args: - X (numpy.array): coordinates for reference points + X (numpy.array): coordinates for reference points. """ self.ref_points = X if self.params["implementation"] == "ckdtree": @@ -105,7 +105,7 @@ class KNN: def transform(self, X): """ Args: - X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed" + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed". """ metric = self.metric k = self.k -- cgit v1.2.3 From a911f9707d44259a38ae3dbb6fbcec75779fc639 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:17:29 +0100 Subject: doc --- src/python/gudhi/point_cloud/dtm.py | 2 +- src/python/gudhi/point_cloud/knn.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 520cbea8..3ac69f31 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -20,7 +20,7 @@ class DTM: Args: k (int): number of neighbors (possibly including the point itself). q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if metric is "neighbors" or "distance_matrix". - kwargs: Same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k self.q = q diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 31e4fc9f..bb7757f2 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -21,7 +21,7 @@ class KNN: k (int): number of neighbors (possibly including the point itself). return_index (bool): if True, return the index of each neighbor. return_distance (bool): if True, return the distance to each neighbor. - implementation (str): Choice of the library that does the real work. + implementation (str): choice of the library that does the real work. * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported. * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported. @@ -31,7 +31,7 @@ class KNN: metric (str): see `sklearn.neighbors.NearestNeighbors`. eps (float): relative error when computing nearest neighbors with the cKDTree. p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. - n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. + n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. kwargs: additional parameters are forwarded to the backends. """ -- cgit v1.2.3 From 40f4b6fb1fe20c3843b1fd80f99996e6d25c9426 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:26:36 +0100 Subject: Comment --- src/python/gudhi/point_cloud/dtm.py | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 3ac69f31..ba011eaf 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -59,4 +59,6 @@ class DTM: distances = distances ** q dtm = distances.sum(-1) / self.k dtm = dtm ** (1.0 / q) + # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller. + # Add option to skip the final root? return dtm -- cgit v1.2.3 From 7f323484acdeafca93efdd9bdd20ed428f8fb95b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 12:45:00 +0100 Subject: Optional sort_results --- src/python/gudhi/point_cloud/dtm.py | 4 +--- src/python/gudhi/point_cloud/knn.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index ba011eaf..678524f2 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -35,9 +35,7 @@ class DTM: X (numpy.array): coordinates for mass points. """ if self.params.setdefault("metric", "euclidean") != "neighbors": - # KNN gives sorted distances, which is unnecessary here. - # Maybe add a parameter to say we don't need sorting? - self.knn = KNN(self.k, return_index=False, return_distance=True, **self.params) + self.knn = KNN(self.k, return_index=False, return_distance=True, sort_results=False, **self.params) self.knn.fit(X) return self diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index bb7757f2..8369f1f8 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -33,6 +33,9 @@ class KNN: p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2. n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. + sort_results (bool): if True, then distances and indices of each point are + sorted on return, so that the first column contains the closest points. + Otherwise, neighbors are returned in an arbitrary order. Defaults to True. kwargs: additional parameters are forwarded to the backends. """ self.k = k @@ -115,18 +118,22 @@ class KNN: X = numpy.array(X) if self.return_index: neighbors = numpy.argpartition(X, k - 1)[:, 0:k] - distances = numpy.take_along_axis(X, neighbors, axis=-1) - ngb_order = numpy.argsort(distances, axis=-1) - neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + if self.params.get("sort_results", True): + X = numpy.take_along_axis(X, neighbors, axis=-1) + ngb_order = numpy.argsort(X, axis=-1) + neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + else: + ngb_order = neighbors if self.return_distance: - distances = numpy.take_along_axis(distances, ngb_order, axis=-1) + distances = numpy.take_along_axis(X, ngb_order, axis=-1) return neighbors, distances else: return neighbors if self.return_distance: distances = numpy.partition(X, k - 1)[:, 0:k] - # partition is not guaranteed to sort the lower half, although it often does - distances.sort(axis=-1) + if self.params.get("sort_results"): + # partition is not guaranteed to sort the lower half, although it often does + distances.sort(axis=-1) return distances return None -- cgit v1.2.3 From dd9457649d0d197bbed6402200e0f2f55655680e Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 28 Mar 2020 15:39:15 +0100 Subject: Default param of 2 for DTM --- src/python/gudhi/point_cloud/dtm.py | 14 ++++---------- src/python/test/test_dtm.py | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 678524f2..c26ba844 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -15,11 +15,11 @@ class DTM: Class to compute the distance to the empirical measure defined by a point set. """ - def __init__(self, k, q=None, **kwargs): + def __init__(self, k, q=2, **kwargs): """ Args: k (int): number of neighbors (possibly including the point itself). - q (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if metric is "neighbors" or "distance_matrix". + q (float): order used to compute the distance to measure. Defaults to 2. kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k @@ -44,19 +44,13 @@ class DTM: Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). """ - q = self.q - if q is None: - if self.params["metric"] in {"neighbors", "precomputed"}: - q = 2 - else: - q = len(X[0]) if self.params["metric"] == "neighbors": distances = X[:, : self.k] else: distances = self.knn.transform(X) - distances = distances ** q + distances = distances ** self.q dtm = distances.sum(-1) / self.k - dtm = dtm ** (1.0 / q) + dtm = dtm ** (1.0 / self.q) # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller. # Add option to skip the final root? return dtm diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 33b2f3a2..93b13e1a 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -30,7 +30,7 @@ def test_dtm_compare_euclidean(): from scipy.spatial.distance import cdist d = cdist(pts, pts) - dtm = DTM(k, q=4, metric="precomputed") + dtm = DTM(k, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) dtm = DTM(k, implementation="keops") -- cgit v1.2.3 From 8d06fbeae596a0372bf9a921de7d04cc734eaa3b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 30 Mar 2020 08:14:46 +0200 Subject: Biblio --- biblio/bibliography.bib | 15 +++++++++++++++ src/python/gudhi/point_cloud/dtm.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'src/python/gudhi') diff --git a/biblio/bibliography.bib b/biblio/bibliography.bib index 3bbe7960..f9d43638 100644 --- a/biblio/bibliography.bib +++ b/biblio/bibliography.bib @@ -1192,3 +1192,18 @@ numpages = {11}, location = {Montr\'{e}al, Canada}, series = {NIPS’18} } +@Article{dtm, +author={Chazal, Fr{\'e}d{\'e}ric +and Cohen-Steiner, David +and M{\'e}rigot, Quentin}, +title={Geometric Inference for Probability Measures}, +journal={Foundations of Computational Mathematics}, +year={2011}, +volume={11}, +number={6}, +pages={733-751}, +abstract={Data often comes in the form of a point cloud sampled from an unknown compact subset of Euclidean space. The general goal of geometric inference is then to recover geometric and topological features (e.g., Betti numbers, normals) of this subset from the approximating point cloud data. It appears that the study of distance functions allows one to address many of these questions successfully. However, one of the main limitations of this framework is that it does not cope well with outliers or with background noise. In this paper, we show how to extend the framework of distance functions to overcome this problem. Replacing compact subsets by measures, we introduce a notion of distance function to a probability distribution in Rd. These functions share many properties with classical distance functions, which make them suitable for inference purposes. In particular, by considering appropriate level sets of these distance functions, we show that it is possible to reconstruct offsets of sampled shapes with topological guarantees even in the presence of outliers. Moreover, in settings where empirical measures are considered, these functions can be easily evaluated, making them of particular practical interest.}, +issn={1615-3383}, +doi={10.1007/s10208-011-9098-0}, +url={https://doi.org/10.1007/s10208-011-9098-0} +} diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index c26ba844..23c36b88 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -12,7 +12,7 @@ from .knn import KNN class DTM: """ - Class to compute the distance to the empirical measure defined by a point set. + Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`. """ def __init__(self, k, q=2, **kwargs): -- cgit v1.2.3 From f9a933862050ca95b3a96d7a8572d62f7f2205a9 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 11 Apr 2020 18:18:14 +0200 Subject: Use longer names --- src/python/gudhi/point_cloud/dtm.py | 10 +++-- src/python/gudhi/point_cloud/knn.py | 2 +- src/python/test/test_dtm.py | 18 ++++----- src/python/test/test_knn.py | 76 +++++++++++++++++++++++++++---------- 4 files changed, 71 insertions(+), 35 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 23c36b88..38368f29 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -7,10 +7,10 @@ # Modification(s): # - YYYY/MM Author: Description of the modification -from .knn import KNN +from .knn import KNearestNeighbors -class DTM: +class DistanceToMeasure: """ Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`. """ @@ -20,7 +20,7 @@ class DTM: Args: k (int): number of neighbors (possibly including the point itself). q (float): order used to compute the distance to measure. Defaults to 2. - kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ self.k = k self.q = q @@ -35,7 +35,9 @@ class DTM: X (numpy.array): coordinates for mass points. """ if self.params.setdefault("metric", "euclidean") != "neighbors": - self.knn = KNN(self.k, return_index=False, return_distance=True, sort_results=False, **self.params) + self.knn = KNearestNeighbors( + self.k, return_index=False, return_distance=True, sort_results=False, **self.params + ) self.knn.fit(X) return self diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 8369f1f8..6642a3c2 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -10,7 +10,7 @@ import numpy -class KNN: +class KNearestNeighbors: """ Class wrapping several implementations for computing the k nearest neighbors in a point set. """ diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 93b13e1a..37934fdb 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -8,7 +8,7 @@ - YYYY/MM Author: Description of the modification """ -from gudhi.point_cloud.dtm import DTM +from gudhi.point_cloud.dtm import DistanceToMeasure import numpy import pytest @@ -16,35 +16,35 @@ import pytest def test_dtm_compare_euclidean(): pts = numpy.random.rand(1000, 4) k = 3 - dtm = DTM(k, implementation="ckdtree") + dtm = DistanceToMeasure(k, implementation="ckdtree") r0 = dtm.fit_transform(pts) - dtm = DTM(k, implementation="sklearn") + dtm = DistanceToMeasure(k, implementation="sklearn") r1 = dtm.fit_transform(pts) assert r1 == pytest.approx(r0) - dtm = DTM(k, implementation="sklearn", algorithm="brute") + dtm = DistanceToMeasure(k, implementation="sklearn", algorithm="brute") r2 = dtm.fit_transform(pts) assert r2 == pytest.approx(r0) - dtm = DTM(k, implementation="hnsw") + dtm = DistanceToMeasure(k, implementation="hnsw") r3 = dtm.fit_transform(pts) assert r3 == pytest.approx(r0) from scipy.spatial.distance import cdist d = cdist(pts, pts) - dtm = DTM(k, metric="precomputed") + dtm = DistanceToMeasure(k, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) - dtm = DTM(k, implementation="keops") + dtm = DistanceToMeasure(k, implementation="keops") r5 = dtm.fit_transform(pts) assert r5 == pytest.approx(r0) def test_dtm_precomputed(): dist = numpy.array([[1.0, 3, 8], [1, 5, 5], [0, 2, 3]]) - dtm = DTM(2, q=1, metric="neighbors") + dtm = DistanceToMeasure(2, q=1, metric="neighbors") r = dtm.fit_transform(dist) assert r == pytest.approx([2.0, 3, 1]) dist = numpy.array([[2.0, 2], [0, 1], [3, 4]]) - dtm = DTM(2, q=2, metric="neighbors") + dtm = DistanceToMeasure(2, q=2, metric="neighbors") r = dtm.fit_transform(dist) assert r == pytest.approx([2.0, 0.707, 3.5355], rel=0.01) diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py index e455fb48..6aac2006 100755 --- a/src/python/test/test_knn.py +++ b/src/python/test/test_knn.py @@ -8,7 +8,7 @@ - YYYY/MM Author: Description of the modification """ -from gudhi.point_cloud.knn import KNN +from gudhi.point_cloud.knn import KNearestNeighbors import numpy as np import pytest @@ -16,39 +16,39 @@ import pytest def test_knn_explicit(): base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]]) query = np.array([[1.0, 1], [2, 2], [4, 4]]) - knn = KNN(2, metric="manhattan", return_distance=True, return_index=True) + knn = KNearestNeighbors(2, metric="manhattan", return_distance=True, return_index=True) knn.fit(base) r = knn.transform(query) assert r[0] == pytest.approx(np.array([[0, 1], [1, 0], [3, 2]])) assert r[1] == pytest.approx(np.array([[0.0, 1], [1, 2], [1, 2]])) - knn = KNN(2, metric="chebyshev", return_distance=True, return_index=False) + knn = KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False) knn.fit(base) r = knn.transform(query) assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) r = ( - KNN(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops") + KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops") .fit(base) .transform(query) ) assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) - knn = KNN(2, metric="minkowski", p=3, return_distance=False, return_index=True) + knn = KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True) knn.fit(base) r = knn.transform(query) assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]]) r = ( - KNN(2, metric="minkowski", p=3, return_distance=False, return_index=True, implementation="keops") + KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True, implementation="keops") .fit(base) .transform(query) ) assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]]) dist = np.array([[0.0, 3, 8], [1, 0, 5], [1, 2, 0]]) - knn = KNN(2, metric="precomputed", return_index=True, return_distance=False) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False) r = knn.fit_transform(dist) assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]]) - knn = KNN(2, metric="precomputed", return_index=True, return_distance=True) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True) r = knn.fit_transform(dist) assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) @@ -57,16 +57,40 @@ def test_knn_explicit(): def test_knn_compare(): base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]]) query = np.array([[1.0, 1], [2, 2], [4, 4]]) - r0 = KNN(2, implementation="ckdtree", return_index=True, return_distance=False).fit(base).transform(query) - r1 = KNN(2, implementation="sklearn", return_index=True, return_distance=False).fit(base).transform(query) - r2 = KNN(2, implementation="hnsw", return_index=True, return_distance=False).fit(base).transform(query) - r3 = KNN(2, implementation="keops", return_index=True, return_distance=False).fit(base).transform(query) + r0 = ( + KNearestNeighbors(2, implementation="ckdtree", return_index=True, return_distance=False) + .fit(base) + .transform(query) + ) + r1 = ( + KNearestNeighbors(2, implementation="sklearn", return_index=True, return_distance=False) + .fit(base) + .transform(query) + ) + r2 = ( + KNearestNeighbors(2, implementation="hnsw", return_index=True, return_distance=False).fit(base).transform(query) + ) + r3 = ( + KNearestNeighbors(2, implementation="keops", return_index=True, return_distance=False) + .fit(base) + .transform(query) + ) assert np.array_equal(r0, r1) and np.array_equal(r0, r2) and np.array_equal(r0, r3) - r0 = KNN(2, implementation="ckdtree", return_index=True, return_distance=True).fit(base).transform(query) - r1 = KNN(2, implementation="sklearn", return_index=True, return_distance=True).fit(base).transform(query) - r2 = KNN(2, implementation="hnsw", return_index=True, return_distance=True).fit(base).transform(query) - r3 = KNN(2, implementation="keops", return_index=True, return_distance=True).fit(base).transform(query) + r0 = ( + KNearestNeighbors(2, implementation="ckdtree", return_index=True, return_distance=True) + .fit(base) + .transform(query) + ) + r1 = ( + KNearestNeighbors(2, implementation="sklearn", return_index=True, return_distance=True) + .fit(base) + .transform(query) + ) + r2 = KNearestNeighbors(2, implementation="hnsw", return_index=True, return_distance=True).fit(base).transform(query) + r3 = ( + KNearestNeighbors(2, implementation="keops", return_index=True, return_distance=True).fit(base).transform(query) + ) assert np.array_equal(r0[0], r1[0]) and np.array_equal(r0[0], r2[0]) and np.array_equal(r0[0], r3[0]) d0 = pytest.approx(r0[1]) assert r1[1] == d0 and r2[1] == d0 and r3[1] == d0 @@ -75,8 +99,18 @@ def test_knn_compare(): def test_knn_nop(): # This doesn't look super useful... p = np.array([[0.0]]) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="sklearn").fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="ckdtree").fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="hnsw", ef=5).fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, implementation="keops").fit_transform(p) - assert None is KNN(k=1, return_index=False, return_distance=False, metric="precomputed").fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="sklearn" + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="ckdtree" + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="hnsw", ef=5 + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, implementation="keops" + ).fit_transform(p) + assert None is KNearestNeighbors( + k=1, return_index=False, return_distance=False, metric="precomputed" + ).fit_transform(p) -- cgit v1.2.3 From 83a1bc1fb6124a35d515f4836d2e830f3dbdf0e7 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sun, 12 Apr 2020 21:57:51 +0200 Subject: Parallelize the "precomputed" case of knn It is supposed to be possible to compile numpy with openmp, but it looks like it isn't done in any of the usual packages. It may be possible to refactor that code so there is less redundancy. --- src/python/gudhi/point_cloud/knn.py | 78 +++++++++++++++++++++++++++++-------- src/python/test/test_dtm.py | 3 ++ src/python/test/test_knn.py | 8 ++++ 3 files changed, 73 insertions(+), 16 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 6642a3c2..f6870517 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -115,25 +115,71 @@ class KNearestNeighbors: if metric == "precomputed": # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural. - X = numpy.array(X) if self.return_index: - neighbors = numpy.argpartition(X, k - 1)[:, 0:k] - if self.params.get("sort_results", True): - X = numpy.take_along_axis(X, neighbors, axis=-1) - ngb_order = numpy.argsort(X, axis=-1) - neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + n_jobs = self.params.get("n_jobs", 1) + # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?! + if n_jobs == 1: + neighbors = numpy.argpartition(X, k - 1)[:, 0:k] + if self.params.get("sort_results", True): + X = numpy.take_along_axis(X, neighbors, axis=-1) + ngb_order = numpy.argsort(X, axis=-1) + neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1) + else: + ngb_order = neighbors + if self.return_distance: + distances = numpy.take_along_axis(X, ngb_order, axis=-1) + return neighbors, distances + else: + return neighbors else: - ngb_order = neighbors - if self.return_distance: - distances = numpy.take_along_axis(X, ngb_order, axis=-1) - return neighbors, distances - else: - return neighbors + from joblib import Parallel, delayed, effective_n_jobs + from sklearn.utils import gen_even_slices + + slices = gen_even_slices(len(X), effective_n_jobs(-1)) + parallel = Parallel(backend="threading", n_jobs=-1) + if self.params.get("sort_results", True): + + def func(M): + neighbors = numpy.argpartition(M, k - 1)[:, 0:k] + Y = numpy.take_along_axis(M, neighbors, axis=-1) + ngb_order = numpy.argsort(Y, axis=-1) + return numpy.take_along_axis(neighbors, ngb_order, axis=-1) + + else: + + def func(M): + return numpy.argpartition(M, k - 1)[:, 0:k] + + neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices)) + if self.return_distance: + distances = numpy.take_along_axis(X, neighbors, axis=-1) + return neighbors, distances + else: + return neighbors if self.return_distance: - distances = numpy.partition(X, k - 1)[:, 0:k] - if self.params.get("sort_results"): - # partition is not guaranteed to sort the lower half, although it often does - distances.sort(axis=-1) + n_jobs = self.params.get("n_jobs", 1) + if n_jobs == 1: + distances = numpy.partition(X, k - 1)[:, 0:k] + if self.params.get("sort_results"): + # partition is not guaranteed to sort the lower half, although it often does + distances.sort(axis=-1) + else: + from joblib import Parallel, delayed, effective_n_jobs + from sklearn.utils import gen_even_slices + + if self.params.get("sort_results"): + + def func(M): + # Not partitioning in place, because we should not modify the user's array? + r = numpy.partition(M, k - 1)[:, 0:k] + r.sort(axis=-1) + return r + + else: + func = lambda M: numpy.partition(M, k - 1)[:, 0:k] + slices = gen_even_slices(len(X), effective_n_jobs(-1)) + parallel = Parallel(backend="threading", n_jobs=-1) + distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices)) return distances return None diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index 37934fdb..bc0d3698 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -33,6 +33,9 @@ def test_dtm_compare_euclidean(): dtm = DistanceToMeasure(k, metric="precomputed") r4 = dtm.fit_transform(d) assert r4 == pytest.approx(r0) + dtm = DistanceToMeasure(k, metric="precomputed", n_jobs=2) + r4b = dtm.fit_transform(d) + assert r4b == pytest.approx(r0) dtm = DistanceToMeasure(k, implementation="keops") r5 = dtm.fit_transform(pts) assert r5 == pytest.approx(r0) diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py index 6aac2006..6269df54 100755 --- a/src/python/test/test_knn.py +++ b/src/python/test/test_knn.py @@ -52,6 +52,14 @@ def test_knn_explicit(): r = knn.fit_transform(dist) assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) + # Second time in parallel + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False, n_jobs=2) + r = knn.fit_transform(dist) + assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]]) + knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, n_jobs=2) + r = knn.fit_transform(dist) + assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]]) + assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]]) def test_knn_compare(): -- cgit v1.2.3 From 280eb9d2323837619db1ae013b929adb9b45013b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 01:09:45 +0200 Subject: enable_autodiff with keops This doesn't seem like the best way to handle it, we may want to handle it like a wrapper that gets the indices from knn (whatever backend) and then computes the distances. --- src/python/gudhi/point_cloud/knn.py | 33 +++++++++++++++++++++++++++++---- src/python/test/test_dtm.py | 8 ++++++++ src/python/test/test_knn.py | 6 ++++++ 3 files changed, 43 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index f6870517..79362c09 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -36,6 +36,9 @@ class KNearestNeighbors: sort_results (bool): if True, then distances and indices of each point are sorted on return, so that the first column contains the closest points. Otherwise, neighbors are returned in an arbitrary order. Defaults to True. + enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.array or similar, this instructs + the function to compute distances in a way that works with automatic differentiation. + This is experimental and not supported for all implementations. kwargs: additional parameters are forwarded to the backends. """ self.k = k @@ -202,13 +205,18 @@ class KNearestNeighbors: if self.params["implementation"] == "keops": import torch from pykeops.torch import LazyTensor + import eagerpy as ep # 'float64' is slow except on super expensive GPUs. Allow it with some param? - XX = torch.tensor(X, dtype=torch.float32) - if X is self.ref_points: + queries = X + X = ep.astensor(X) + XX = torch.as_tensor(X.numpy(), dtype=torch.float32) + if queries is self.ref_points: + Y = X YY = XX else: - YY = torch.tensor(self.ref_points, dtype=torch.float32) + Y = ep.astensor(self.ref_points) + YY = torch.as_tensor(Y.numpy(), dtype=torch.float32) p = self.params["p"] if p == numpy.inf: @@ -219,6 +227,24 @@ class KNearestNeighbors: else: mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1) + # pykeops does not support autodiff for kmin yet :-( + if self.params.get("enable_autodiff", False) and self.return_distance: + # Compute the indices of the neighbors, and recompute the relevant distances autodiff-friendly. + # Another strategy would be to compute the whole distance matrix with torch.cdist + # and use neighbors as indices into it. + neighbors = ep.astensor(mat.argKmin(k, dim=1)).numpy() + neighbor_pts = Y[neighbors] + diff = neighbor_pts - X[:, None, :] + if p == numpy.inf: + distances = diff.abs().max(-1) + elif p == 2: + distances = (diff ** 2).sum(-1) ** 0.5 + else: + distances = (diff.abs() ** p).sum(-1) ** (1.0 / p) + if self.return_index: + return neighbors.raw, distances.raw + else: + return distances.raw if self.return_index: if self.return_distance: distances, neighbors = mat.Kmin_argKmin(k, dim=1) @@ -234,7 +260,6 @@ class KNearestNeighbors: distances = distances ** (1.0 / p) return distances return None - # FIXME: convert everything back to numpy arrays or not? if self.params["implementation"] == "ckdtree": qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}} diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py index bc0d3698..8709dd07 100755 --- a/src/python/test/test_dtm.py +++ b/src/python/test/test_dtm.py @@ -11,6 +11,7 @@ from gudhi.point_cloud.dtm import DistanceToMeasure import numpy import pytest +import torch def test_dtm_compare_euclidean(): @@ -39,6 +40,13 @@ def test_dtm_compare_euclidean(): dtm = DistanceToMeasure(k, implementation="keops") r5 = dtm.fit_transform(pts) assert r5 == pytest.approx(r0) + pts2 = torch.tensor(pts, requires_grad=True) + assert pts2.grad is None + dtm = DistanceToMeasure(k, implementation="keops", enable_autodiff=True) + r6 = dtm.fit_transform(pts2) + assert r6.detach().numpy() == pytest.approx(r0) + r6.sum().backward() + assert pts2.grad is not None def test_dtm_precomputed(): diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py index 6269df54..415c9d48 100755 --- a/src/python/test/test_knn.py +++ b/src/python/test/test_knn.py @@ -32,6 +32,12 @@ def test_knn_explicit(): .transform(query) ) assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) + r = ( + KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops", enable_autodiff=True) + .fit(base) + .transform(query) + ) + assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]])) knn = KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True) knn.fit(base) -- cgit v1.2.3 From 2f1576a23cf4ac055565875d384ca604c0ff6844 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 15:01:51 +0200 Subject: Small autodiff tweaks --- src/python/gudhi/point_cloud/knn.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 79362c09..ab3447d4 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -233,16 +233,17 @@ class KNearestNeighbors: # Another strategy would be to compute the whole distance matrix with torch.cdist # and use neighbors as indices into it. neighbors = ep.astensor(mat.argKmin(k, dim=1)).numpy() - neighbor_pts = Y[neighbors] + # Work around https://github.com/pytorch/pytorch/issues/34452 + neighbor_pts = Y[neighbors,] diff = neighbor_pts - X[:, None, :] if p == numpy.inf: distances = diff.abs().max(-1) elif p == 2: - distances = (diff ** 2).sum(-1) ** 0.5 + distances = (diff ** 2).sum(-1).sqrt() else: distances = (diff.abs() ** p).sum(-1) ** (1.0 / p) if self.return_index: - return neighbors.raw, distances.raw + return neighbors, distances.raw else: return distances.raw if self.return_index: -- cgit v1.2.3 From 3a86402b733a48d9c25a4995325e72c7438c06c0 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 15:21:06 +0200 Subject: Fix NaN gradient with pytorch --- src/python/gudhi/point_cloud/knn.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index ab3447d4..185a7764 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -236,12 +236,11 @@ class KNearestNeighbors: # Work around https://github.com/pytorch/pytorch/issues/34452 neighbor_pts = Y[neighbors,] diff = neighbor_pts - X[:, None, :] - if p == numpy.inf: - distances = diff.abs().max(-1) - elif p == 2: - distances = (diff ** 2).sum(-1).sqrt() + if isinstance(diff, ep.PyTorchTensor): + # https://github.com/jonasrauber/eagerpy/issues/6 + distances = ep.astensor(diff.raw.norm(p, -1)) else: - distances = (diff.abs() ** p).sum(-1) ** (1.0 / p) + distances = diff.norms.lp(p, -1) if self.return_index: return neighbors, distances.raw else: -- cgit v1.2.3 From 3afce326428dddd638e22ab37ee4b2afe52eba75 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 20:32:39 +0200 Subject: Generalize enable_autodiff to more implementations Still limited to L^p --- src/python/gudhi/point_cloud/knn.py | 76 +++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 21 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 185a7764..87b2798e 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -9,6 +9,7 @@ import numpy +# TODO: https://github.com/facebookresearch/faiss class KNearestNeighbors: """ @@ -67,6 +68,8 @@ class KNearestNeighbors: self.params["implementation"] = "ckdtree" else: self.params["implementation"] = "sklearn" + if not return_distance: + self.params["enable_autodiff"] = False def fit_transform(self, X, y=None): return self.fit(X).transform(X) @@ -77,6 +80,10 @@ class KNearestNeighbors: X (numpy.array): coordinates for reference points. """ self.ref_points = X + if self.params.get("enable_autodiff", False): + import eagerpy as ep + if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor): + X = ep.astensor(X).numpy() if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree @@ -113,6 +120,41 @@ class KNearestNeighbors: Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed". """ + if self.params.get("enable_autodiff", False): + # pykeops does not support autodiff for kmin yet, but when it does in the future, + # we may want a special path. + import eagerpy as ep + save_return_index = self.return_index + self.return_index = True + self.return_distance = False + self.params["enable_autodiff"] = False + try: + # FIXME: how do we test "X is ref_points" then? + newX = ep.astensor(X) + if self.params["implementation"] != "keops" or not isinstance(newX, ep.PyTorchTensor): + newX = newX.numpy() + neighbors = self.transform(newX) + finally: + self.return_index = save_return_index + self.return_distance = True + self.params["enable_autodiff"] = True + # We can implement more later as needed + assert self.metric == "minkowski" + p = self.params["p"] + Y = ep.astensor(self.ref_points) + neighbor_pts = Y[neighbors,] + diff = neighbor_pts - X[:, None, :] + if isinstance(diff, ep.PyTorchTensor): + # https://github.com/jonasrauber/eagerpy/issues/6 + distances = ep.astensor(diff.raw.norm(p, -1)) + else: + distances = diff.norms.lp(p, -1) + if self.return_index: + return neighbors, distances.raw + else: + return distances.raw + + metric = self.metric k = self.k @@ -207,16 +249,26 @@ class KNearestNeighbors: from pykeops.torch import LazyTensor import eagerpy as ep - # 'float64' is slow except on super expensive GPUs. Allow it with some param? queries = X X = ep.astensor(X) - XX = torch.as_tensor(X.numpy(), dtype=torch.float32) + if isinstance(X, ep.PyTorchTensor): + XX = X.raw + else: + # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch + # without copying to/from the CPU. + XX = X.numpy() + # 'float64' is slow except on super expensive GPUs. Allow it with some param? + XX = torch.as_tensor(XX, dtype=torch.float32) if queries is self.ref_points: Y = X YY = XX else: Y = ep.astensor(self.ref_points) - YY = torch.as_tensor(Y.numpy(), dtype=torch.float32) + if isinstance(Y, ep.PyTorchTensor): + YY = Y.raw + else: + YY = Y.numpy() + YY = torch.as_tensor(YY, dtype=torch.float32) p = self.params["p"] if p == numpy.inf: @@ -227,24 +279,6 @@ class KNearestNeighbors: else: mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1) - # pykeops does not support autodiff for kmin yet :-( - if self.params.get("enable_autodiff", False) and self.return_distance: - # Compute the indices of the neighbors, and recompute the relevant distances autodiff-friendly. - # Another strategy would be to compute the whole distance matrix with torch.cdist - # and use neighbors as indices into it. - neighbors = ep.astensor(mat.argKmin(k, dim=1)).numpy() - # Work around https://github.com/pytorch/pytorch/issues/34452 - neighbor_pts = Y[neighbors,] - diff = neighbor_pts - X[:, None, :] - if isinstance(diff, ep.PyTorchTensor): - # https://github.com/jonasrauber/eagerpy/issues/6 - distances = ep.astensor(diff.raw.norm(p, -1)) - else: - distances = diff.norms.lp(p, -1) - if self.return_index: - return neighbors, distances.raw - else: - return distances.raw if self.return_index: if self.return_distance: distances, neighbors = mat.Kmin_argKmin(k, dim=1) -- cgit v1.2.3 From 521d8c17c2b7d71c46a51f0490ff2c13c809fc87 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 21:13:19 +0200 Subject: Remove left-over code eagerpy is only used with enable_autodiff --- src/python/gudhi/point_cloud/knn.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 87b2798e..f2cddb38 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -82,8 +82,11 @@ class KNearestNeighbors: self.ref_points = X if self.params.get("enable_autodiff", False): import eagerpy as ep + X = ep.astensor(X) if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor): - X = ep.astensor(X).numpy() + # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch + # without copying to/from the CPU. + X = X.numpy() if self.params["implementation"] == "ckdtree": # sklearn could handle this, but it is much slower from scipy.spatial import cKDTree @@ -133,6 +136,8 @@ class KNearestNeighbors: newX = ep.astensor(X) if self.params["implementation"] != "keops" or not isinstance(newX, ep.PyTorchTensor): newX = newX.numpy() + else: + newX = X neighbors = self.transform(newX) finally: self.return_index = save_return_index @@ -247,29 +252,13 @@ class KNearestNeighbors: if self.params["implementation"] == "keops": import torch from pykeops.torch import LazyTensor - import eagerpy as ep - queries = X - X = ep.astensor(X) - if isinstance(X, ep.PyTorchTensor): - XX = X.raw - else: - # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch - # without copying to/from the CPU. - XX = X.numpy() # 'float64' is slow except on super expensive GPUs. Allow it with some param? - XX = torch.as_tensor(XX, dtype=torch.float32) - if queries is self.ref_points: - Y = X + XX = torch.as_tensor(X, dtype=torch.float32) + if X is self.ref_points: YY = XX else: - Y = ep.astensor(self.ref_points) - if isinstance(Y, ep.PyTorchTensor): - YY = Y.raw - else: - YY = Y.numpy() - YY = torch.as_tensor(YY, dtype=torch.float32) - + YY = torch.as_tensor(self.ref_points, dtype=torch.float32) p = self.params["p"] if p == numpy.inf: # Requires pykeops 1.4 or later -- cgit v1.2.3 From ce75f66da5a2d7ad2c479355112d48817c5ba68b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 13 Apr 2020 21:38:24 +0200 Subject: Tweak to detect fit_transform --- src/python/gudhi/point_cloud/knn.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index f2cddb38..8b3cdb46 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -11,6 +11,7 @@ import numpy # TODO: https://github.com/facebookresearch/faiss + class KNearestNeighbors: """ Class wrapping several implementations for computing the k nearest neighbors in a point set. @@ -82,6 +83,7 @@ class KNearestNeighbors: self.ref_points = X if self.params.get("enable_autodiff", False): import eagerpy as ep + X = ep.astensor(X) if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor): # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch @@ -127,17 +129,19 @@ class KNearestNeighbors: # pykeops does not support autodiff for kmin yet, but when it does in the future, # we may want a special path. import eagerpy as ep + save_return_index = self.return_index self.return_index = True self.return_distance = False self.params["enable_autodiff"] = False try: - # FIXME: how do we test "X is ref_points" then? newX = ep.astensor(X) - if self.params["implementation"] != "keops" or not isinstance(newX, ep.PyTorchTensor): + if self.params["implementation"] != "keops" or ( + not isinstance(newX, ep.PyTorchTensor) and not isinstance(newX, ep.NumPyTensor) + ): newX = newX.numpy() else: - newX = X + newX = newX.raw neighbors = self.transform(newX) finally: self.return_index = save_return_index @@ -159,7 +163,6 @@ class KNearestNeighbors: else: return distances.raw - metric = self.metric k = self.k -- cgit v1.2.3 From 9518287cfa2a62948ede2e7d17d5c9f29092e0f4 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 14 Apr 2020 18:27:19 +0200 Subject: Doc improvements --- src/python/gudhi/point_cloud/dtm.py | 12 ++++++++++-- src/python/gudhi/point_cloud/knn.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 38368f29..58dec536 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -20,7 +20,9 @@ class DistanceToMeasure: Args: k (int): number of neighbors (possibly including the point itself). q (float): order used to compute the distance to measure. Defaults to 2. - kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that + metric="neighbors" means that :func:`transform` expects an array with the distances + to the k nearest neighbors. """ self.k = k self.q = q @@ -44,7 +46,13 @@ class DistanceToMeasure: def transform(self, X): """ Args: - X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored). + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", + or distances to the k nearest neighbors if metric is "neighbors" (if the array has more + than k columns, the remaining ones are ignored). + + Returns: + numpy.array: a 1-d array with, for each point of X, its distance to the measure defined + by the argument of :func:`fit`. """ if self.params["metric"] == "neighbors": distances = X[:, : self.k] diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index 8b3cdb46..d7cf0b2a 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -38,9 +38,9 @@ class KNearestNeighbors: sort_results (bool): if True, then distances and indices of each point are sorted on return, so that the first column contains the closest points. Otherwise, neighbors are returned in an arbitrary order. Defaults to True. - enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.array or similar, this instructs - the function to compute distances in a way that works with automatic differentiation. - This is experimental and not supported for all implementations. + enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.ndarray or tensorflow.Tensor, this + instructs the function to compute distances in a way that works with automatic differentiation. + This is experimental and not supported for all metrics. Defaults to False. kwargs: additional parameters are forwarded to the backends. """ self.k = k @@ -124,6 +124,11 @@ class KNearestNeighbors: """ Args: X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed". + + Returns: + numpy.array: if return_index, an array of shape (len(X), k) with the indices (in the argument + of :func:`fit`) of the k nearest neighbors to the points of X. If return_distance, an array of the + same shape with the distances to those neighbors. If both, a tuple with the two arrays, in this order. """ if self.params.get("enable_autodiff", False): # pykeops does not support autodiff for kmin yet, but when it does in the future, -- cgit v1.2.3 From 1c1a99074049e4ff04fa28e7d6e1b6fc2067397a Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 20 Apr 2020 10:38:41 +0200 Subject: Add __license__ --- src/python/gudhi/point_cloud/dtm.py | 4 ++++ src/python/gudhi/point_cloud/knn.py | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index 58dec536..13e16d24 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -9,6 +9,10 @@ from .knn import KNearestNeighbors +__author__ = "Marc Glisse" +__copyright__ = "Copyright (C) 2020 Inria" +__license__ = "MIT" + class DistanceToMeasure: """ diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py index d7cf0b2a..4017e498 100644 --- a/src/python/gudhi/point_cloud/knn.py +++ b/src/python/gudhi/point_cloud/knn.py @@ -11,6 +11,10 @@ import numpy # TODO: https://github.com/facebookresearch/faiss +__author__ = "Marc Glisse" +__copyright__ = "Copyright (C) 2020 Inria" +__license__ = "MIT" + class KNearestNeighbors: """ @@ -156,7 +160,9 @@ class KNearestNeighbors: assert self.metric == "minkowski" p = self.params["p"] Y = ep.astensor(self.ref_points) - neighbor_pts = Y[neighbors,] + neighbor_pts = Y[ + neighbors, + ] diff = neighbor_pts - X[:, None, :] if isinstance(diff, ep.PyTorchTensor): # https://github.com/jonasrauber/eagerpy/issues/6 -- cgit v1.2.3 From 3a9105e0d3bea5cc64610b7c0c3fb15f0e00bb9d Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 20 Apr 2020 11:37:44 +0200 Subject: Reintroduce _proj_on_diag, with a unit test --- src/python/gudhi/wasserstein/wasserstein.py | 11 +++++++++++ src/python/test/test_wasserstein_distance.py | 7 +++++++ 2 files changed, 18 insertions(+) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/wasserstein/wasserstein.py b/src/python/gudhi/wasserstein/wasserstein.py index 5df66cf9..efc851a0 100644 --- a/src/python/gudhi/wasserstein/wasserstein.py +++ b/src/python/gudhi/wasserstein/wasserstein.py @@ -15,6 +15,17 @@ try: except ImportError: print("POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT") + +# Currently unused, but Théo says it is likely to be used again. +def _proj_on_diag(X): + ''' + :param X: (n x 2) array encoding the points of a persistent diagram. + :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal + ''' + Z = (X[:,0] + X[:,1]) / 2. + return np.array([Z , Z]).T + + def _dist_to_diag(X, internal_p): ''' :param X: (n x 2) array encoding the points of a persistent diagram. diff --git a/src/python/test/test_wasserstein_distance.py b/src/python/test/test_wasserstein_distance.py index 7e0d0f5f..1a4acc1d 100755 --- a/src/python/test/test_wasserstein_distance.py +++ b/src/python/test/test_wasserstein_distance.py @@ -8,6 +8,7 @@ - YYYY/MM Author: Description of the modification """ +from gudhi.wasserstein.wasserstein import _proj_on_diag from gudhi.wasserstein import wasserstein_distance as pot from gudhi.hera import wasserstein_distance as hera import numpy as np @@ -17,6 +18,12 @@ __author__ = "Theo Lacombe" __copyright__ = "Copyright (C) 2019 Inria" __license__ = "MIT" +def test_proj_on_diag(): + dgm = np.array([[1., 1.], [1., 2.], [3., 5.]]) + assert np.array_equal(_proj_on_diag(dgm), [[1., 1.], [1.5, 1.5], [4., 4.]]) + empty = np.empty((0, 2)) + assert np.array_equal(_proj_on_diag(empty), empty) + def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_matching=True): diag1 = np.array([[2.7, 3.7], [9.6, 14.0], [34.2, 34.974]]) diag2 = np.array([[2.8, 4.45], [9.5, 14.1]]) -- cgit v1.2.3