summaryrefslogtreecommitdiff
path: root/src/python/gudhi/point_cloud
diff options
context:
space:
mode:
authorMarc Glisse <marc.glisse@inria.fr>2020-04-29 23:00:17 +0200
committerMarc Glisse <marc.glisse@inria.fr>2020-04-29 23:00:17 +0200
commit73a74011e4b5af0794f0463295beca924d32e0ee (patch)
treefb825dc77e9984594109f9ea48838d0544dfca1e /src/python/gudhi/point_cloud
parent74155081bb8b3330c562d5c40d7f0a32fc188012 (diff)
parent0bba67db83f33ff608366057d9c4f005fa6a514b (diff)
Merge remote-tracking branch 'origin/master' into dtmdensity
Diffstat (limited to 'src/python/gudhi/point_cloud')
-rw-r--r--src/python/gudhi/point_cloud/dtm.py24
-rw-r--r--src/python/gudhi/point_cloud/knn.py151
2 files changed, 149 insertions, 26 deletions
diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py
index e12eefa1..c5405526 100644
--- a/src/python/gudhi/point_cloud/dtm.py
+++ b/src/python/gudhi/point_cloud/dtm.py
@@ -7,11 +7,15 @@
# Modification(s):
# - YYYY/MM Author: Description of the modification
-from .knn import KNN
+from .knn import KNearestNeighbors
import numpy as np
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
-class DTM:
+
+class DistanceToMeasure:
"""
Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`.
"""
@@ -21,7 +25,9 @@ class DTM:
Args:
k (int): number of neighbors (possibly including the point itself).
q (float): order used to compute the distance to measure. Defaults to 2.
- kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors.
+ kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that
+ metric="neighbors" means that :func:`transform` expects an array with the distances
+ to the k nearest neighbors.
"""
self.k = k
self.q = q
@@ -36,14 +42,22 @@ class DTM:
X (numpy.array): coordinates for mass points.
"""
if self.params.setdefault("metric", "euclidean") != "neighbors":
- self.knn = KNN(self.k, return_index=False, return_distance=True, sort_results=False, **self.params)
+ self.knn = KNearestNeighbors(
+ self.k, return_index=False, return_distance=True, sort_results=False, **self.params
+ )
self.knn.fit(X)
return self
def transform(self, X):
"""
Args:
- X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored).
+ X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed",
+ or distances to the k nearest neighbors if metric is "neighbors" (if the array has more
+ than k columns, the remaining ones are ignored).
+
+ Returns:
+ numpy.array: a 1-d array with, for each point of X, its distance to the measure defined
+ by the argument of :func:`fit`.
"""
if self.params["metric"] == "neighbors":
distances = X[:, : self.k]
diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py
index 8369f1f8..07553d6d 100644
--- a/src/python/gudhi/point_cloud/knn.py
+++ b/src/python/gudhi/point_cloud/knn.py
@@ -9,8 +9,14 @@
import numpy
+# TODO: https://github.com/facebookresearch/faiss
-class KNN:
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class KNearestNeighbors:
"""
Class wrapping several implementations for computing the k nearest neighbors in a point set.
"""
@@ -36,6 +42,10 @@ class KNN:
sort_results (bool): if True, then distances and indices of each point are
sorted on return, so that the first column contains the closest points.
Otherwise, neighbors are returned in an arbitrary order. Defaults to True.
+ enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.ndarray or tensorflow.Tensor, this
+ instructs the function to compute distances in a way that works with automatic differentiation.
+ This is experimental, not supported for all metrics, and requires the package EagerPy.
+ Defaults to False.
kwargs: additional parameters are forwarded to the backends.
"""
self.k = k
@@ -64,6 +74,8 @@ class KNN:
self.params["implementation"] = "ckdtree"
else:
self.params["implementation"] = "sklearn"
+ if not return_distance:
+ self.params["enable_autodiff"] = False
def fit_transform(self, X, y=None):
return self.fit(X).transform(X)
@@ -74,6 +86,14 @@ class KNN:
X (numpy.array): coordinates for reference points.
"""
self.ref_points = X
+ if self.params.get("enable_autodiff", False):
+ import eagerpy as ep
+
+ X = ep.astensor(X)
+ if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor):
+ # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch
+ # without copying to/from the CPU.
+ X = X.numpy()
if self.params["implementation"] == "ckdtree":
# sklearn could handle this, but it is much slower
from scipy.spatial import cKDTree
@@ -109,31 +129,122 @@ class KNN:
"""
Args:
X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed".
+
+ Returns:
+ numpy.array: if return_index, an array of shape (len(X), k) with the indices (in the argument
+ of :func:`fit`) of the k nearest neighbors to the points of X. If return_distance, an array of the
+ same shape with the distances to those neighbors. If both, a tuple with the two arrays, in this order.
"""
+ if self.params.get("enable_autodiff", False):
+ # pykeops does not support autodiff for kmin yet, but when it does in the future,
+ # we may want a special path.
+ import eagerpy as ep
+
+ save_return_index = self.return_index
+ self.return_index = True
+ self.return_distance = False
+ self.params["enable_autodiff"] = False
+ try:
+ newX = ep.astensor(X)
+ if self.params["implementation"] != "keops" or (
+ not isinstance(newX, ep.PyTorchTensor) and not isinstance(newX, ep.NumPyTensor)
+ ):
+ newX = newX.numpy()
+ else:
+ newX = newX.raw
+ neighbors = self.transform(newX)
+ finally:
+ self.return_index = save_return_index
+ self.return_distance = True
+ self.params["enable_autodiff"] = True
+ # We can implement more later as needed
+ assert self.metric == "minkowski"
+ p = self.params["p"]
+ Y = ep.astensor(self.ref_points)
+ neighbor_pts = Y[
+ neighbors,
+ ]
+ diff = neighbor_pts - X[:, None, :]
+ if isinstance(diff, ep.PyTorchTensor):
+ # https://github.com/jonasrauber/eagerpy/issues/6
+ distances = ep.astensor(diff.raw.norm(p, -1))
+ else:
+ distances = diff.norms.lp(p, -1)
+ if self.return_index:
+ return neighbors, distances.raw
+ else:
+ return distances.raw
+
metric = self.metric
k = self.k
if metric == "precomputed":
# scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural.
- X = numpy.array(X)
if self.return_index:
- neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
- if self.params.get("sort_results", True):
- X = numpy.take_along_axis(X, neighbors, axis=-1)
- ngb_order = numpy.argsort(X, axis=-1)
- neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
- else:
- ngb_order = neighbors
- if self.return_distance:
- distances = numpy.take_along_axis(X, ngb_order, axis=-1)
- return neighbors, distances
+ n_jobs = self.params.get("n_jobs", 1)
+ # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?!
+ if n_jobs == 1:
+ neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
+ if self.params.get("sort_results", True):
+ X = numpy.take_along_axis(X, neighbors, axis=-1)
+ ngb_order = numpy.argsort(X, axis=-1)
+ neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+ else:
+ ngb_order = neighbors
+ if self.return_distance:
+ distances = numpy.take_along_axis(X, ngb_order, axis=-1)
+ return neighbors, distances
+ else:
+ return neighbors
else:
- return neighbors
+ from joblib import Parallel, delayed, effective_n_jobs
+ from sklearn.utils import gen_even_slices
+
+ slices = gen_even_slices(len(X), effective_n_jobs(-1))
+ parallel = Parallel(backend="threading", n_jobs=-1)
+ if self.params.get("sort_results", True):
+
+ def func(M):
+ neighbors = numpy.argpartition(M, k - 1)[:, 0:k]
+ Y = numpy.take_along_axis(M, neighbors, axis=-1)
+ ngb_order = numpy.argsort(Y, axis=-1)
+ return numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+
+ else:
+
+ def func(M):
+ return numpy.argpartition(M, k - 1)[:, 0:k]
+
+ neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+ if self.return_distance:
+ distances = numpy.take_along_axis(X, neighbors, axis=-1)
+ return neighbors, distances
+ else:
+ return neighbors
if self.return_distance:
- distances = numpy.partition(X, k - 1)[:, 0:k]
- if self.params.get("sort_results"):
- # partition is not guaranteed to sort the lower half, although it often does
- distances.sort(axis=-1)
+ n_jobs = self.params.get("n_jobs", 1)
+ if n_jobs == 1:
+ distances = numpy.partition(X, k - 1)[:, 0:k]
+ if self.params.get("sort_results"):
+ # partition is not guaranteed to sort the lower half, although it often does
+ distances.sort(axis=-1)
+ else:
+ from joblib import Parallel, delayed, effective_n_jobs
+ from sklearn.utils import gen_even_slices
+
+ if self.params.get("sort_results"):
+
+ def func(M):
+ # Not partitioning in place, because we should not modify the user's array?
+ r = numpy.partition(M, k - 1)[:, 0:k]
+ r.sort(axis=-1)
+ return r
+
+ else:
+ func = lambda M: numpy.partition(M, k - 1)[:, 0:k]
+ slices = gen_even_slices(len(X), effective_n_jobs(-1))
+ parallel = Parallel(backend="threading", n_jobs=-1)
+ distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
return distances
return None
@@ -158,12 +269,11 @@ class KNN:
from pykeops.torch import LazyTensor
# 'float64' is slow except on super expensive GPUs. Allow it with some param?
- XX = torch.tensor(X, dtype=torch.float32)
+ XX = torch.as_tensor(X, dtype=torch.float32)
if X is self.ref_points:
YY = XX
else:
- YY = torch.tensor(self.ref_points, dtype=torch.float32)
-
+ YY = torch.as_tensor(self.ref_points, dtype=torch.float32)
p = self.params["p"]
if p == numpy.inf:
# Requires pykeops 1.4 or later
@@ -188,7 +298,6 @@ class KNN:
distances = distances ** (1.0 / p)
return distances
return None
- # FIXME: convert everything back to numpy arrays or not?
if self.params["implementation"] == "ckdtree":
qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}}