summaryrefslogtreecommitdiff
path: root/src/python/gudhi/point_cloud
diff options
context:
space:
mode:
Diffstat (limited to 'src/python/gudhi/point_cloud')
-rw-r--r--src/python/gudhi/point_cloud/__init__.py0
-rw-r--r--src/python/gudhi/point_cloud/dtm.py70
-rw-r--r--src/python/gudhi/point_cloud/knn.py328
-rw-r--r--src/python/gudhi/point_cloud/timedelay.py94
4 files changed, 492 insertions, 0 deletions
diff --git a/src/python/gudhi/point_cloud/__init__.py b/src/python/gudhi/point_cloud/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/python/gudhi/point_cloud/__init__.py
diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py
new file mode 100644
index 00000000..13e16d24
--- /dev/null
+++ b/src/python/gudhi/point_cloud/dtm.py
@@ -0,0 +1,70 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+from .knn import KNearestNeighbors
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class DistanceToMeasure:
+ """
+ Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`.
+ """
+
+ def __init__(self, k, q=2, **kwargs):
+ """
+ Args:
+ k (int): number of neighbors (possibly including the point itself).
+ q (float): order used to compute the distance to measure. Defaults to 2.
+ kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that
+ metric="neighbors" means that :func:`transform` expects an array with the distances
+ to the k nearest neighbors.
+ """
+ self.k = k
+ self.q = q
+ self.params = kwargs
+
+ def fit_transform(self, X, y=None):
+ return self.fit(X).transform(X)
+
+ def fit(self, X, y=None):
+ """
+ Args:
+ X (numpy.array): coordinates for mass points.
+ """
+ if self.params.setdefault("metric", "euclidean") != "neighbors":
+ self.knn = KNearestNeighbors(
+ self.k, return_index=False, return_distance=True, sort_results=False, **self.params
+ )
+ self.knn.fit(X)
+ return self
+
+ def transform(self, X):
+ """
+ Args:
+ X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed",
+ or distances to the k nearest neighbors if metric is "neighbors" (if the array has more
+ than k columns, the remaining ones are ignored).
+
+ Returns:
+ numpy.array: a 1-d array with, for each point of X, its distance to the measure defined
+ by the argument of :func:`fit`.
+ """
+ if self.params["metric"] == "neighbors":
+ distances = X[:, : self.k]
+ else:
+ distances = self.knn.transform(X)
+ distances = distances ** self.q
+ dtm = distances.sum(-1) / self.k
+ dtm = dtm ** (1.0 / self.q)
+ # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller.
+ # Add option to skip the final root?
+ return dtm
diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py
new file mode 100644
index 00000000..86008bc3
--- /dev/null
+++ b/src/python/gudhi/point_cloud/knn.py
@@ -0,0 +1,328 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+import numpy
+
+# TODO: https://github.com/facebookresearch/faiss
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class KNearestNeighbors:
+ """
+ Class wrapping several implementations for computing the k nearest neighbors in a point set.
+
+ :Requires: `PyKeOps <installation.html#pykeops>`_, `SciPy <installation.html#scipy>`_,
+ `Scikit-learn <installation.html#scikit-learn>`_, and/or `Hnswlib <installation.html#hnswlib>`_
+ in function of the selected `implementation`.
+ """
+
+ def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs):
+ """
+ Args:
+ k (int): number of neighbors (possibly including the point itself).
+ return_index (bool): if True, return the index of each neighbor.
+ return_distance (bool): if True, return the distance to each neighbor.
+ implementation (str): choice of the library that does the real work.
+
+ * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported.
+ * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported.
+ * 'sklearn' for scikit-learn's NearestNeighbors. Note that this provides in particular an option algorithm="brute".
+ * 'hnsw' for hnswlib.Index. It can be very fast but does not provide guarantees. Only supports "euclidean" for now.
+ * None will try to select a sensible one (scipy if possible, scikit-learn otherwise).
+ metric (str): see `sklearn.neighbors.NearestNeighbors`.
+ eps (float): relative error when computing nearest neighbors with the cKDTree.
+ p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2.
+ n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU.
+ If -1 is given all processors are used. Default: 1.
+ sort_results (bool): if True, then distances and indices of each point are
+ sorted on return, so that the first column contains the closest points.
+ Otherwise, neighbors are returned in an arbitrary order. Defaults to True.
+ enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.ndarray or tensorflow.Tensor, this
+ instructs the function to compute distances in a way that works with automatic differentiation.
+ This is experimental, not supported for all metrics, and requires the package EagerPy.
+ Defaults to False.
+ kwargs: additional parameters are forwarded to the backends.
+ """
+ self.k = k
+ self.return_index = return_index
+ self.return_distance = return_distance
+ self.metric = metric
+ self.params = kwargs
+ # canonicalize
+ if metric == "euclidean":
+ self.params["p"] = 2
+ self.metric = "minkowski"
+ elif metric == "manhattan":
+ self.params["p"] = 1
+ self.metric = "minkowski"
+ elif metric == "chebyshev":
+ self.params["p"] = numpy.inf
+ self.metric = "minkowski"
+ elif metric == "minkowski":
+ self.params["p"] = kwargs.get("p", 2)
+ if self.params.get("implementation") in {"keops", "ckdtree"}:
+ assert self.metric == "minkowski"
+ if self.params.get("implementation") == "hnsw":
+ assert self.metric == "minkowski" and self.params["p"] == 2
+ if not self.params.get("implementation"):
+ if self.metric == "minkowski":
+ self.params["implementation"] = "ckdtree"
+ else:
+ self.params["implementation"] = "sklearn"
+ if not return_distance:
+ self.params["enable_autodiff"] = False
+
+ def fit_transform(self, X, y=None):
+ return self.fit(X).transform(X)
+
+ def fit(self, X, y=None):
+ """
+ Args:
+ X (numpy.array): coordinates for reference points.
+ """
+ self.ref_points = X
+ if self.params.get("enable_autodiff", False):
+ import eagerpy as ep
+
+ X = ep.astensor(X)
+ if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor):
+ # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch
+ # without copying to/from the CPU.
+ X = X.numpy()
+ if self.params["implementation"] == "ckdtree":
+ # sklearn could handle this, but it is much slower
+ from scipy.spatial import cKDTree
+
+ self.kdtree = cKDTree(X)
+
+ if self.params["implementation"] == "sklearn" and self.metric != "precomputed":
+ # FIXME: sklearn badly handles "precomputed"
+ from sklearn.neighbors import NearestNeighbors
+
+ nargs = {
+ k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}
+ }
+ self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs)
+ self.nn.fit(X)
+
+ if self.params["implementation"] == "hnsw":
+ import hnswlib
+
+ self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances
+ self.graph.init_index(
+ len(X), **{k: v for k, v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}
+ )
+ n = self.params.get("num_threads")
+ if n is None:
+ n = self.params.get("n_jobs", 1)
+ self.params["num_threads"] = n
+ self.graph.add_items(X, num_threads=n)
+
+ return self
+
+ def transform(self, X):
+ """
+ Args:
+ X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed".
+
+ Returns:
+ numpy.array: if return_index, an array of shape (len(X), k) with the indices (in the argument
+ of :func:`fit`) of the k nearest neighbors to the points of X. If return_distance, an array of the
+ same shape with the distances to those neighbors. If both, a tuple with the two arrays, in this order.
+ """
+ if self.params.get("enable_autodiff", False):
+ # pykeops does not support autodiff for kmin yet, but when it does in the future,
+ # we may want a special path.
+ import eagerpy as ep
+
+ save_return_index = self.return_index
+ self.return_index = True
+ self.return_distance = False
+ self.params["enable_autodiff"] = False
+ try:
+ newX = ep.astensor(X)
+ if self.params["implementation"] != "keops" or (
+ not isinstance(newX, ep.PyTorchTensor) and not isinstance(newX, ep.NumPyTensor)
+ ):
+ newX = newX.numpy()
+ else:
+ newX = newX.raw
+ neighbors = self.transform(newX)
+ finally:
+ self.return_index = save_return_index
+ self.return_distance = True
+ self.params["enable_autodiff"] = True
+ # We can implement more later as needed
+ assert self.metric == "minkowski"
+ p = self.params["p"]
+ Y = ep.astensor(self.ref_points)
+ neighbor_pts = Y[
+ neighbors,
+ ]
+ diff = neighbor_pts - X[:, None, :]
+ if isinstance(diff, ep.PyTorchTensor):
+ # https://github.com/jonasrauber/eagerpy/issues/6
+ distances = ep.astensor(diff.raw.norm(p, -1))
+ else:
+ distances = diff.norms.lp(p, -1)
+ if self.return_index:
+ return neighbors, distances.raw
+ else:
+ return distances.raw
+
+ metric = self.metric
+ k = self.k
+
+ if metric == "precomputed":
+ # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural.
+ if self.return_index:
+ n_jobs = self.params.get("n_jobs", 1)
+ # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?!
+ if n_jobs == 1:
+ neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
+ if self.params.get("sort_results", True):
+ X = numpy.take_along_axis(X, neighbors, axis=-1)
+ ngb_order = numpy.argsort(X, axis=-1)
+ neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+ else:
+ ngb_order = neighbors
+ if self.return_distance:
+ distances = numpy.take_along_axis(X, ngb_order, axis=-1)
+ return neighbors, distances
+ else:
+ return neighbors
+ else:
+ from joblib import Parallel, delayed, effective_n_jobs
+ from sklearn.utils import gen_even_slices
+
+ slices = gen_even_slices(len(X), effective_n_jobs(n_jobs))
+ parallel = Parallel(prefer="threads", n_jobs=n_jobs)
+ if self.params.get("sort_results", True):
+
+ def func(M):
+ neighbors = numpy.argpartition(M, k - 1)[:, 0:k]
+ Y = numpy.take_along_axis(M, neighbors, axis=-1)
+ ngb_order = numpy.argsort(Y, axis=-1)
+ return numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+
+ else:
+
+ def func(M):
+ return numpy.argpartition(M, k - 1)[:, 0:k]
+
+ neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+ if self.return_distance:
+ distances = numpy.take_along_axis(X, neighbors, axis=-1)
+ return neighbors, distances
+ else:
+ return neighbors
+ if self.return_distance:
+ n_jobs = self.params.get("n_jobs", 1)
+ if n_jobs == 1:
+ distances = numpy.partition(X, k - 1)[:, 0:k]
+ if self.params.get("sort_results"):
+ # partition is not guaranteed to sort the lower half, although it often does
+ distances.sort(axis=-1)
+ else:
+ from joblib import Parallel, delayed, effective_n_jobs
+ from sklearn.utils import gen_even_slices
+
+ if self.params.get("sort_results"):
+
+ def func(M):
+ # Not partitioning in place, because we should not modify the user's array?
+ r = numpy.partition(M, k - 1)[:, 0:k]
+ r.sort(axis=-1)
+ return r
+
+ else:
+ func = lambda M: numpy.partition(M, k - 1)[:, 0:k]
+ slices = gen_even_slices(len(X), effective_n_jobs(n_jobs))
+ parallel = Parallel(prefer="threads", n_jobs=n_jobs)
+ distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+ return distances
+ return None
+
+ if self.params["implementation"] == "hnsw":
+ ef = self.params.get("ef")
+ if ef is not None:
+ self.graph.set_ef(ef)
+ neighbors, distances = self.graph.knn_query(X, k, num_threads=self.params["num_threads"])
+ # The k nearest neighbors are always sorted. I couldn't find it in the doc, but the code calls searchKnn,
+ # which returns a priority_queue, and then fills the return array backwards with top/pop on the queue.
+ if self.return_index:
+ if self.return_distance:
+ return neighbors, numpy.sqrt(distances)
+ else:
+ return neighbors
+ if self.return_distance:
+ return numpy.sqrt(distances)
+ return None
+
+ if self.params["implementation"] == "keops":
+ import torch
+ from pykeops.torch import LazyTensor
+
+ # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+ XX = torch.as_tensor(X, dtype=torch.float32)
+ if X is self.ref_points:
+ YY = XX
+ else:
+ YY = torch.as_tensor(self.ref_points, dtype=torch.float32)
+ p = self.params["p"]
+ if p == numpy.inf:
+ # Requires pykeops 1.4 or later
+ mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1)
+ elif p == 2: # Any even integer?
+ mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1)
+ else:
+ mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1)
+
+ if self.return_index:
+ if self.return_distance:
+ distances, neighbors = mat.Kmin_argKmin(k, dim=1)
+ if p != numpy.inf:
+ distances = distances ** (1.0 / p)
+ return neighbors, distances
+ else:
+ neighbors = mat.argKmin(k, dim=1)
+ return neighbors
+ if self.return_distance:
+ distances = mat.Kmin(k, dim=1)
+ if p != numpy.inf:
+ distances = distances ** (1.0 / p)
+ return distances
+ return None
+
+ if self.params["implementation"] == "ckdtree":
+ qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}}
+ distances, neighbors = self.kdtree.query(X, k=self.k, **qargs)
+ if self.return_index:
+ if self.return_distance:
+ return neighbors, distances
+ else:
+ return neighbors
+ if self.return_distance:
+ return distances
+ return None
+
+ assert self.params["implementation"] == "sklearn"
+ if self.return_distance:
+ distances, neighbors = self.nn.kneighbors(X, return_distance=True)
+ if self.return_index:
+ return neighbors, distances
+ else:
+ return distances
+ if self.return_index:
+ neighbors = self.nn.kneighbors(X, return_distance=False)
+ return neighbors
+ return None
diff --git a/src/python/gudhi/point_cloud/timedelay.py b/src/python/gudhi/point_cloud/timedelay.py
new file mode 100644
index 00000000..5292e752
--- /dev/null
+++ b/src/python/gudhi/point_cloud/timedelay.py
@@ -0,0 +1,94 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Martin Royer, Yuichi Ike, Masatoshi Takenouchi
+#
+# Copyright (C) 2020 Inria, Copyright (C) 2020 Fujitsu Laboratories Ltd.
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+import numpy as np
+
+
+class TimeDelayEmbedding:
+ """Point cloud transformation class. Embeds time-series data in the R^d according to
+ `Takens' Embedding Theorem <https://en.wikipedia.org/wiki/Takens%27s_theorem>`_ and obtains the
+ coordinates of each point.
+
+ Parameters
+ ----------
+ dim : int, optional (default=3)
+ `d` of R^d to be embedded.
+ delay : int, optional (default=1)
+ Time-Delay embedding.
+ skip : int, optional (default=1)
+ How often to skip embedded points.
+
+ Example
+ -------
+
+ Given delay=3 and skip=2, a point cloud which is obtained by embedding
+ a scalar time-series into R^3 is as follows::
+
+ time-series = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ point cloud = [[1, 4, 7],
+ [3, 6, 9]]
+
+ Given delay=1 and skip=1, a point cloud which is obtained by embedding
+ a 2D vector time-series data into R^4 is as follows::
+
+ time-series = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]
+ point cloud = [[0, 1, 2, 3],
+ [2, 3, 4, 5],
+ [4, 5, 6, 7],
+ [6, 7, 8, 9]]
+ """
+
+ def __init__(self, dim=3, delay=1, skip=1):
+ self._dim = dim
+ self._delay = delay
+ self._skip = skip
+
+ def __call__(self, ts):
+ """Transform method for single time-series data.
+
+ Parameters
+ ----------
+ ts : Iterable[float] or Iterable[Iterable[float]]
+ A single time-series data, with scalar or vector values.
+
+ Returns
+ -------
+ point cloud : n x dim numpy arrays
+ Makes point cloud from a single time-series data.
+ """
+ return self._transform(np.array(ts))
+
+ def fit(self, ts, y=None):
+ return self
+
+ def _transform(self, ts):
+ """Guts of transform method."""
+ if ts.ndim == 1:
+ repeat = self._dim
+ else:
+ assert self._dim % ts.shape[1] == 0
+ repeat = self._dim // ts.shape[1]
+ end = len(ts) - self._delay * (repeat - 1)
+ short = np.arange(0, end, self._skip)
+ vertical = np.arange(0, repeat * self._delay, self._delay)
+ return ts[np.add.outer(short, vertical)].reshape(len(short), -1)
+
+ def transform(self, ts):
+ """Transform method for multiple time-series data.
+
+ Parameters
+ ----------
+ ts : Iterable[Iterable[float]] or Iterable[Iterable[Iterable[float]]]
+ Multiple time-series data, with scalar or vector values.
+
+ Returns
+ -------
+ point clouds : list of n x dim numpy arrays
+ Makes point cloud from each time-series data.
+ """
+ return [self._transform(np.array(s)) for s in ts]