Diffstat (limited to 'src/python/gudhi/point_cloud')
-rw-r--r--  src/python/gudhi/point_cloud/__init__.py  |   0
-rw-r--r--  src/python/gudhi/point_cloud/dtm.py        |  70
-rw-r--r--  src/python/gudhi/point_cloud/knn.py        | 328
-rw-r--r--  src/python/gudhi/point_cloud/timedelay.py  |  94
4 files changed, 492 insertions, 0 deletions
diff --git a/src/python/gudhi/point_cloud/__init__.py b/src/python/gudhi/point_cloud/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/python/gudhi/point_cloud/__init__.py
diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py
new file mode 100644
index 00000000..13e16d24
--- /dev/null
+++ b/src/python/gudhi/point_cloud/dtm.py
@@ -0,0 +1,70 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+from .knn import KNearestNeighbors
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class DistanceToMeasure:
+    """
+    Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`.
+    """
+
+    def __init__(self, k, q=2, **kwargs):
+        """
+        Args:
+            k (int): number of neighbors (possibly including the point itself).
+            q (float): order used to compute the distance to measure. Defaults to 2.
+            kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that
+                metric="neighbors" means that :func:`transform` expects an array with the distances
+                to the k nearest neighbors.
+        """
+        self.k = k
+        self.q = q
+        self.params = kwargs
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for mass points.
+        """
+        if self.params.setdefault("metric", "euclidean") != "neighbors":
+            self.knn = KNearestNeighbors(
+                self.k, return_index=False, return_distance=True, sort_results=False, **self.params
+            )
+            self.knn.fit(X)
+        return self
+
+    def transform(self, X):
+        """
+        Args:
+            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed",
+                or distances to the k nearest neighbors if metric is "neighbors" (if the array has more
+                than k columns, the remaining ones are ignored).
+
+        Returns:
+            numpy.array: a 1-d array with, for each point of X, its distance to the measure defined
+            by the argument of :func:`fit`.
+        """
+        if self.params["metric"] == "neighbors":
+            distances = X[:, : self.k]
+        else:
+            distances = self.knn.transform(X)
+        distances = distances ** self.q
+        dtm = distances.sum(-1) / self.k
+        dtm = dtm ** (1.0 / self.q)
+        # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller.
+        # Add option to skip the final root?
+        return dtm
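For context, a minimal usage sketch of the DistanceToMeasure class added above; it is not part of the commit, and the sample point cloud, k and q values are purely illustrative::

    import numpy as np
    from gudhi.point_cloud.dtm import DistanceToMeasure

    pts = np.random.default_rng(0).random((100, 2))  # hypothetical sample point cloud
    dtm = DistanceToMeasure(k=10, q=2)               # average (in the L^q sense) over the 10 nearest neighbors
    values = dtm.fit_transform(pts)                  # one distance-to-measure value per point, shape (100,)

    # With metric="neighbors", transform() takes precomputed k-nearest-neighbor distances instead of coordinates.
    # Here we build them by brute force (including the point itself, distance 0), so the result should closely
    # match `values` up to floating-point error.
    knn_dists = np.sort(np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1), axis=-1)[:, :10]
    values2 = DistanceToMeasure(k=10, metric="neighbors").fit(knn_dists).transform(knn_dists)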
diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py
new file mode 100644
index 00000000..86008bc3
--- /dev/null
+++ b/src/python/gudhi/point_cloud/knn.py
@@ -0,0 +1,328 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+import numpy
+
+# TODO: https://github.com/facebookresearch/faiss
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class KNearestNeighbors:
+    """
+    Class wrapping several implementations for computing the k nearest neighbors in a point set.
+
+    :Requires: `PyKeOps <installation.html#pykeops>`_, `SciPy <installation.html#scipy>`_,
+        `Scikit-learn <installation.html#scikit-learn>`_, and/or `Hnswlib <installation.html#hnswlib>`_,
+        depending on the selected `implementation`.
+    """
+
+    def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs):
+        """
+        Args:
+            k (int): number of neighbors (possibly including the point itself).
+            return_index (bool): if True, return the index of each neighbor.
+            return_distance (bool): if True, return the distance to each neighbor.
+            implementation (str): choice of the library that does the real work.
+
+                * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported.
+                * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported.
+                * 'sklearn' for scikit-learn's NearestNeighbors. Note that this provides in particular an option algorithm="brute".
+                * 'hnsw' for hnswlib.Index. It can be very fast but does not provide guarantees. Only supports "euclidean" for now.
+                * None will try to select a sensible one (scipy if possible, scikit-learn otherwise).
+            metric (str): see `sklearn.neighbors.NearestNeighbors`.
+            eps (float): relative error when computing nearest neighbors with the cKDTree.
+            p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2.
+            n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU.
+                If -1 is given all processors are used. Default: 1.
+            sort_results (bool): if True, then distances and indices of each point are
+                sorted on return, so that the first column contains the closest points.
+                Otherwise, neighbors are returned in an arbitrary order. Defaults to True.
+            enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.ndarray or tensorflow.Tensor, this
+                instructs the function to compute distances in a way that works with automatic differentiation.
+                This is experimental, not supported for all metrics, and requires the package EagerPy.
+                Defaults to False.
+            kwargs: additional parameters are forwarded to the backends.
+        """
+        self.k = k
+        self.return_index = return_index
+        self.return_distance = return_distance
+        self.metric = metric
+        self.params = kwargs
+        # canonicalize
+        if metric == "euclidean":
+            self.params["p"] = 2
+            self.metric = "minkowski"
+        elif metric == "manhattan":
+            self.params["p"] = 1
+            self.metric = "minkowski"
+        elif metric == "chebyshev":
+            self.params["p"] = numpy.inf
+            self.metric = "minkowski"
+        elif metric == "minkowski":
+            self.params["p"] = kwargs.get("p", 2)
+        if self.params.get("implementation") in {"keops", "ckdtree"}:
+            assert self.metric == "minkowski"
+        if self.params.get("implementation") == "hnsw":
+            assert self.metric == "minkowski" and self.params["p"] == 2
+        if not self.params.get("implementation"):
+            if self.metric == "minkowski":
+                self.params["implementation"] = "ckdtree"
+            else:
+                self.params["implementation"] = "sklearn"
+        if not return_distance:
+            self.params["enable_autodiff"] = False
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for reference points.
+ """ + self.ref_points = X + if self.params.get("enable_autodiff", False): + import eagerpy as ep + + X = ep.astensor(X) + if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor): + # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch + # without copying to/from the CPU. + X = X.numpy() + if self.params["implementation"] == "ckdtree": + # sklearn could handle this, but it is much slower + from scipy.spatial import cKDTree + + self.kdtree = cKDTree(X) + + if self.params["implementation"] == "sklearn" and self.metric != "precomputed": + # FIXME: sklearn badly handles "precomputed" + from sklearn.neighbors import NearestNeighbors + + nargs = { + k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"} + } + self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs) + self.nn.fit(X) + + if self.params["implementation"] == "hnsw": + import hnswlib + + self.graph = hnswlib.Index("l2", len(X[0])) # Actually returns squared distances + self.graph.init_index( + len(X), **{k: v for k, v in self.params.items() if k in {"ef_construction", "M", "random_seed"}} + ) + n = self.params.get("num_threads") + if n is None: + n = self.params.get("n_jobs", 1) + self.params["num_threads"] = n + self.graph.add_items(X, num_threads=n) + + return self + + def transform(self, X): + """ + Args: + X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed". + + Returns: + numpy.array: if return_index, an array of shape (len(X), k) with the indices (in the argument + of :func:`fit`) of the k nearest neighbors to the points of X. If return_distance, an array of the + same shape with the distances to those neighbors. If both, a tuple with the two arrays, in this order. + """ + if self.params.get("enable_autodiff", False): + # pykeops does not support autodiff for kmin yet, but when it does in the future, + # we may want a special path. + import eagerpy as ep + + save_return_index = self.return_index + self.return_index = True + self.return_distance = False + self.params["enable_autodiff"] = False + try: + newX = ep.astensor(X) + if self.params["implementation"] != "keops" or ( + not isinstance(newX, ep.PyTorchTensor) and not isinstance(newX, ep.NumPyTensor) + ): + newX = newX.numpy() + else: + newX = newX.raw + neighbors = self.transform(newX) + finally: + self.return_index = save_return_index + self.return_distance = True + self.params["enable_autodiff"] = True + # We can implement more later as needed + assert self.metric == "minkowski" + p = self.params["p"] + Y = ep.astensor(self.ref_points) + neighbor_pts = Y[ + neighbors, + ] + diff = neighbor_pts - X[:, None, :] + if isinstance(diff, ep.PyTorchTensor): + # https://github.com/jonasrauber/eagerpy/issues/6 + distances = ep.astensor(diff.raw.norm(p, -1)) + else: + distances = diff.norms.lp(p, -1) + if self.return_index: + return neighbors, distances.raw + else: + return distances.raw + + metric = self.metric + k = self.k + + if metric == "precomputed": + # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural. + if self.return_index: + n_jobs = self.params.get("n_jobs", 1) + # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?! 
+                if n_jobs == 1:
+                    neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
+                    if self.params.get("sort_results", True):
+                        X = numpy.take_along_axis(X, neighbors, axis=-1)
+                        ngb_order = numpy.argsort(X, axis=-1)
+                        neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+                    else:
+                        ngb_order = neighbors
+                    if self.return_distance:
+                        distances = numpy.take_along_axis(X, ngb_order, axis=-1)
+                        return neighbors, distances
+                    else:
+                        return neighbors
+                else:
+                    from joblib import Parallel, delayed, effective_n_jobs
+                    from sklearn.utils import gen_even_slices
+
+                    slices = gen_even_slices(len(X), effective_n_jobs(n_jobs))
+                    parallel = Parallel(prefer="threads", n_jobs=n_jobs)
+                    if self.params.get("sort_results", True):
+
+                        def func(M):
+                            neighbors = numpy.argpartition(M, k - 1)[:, 0:k]
+                            Y = numpy.take_along_axis(M, neighbors, axis=-1)
+                            ngb_order = numpy.argsort(Y, axis=-1)
+                            return numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+
+                    else:
+
+                        def func(M):
+                            return numpy.argpartition(M, k - 1)[:, 0:k]
+
+                    neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+                    if self.return_distance:
+                        distances = numpy.take_along_axis(X, neighbors, axis=-1)
+                        return neighbors, distances
+                    else:
+                        return neighbors
+            if self.return_distance:
+                n_jobs = self.params.get("n_jobs", 1)
+                if n_jobs == 1:
+                    distances = numpy.partition(X, k - 1)[:, 0:k]
+                    if self.params.get("sort_results"):
+                        # partition is not guaranteed to sort the lower half, although it often does
+                        distances.sort(axis=-1)
+                else:
+                    from joblib import Parallel, delayed, effective_n_jobs
+                    from sklearn.utils import gen_even_slices
+
+                    if self.params.get("sort_results"):
+
+                        def func(M):
+                            # Not partitioning in place, because we should not modify the user's array?
+                            r = numpy.partition(M, k - 1)[:, 0:k]
+                            r.sort(axis=-1)
+                            return r
+
+                    else:
+                        func = lambda M: numpy.partition(M, k - 1)[:, 0:k]
+                    slices = gen_even_slices(len(X), effective_n_jobs(n_jobs))
+                    parallel = Parallel(prefer="threads", n_jobs=n_jobs)
+                    distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+                return distances
+            return None
+
+        if self.params["implementation"] == "hnsw":
+            ef = self.params.get("ef")
+            if ef is not None:
+                self.graph.set_ef(ef)
+            neighbors, distances = self.graph.knn_query(X, k, num_threads=self.params["num_threads"])
+            # The k nearest neighbors are always sorted. I couldn't find it in the doc, but the code calls searchKnn,
+            # which returns a priority_queue, and then fills the return array backwards with top/pop on the queue.
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, numpy.sqrt(distances)
+                else:
+                    return neighbors
+            if self.return_distance:
+                return numpy.sqrt(distances)
+            return None
+
+        if self.params["implementation"] == "keops":
+            import torch
+            from pykeops.torch import LazyTensor
+
+            # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+            XX = torch.as_tensor(X, dtype=torch.float32)
+            if X is self.ref_points:
+                YY = XX
+            else:
+                YY = torch.as_tensor(self.ref_points, dtype=torch.float32)
+            p = self.params["p"]
+            if p == numpy.inf:
+                # Requires pykeops 1.4 or later
+                mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1)
+            elif p == 2:  # Any even integer?
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1)
+            else:
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1)
+
+            if self.return_index:
+                if self.return_distance:
+                    distances, neighbors = mat.Kmin_argKmin(k, dim=1)
+                    if p != numpy.inf:
+                        distances = distances ** (1.0 / p)
+                    return neighbors, distances
+                else:
+                    neighbors = mat.argKmin(k, dim=1)
+                    return neighbors
+            if self.return_distance:
+                distances = mat.Kmin(k, dim=1)
+                if p != numpy.inf:
+                    distances = distances ** (1.0 / p)
+                return distances
+            return None
+
+        if self.params["implementation"] == "ckdtree":
+            qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}}
+            distances, neighbors = self.kdtree.query(X, k=self.k, **qargs)
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, distances
+                else:
+                    return neighbors
+            if self.return_distance:
+                return distances
+            return None
+
+        assert self.params["implementation"] == "sklearn"
+        if self.return_distance:
+            distances, neighbors = self.nn.kneighbors(X, return_distance=True)
+            if self.return_index:
+                return neighbors, distances
+            else:
+                return distances
+        if self.return_index:
+            neighbors = self.nn.kneighbors(X, return_distance=False)
+            return neighbors
+        return None
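Again for context, a short usage sketch of the KNearestNeighbors wrapper above; it is not part of the commit, the point set and parameter values are made up, and the non-default backends need the optional dependencies listed in the class docstring::

    import numpy as np
    from gudhi.point_cloud.knn import KNearestNeighbors

    pts = np.random.default_rng(42).random((1000, 3))  # hypothetical sample point cloud

    # Default backend for minkowski-like metrics: scipy's cKDTree.
    knn = KNearestNeighbors(k=5, return_index=True, return_distance=True)
    neighbors, distances = knn.fit_transform(pts)       # both arrays have shape (1000, 5)

    # Explicitly selecting scikit-learn's brute-force backend instead.
    knn_brute = KNearestNeighbors(k=5, implementation="sklearn", algorithm="brute")
    neighbors_only = knn_brute.fit(pts).transform(pts)  # indices only (return_distance defaults to False)

    # "precomputed" bypasses the backends and partitions a distance matrix directly.
    dmat = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    knn_pre = KNearestNeighbors(k=5, metric="precomputed", return_distance=True)
    nb, dist = knn_pre.fit_transform(dmat)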
+ """ + return self._transform(np.array(ts)) + + def fit(self, ts, y=None): + return self + + def _transform(self, ts): + """Guts of transform method.""" + if ts.ndim == 1: + repeat = self._dim + else: + assert self._dim % ts.shape[1] == 0 + repeat = self._dim // ts.shape[1] + end = len(ts) - self._delay * (repeat - 1) + short = np.arange(0, end, self._skip) + vertical = np.arange(0, repeat * self._delay, self._delay) + return ts[np.add.outer(short, vertical)].reshape(len(short), -1) + + def transform(self, ts): + """Transform method for multiple time-series data. + + Parameters + ---------- + ts : Iterable[Iterable[float]] or Iterable[Iterable[Iterable[float]]] + Multiple time-series data, with scalar or vector values. + + Returns + ------- + point clouds : list of n x dim numpy arrays + Makes point cloud from each time-series data. + """ + return [self._transform(np.array(s)) for s in ts] |