Merge pull request #265 from mglisse/dtm

DTM
author: Marc Glisse <marc.glisse@inria.fr> 2020-04-20 18:02:20 +0200
committer: GitHub <noreply@github.com> 2020-04-20 18:02:20 +0200
commit: 93cd1240ef65d8883ec624e6e393c09969bf4d6f (patch)
tree: 1b6f5d79350bdcbfb4ceae5fd534ca4e558f4137
parent: 6a397d32ad4e771aab7d8e2da88e4b857258d126 (diff)
parent: 9ef7ba65367ab2ff92bf66b1b8166c5990530b76 (diff)
9 files changed, 680 insertions, 6 deletions
diff --git a/biblio/bibliography.bib b/biblio/bibliography.bib
index a57be224..2bfc28e3 100644
--- a/biblio/bibliography.bib
+++ b/biblio/bibliography.bib
@@ -1208,7 +1208,33 @@ numpages = {11},
 location = {Montr\'{e}al, Canada},
 series = {NIPS’18}
 }
-
+@Article{dtm,
+author={Chazal, Fr{\'e}d{\'e}ric
+and Cohen-Steiner, David
+and M{\'e}rigot, Quentin},
+title={Geometric Inference for Probability Measures},
+journal={Foundations of Computational Mathematics},
+year={2011},
+volume={11},
+number={6},
+pages={733-751},
+abstract={Data often comes in the form of a point cloud sampled from an unknown compact subset of Euclidean space. The general goal of geometric inference is then to recover geometric and topological features (e.g., Betti numbers, normals) of this subset from the approximating point cloud data. It appears that the study of distance functions allows one to address many of these questions successfully. However, one of the main limitations of this framework is that it does not cope well with outliers or with background noise. In this paper, we show how to extend the framework of distance functions to overcome this problem. Replacing compact subsets by measures, we introduce a notion of distance function to a probability distribution in $\mathbb{R}^d$. These functions share many properties with classical distance functions, which make them suitable for inference purposes. In particular, by considering appropriate level sets of these distance functions, we show that it is possible to reconstruct offsets of sampled shapes with topological guarantees even in the presence of outliers. Moreover, in settings where empirical measures are considered, these functions can be easily evaluated, making them of particular practical interest.},
+issn={1615-3383},
+doi={10.1007/s10208-011-9098-0},
+url={https://doi.org/10.1007/s10208-011-9098-0}
+}
+@article{dtmdensity,
+author = "Biau, Gérard and Chazal, Frédéric and Cohen-Steiner, David and Devroye, Luc and Rodríguez, Carlos",
+doi = "10.1214/11-EJS606",
+fjournal = "Electronic Journal of Statistics",
+journal = "Electron. J. Statist.",
+pages = "204--237",
+publisher = "The Institute of Mathematical Statistics and the Bernoulli Society",
+title = "A weighted k-nearest neighbor density estimate for geometric inference",
+url = "https://doi.org/10.1214/11-EJS606",
+volume = "5",
+year = "2011"
+}
 @article{turner2014frechet,
   title={Fr{\'e}chet means for distributions of persistence diagrams},
   author={Turner, Katharine and Mileyko, Yuriy and Mukherjee, Sayan and Harer, John},
diff --git a/src/cmake/modules/GUDHI_third_party_libraries.cmake b/src/cmake/modules/GUDHI_third_party_libraries.cmake
index 2d010483..0abe66b7 100644
--- a/src/cmake/modules/GUDHI_third_party_libraries.cmake
+++ b/src/cmake/modules/GUDHI_third_party_libraries.cmake
@@ -150,6 +150,25 @@ function( find_python_module PYTHON_MODULE_NAME )
   endif()
 endfunction( find_python_module )
 
+# Import check for Python modules that do not define module.__version__.
+# Sets <UPPERCASE_MODULE_NAME>_FOUND to TRUE or FALSE in the caller's scope.
+function( find_python_module_no_version PYTHON_MODULE_NAME )
+  string(TOUPPER "${PYTHON_MODULE_NAME}" PYTHON_MODULE_NAME_UP)
+  execute_process(
+          COMMAND ${PYTHON_EXECUTABLE}  -c "import ${PYTHON_MODULE_NAME}"
+          RESULT_VARIABLE PYTHON_MODULE_RESULT
+          ERROR_VARIABLE PYTHON_MODULE_ERROR)
+  if(PYTHON_MODULE_RESULT EQUAL 0)
+    message ("++ Python module ${PYTHON_MODULE_NAME} found")
+    set(${PYTHON_MODULE_NAME_UP}_FOUND TRUE PARENT_SCOPE)
+  else()
+    message ("PYTHON_MODULE_NAME = ${PYTHON_MODULE_NAME}
+     - PYTHON_MODULE_RESULT = ${PYTHON_MODULE_RESULT}
+     - PYTHON_MODULE_ERROR = ${PYTHON_MODULE_ERROR}")
+    set(${PYTHON_MODULE_NAME_UP}_FOUND FALSE PARENT_SCOPE)
+  endif()
+endfunction( find_python_module_no_version )
+
 if( PYTHONINTERP_FOUND )
   find_python_module("cython")
   find_python_module("pytest")
@@ -160,6 +179,10 @@ if( PYTHONINTERP_FOUND )
   find_python_module("sklearn")
   find_python_module("ot")
   find_python_module("pybind11")
+  find_python_module("torch")
+  find_python_module("pykeops")
+  find_python_module("eagerpy")
+  find_python_module_no_version("hnswlib")
 endif()
 
 if(NOT GUDHI_PYTHON_PATH)
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index a91ca30a..10dcd161 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -78,6 +78,19 @@ if(PYTHONINTERP_FOUND)
     if(OT_FOUND)
       add_gudhi_debug_info("POT version ${OT_VERSION}")
     endif()
+    if(HNSWLIB_FOUND)
+      # Does not have a version number...
+      add_gudhi_debug_info("HNSWlib found")
+    endif()
+    if(TORCH_FOUND)
+      add_gudhi_debug_info("PyTorch version ${TORCH_VERSION}")
+    endif()
+    if(PYKEOPS_FOUND)
+      add_gudhi_debug_info("PyKeOps version ${PYKEOPS_VERSION}")
+    endif()
+    if(EAGERPY_FOUND)
+      add_gudhi_debug_info("EagerPy version ${EAGERPY_VERSION}")
+    endif()
 
     set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_RESULT_OF_USE_DECLTYPE', ")
     set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_ALL_NO_LIB', ")
@@ -400,6 +413,12 @@ if(PYTHONINTERP_FOUND)
     # Time Delay
     add_gudhi_py_test(test_time_delay)
 
+    # DTM
+    if(SCIPY_FOUND AND SKLEARN_FOUND AND TORCH_FOUND AND HNSWLIB_FOUND AND PYKEOPS_FOUND AND EAGERPY_FOUND)
+      add_gudhi_py_test(test_knn)
+      add_gudhi_py_test(test_dtm)
+    endif()
+
     # Documentation generation is available through sphinx - requires all modules
     if(SPHINX_PATH)
       if(MATPLOTLIB_FOUND)
diff --git a/src/python/doc/point_cloud.rst b/src/python/doc/point_cloud.rst
index c0d4b303..192f70db 100644
--- a/src/python/doc/point_cloud.rst
+++ b/src/python/doc/point_cloud.rst
@@ -21,10 +21,25 @@ Subsampling
    :special-members:
    :show-inheritance:
 
-TimeDelayEmbedding
-------------------
+Time Delay Embedding
+--------------------
 
 .. autoclass:: gudhi.point_cloud.timedelay.TimeDelayEmbedding
    :members:
    :special-members: __call__
 
+K nearest neighbors
+-------------------
+
+.. automodule:: gudhi.point_cloud.knn
+   :members:
+   :undoc-members:
+   :special-members: __init__
+
+Distance to measure
+-------------------
+
+.. automodule:: gudhi.point_cloud.dtm
+   :members:
+   :undoc-members:
+   :special-members: __init__
diff --git a/src/python/doc/point_cloud_sum.inc b/src/python/doc/point_cloud_sum.inc
index 0a159680..d4761aba 100644
--- a/src/python/doc/point_cloud_sum.inc
+++ b/src/python/doc/point_cloud_sum.inc
@@ -2,11 +2,11 @@
    :widths: 30 40 30
 
    +----------------------------------------------------------------+------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
-   | | :math:`(x_1, x_2, \ldots, x_d)`                              | Utilities to process point clouds: read from file, subsample, etc.     | :Author: Vincent Rouvreau                                                                                                   |
-   | | :math:`(y_1, y_2, \ldots, y_d)`                              |                                                                        |                                                                                                                             |
+   | | :math:`(x_1, x_2, \ldots, x_d)`                              | Utilities to process point clouds: read from file, subsample,          | :Authors: Vincent Rouvreau, Marc Glisse, Masatoshi Takenouchi                                                               |
+   | | :math:`(y_1, y_2, \ldots, y_d)`                              | find neighbors, embed time series in higher dimension, etc.            |                                                                                                                             |
    |                                                                |                                                                        | :Since: GUDHI 2.0.0                                                                                                         |
    |                                                                |                                                                        |                                                                                                                             |
-   |                                                                |                                                                        | :License: MIT (`GPL v3 </licensing/>`_)                                                                                     |
+   |                                                                |                                                                        | :License: MIT (`GPL v3 </licensing/>`_, BSD-3-Clause, Apache-2.0)                                                           |
    |                                                                | Parts of this package require CGAL.                                    |                                                                                                                             |
    |                                                                |                                                                        | :Requires: `Eigen <installation.html#eigen>`__ :math:`\geq` 3.1.0 and `CGAL <installation.html#cgal>`__ :math:`\geq` 4.11.0 |
    |                                                                |                                                                        |                                                                                                                             |
diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py
new file mode 100644
index 00000000..13e16d24
--- /dev/null
+++ b/src/python/gudhi/point_cloud/dtm.py
@@ -0,0 +1,70 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s):       Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+from .knn import KNearestNeighbors
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class DistanceToMeasure:
+    """
+    Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`.
+    """
+
+    def __init__(self, k, q=2, **kwargs):
+        """
+        Args:
+            k (int): number of neighbors (possibly including the point itself).
+            q (float): order used to compute the distance to measure. Defaults to 2.
+            kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that
+                metric="neighbors" means that :func:`transform` expects an array with the distances
+                to the k nearest neighbors.
+        """
+        self.k = k
+        self.q = q
+        self.params = kwargs
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for mass points.
+        """
+        if self.params.setdefault("metric", "euclidean") != "neighbors":
+            self.knn = KNearestNeighbors(
+                self.k, return_index=False, return_distance=True, sort_results=False, **self.params
+            )
+            self.knn.fit(X)
+        return self
+
+    def transform(self, X):
+        """
+        Args:
+            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed",
+                or distances to the k nearest neighbors if metric is "neighbors" (if the array has more
+                than k columns, the remaining ones are ignored).
+
+        Returns:
+            numpy.array: a 1-d array with, for each point of X, its distance to the measure defined
+            by the argument of :func:`fit`.
+        """
+        if self.params["metric"] == "neighbors":
+            distances = X[:, : self.k]
+        else:
+            distances = self.knn.transform(X)
+        distances = distances ** self.q
+        dtm = distances.sum(-1) / self.k
+        dtm = dtm ** (1.0 / self.q)
+        # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller.
+        # Add option to skip the final root?
+        return dtm
diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py
new file mode 100644
index 00000000..4017e498
--- /dev/null
+++ b/src/python/gudhi/point_cloud/knn.py
@@ -0,0 +1,323 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s):       Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+import numpy
+
+# TODO: https://github.com/facebookresearch/faiss
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class KNearestNeighbors:
+    """
+    Class wrapping several implementations for computing the k nearest neighbors in a point set.
+    """
+
+    def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs):
+        """
+        Args:
+            k (int): number of neighbors (possibly including the point itself).
+            return_index (bool): if True, return the index of each neighbor.
+            return_distance (bool): if True, return the distance to each neighbor.
+            implementation (str): choice of the library that does the real work.
+
+                * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported.
+                * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported.
+                * 'sklearn' for scikit-learn's NearestNeighbors. Note that this provides in particular an option algorithm="brute".
+                * 'hnsw' for hnswlib.Index. It can be very fast but does not provide guarantees. Only supports "euclidean" for now.
+                * None will try to select a sensible one (scipy if possible, scikit-learn otherwise).
+            metric (str): see `sklearn.neighbors.NearestNeighbors`.
+            eps (float): relative error when computing nearest neighbors with the cKDTree.
+            p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2.
+            n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU.
+                If -1 is given all processors are used. Default: 1.
+            sort_results (bool): if True, then distances and indices of each point are
+                sorted on return, so that the first column contains the closest points.
+                Otherwise, neighbors are returned in an arbitrary order. Defaults to True.
+            enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.ndarray or tensorflow.Tensor, this
+                instructs the function to compute distances in a way that works with automatic differentiation.
+                This is experimental and not supported for all metrics. Defaults to False.
+            kwargs: additional parameters are forwarded to the backends.
+        """
+        self.k = k
+        self.return_index = return_index
+        self.return_distance = return_distance
+        self.metric = metric
+        self.params = kwargs
+        # canonicalize
+        if metric == "euclidean":
+            self.params["p"] = 2
+            self.metric = "minkowski"
+        elif metric == "manhattan":
+            self.params["p"] = 1
+            self.metric = "minkowski"
+        elif metric == "chebyshev":
+            self.params["p"] = numpy.inf
+            self.metric = "minkowski"
+        elif metric == "minkowski":
+            self.params["p"] = kwargs.get("p", 2)
+        if self.params.get("implementation") in {"keops", "ckdtree"}:
+            assert self.metric == "minkowski"
+        if self.params.get("implementation") == "hnsw":
+            assert self.metric == "minkowski" and self.params["p"] == 2
+        if not self.params.get("implementation"):
+            if self.metric == "minkowski":
+                self.params["implementation"] = "ckdtree"
+            else:
+                self.params["implementation"] = "sklearn"
+        if not return_distance:
+            self.params["enable_autodiff"] = False
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for reference points.
+        """
+        self.ref_points = X
+        if self.params.get("enable_autodiff", False):
+            import eagerpy as ep
+
+            X = ep.astensor(X)
+            if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor):
+                # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch
+                # without copying to/from the CPU.
+                X = X.numpy()
+        if self.params["implementation"] == "ckdtree":
+            # sklearn could handle this, but it is much slower
+            from scipy.spatial import cKDTree
+
+            self.kdtree = cKDTree(X)
+
+        if self.params["implementation"] == "sklearn" and self.metric != "precomputed":
+            # FIXME: sklearn badly handles "precomputed"
+            from sklearn.neighbors import NearestNeighbors
+
+            nargs = {
+                k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}
+            }
+            self.nn = NearestNeighbors(self.k, metric=self.metric, **nargs)
+            self.nn.fit(X)
+
+        if self.params["implementation"] == "hnsw":
+            import hnswlib
+
+            self.graph = hnswlib.Index("l2", len(X[0]))  # Actually returns squared distances
+            self.graph.init_index(
+                len(X), **{k: v for k, v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}
+            )
+            n = self.params.get("num_threads")
+            if n is None:
+                n = self.params.get("n_jobs", 1)
+                self.params["num_threads"] = n
+            self.graph.add_items(X, num_threads=n)
+
+        return self
+
+    def transform(self, X):
+        """
+        Args:
+            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed".
+
+        Returns:
+            numpy.array: if return_index, an array of shape (len(X), k) with the indices (in the argument
+            of :func:`fit`) of the k nearest neighbors to the points of X. If return_distance, an array of the
+            same shape with the distances to those neighbors. If both, a tuple with the two arrays, in this order.
+        """
+        if self.params.get("enable_autodiff", False):
+            # pykeops does not support autodiff for kmin yet, but when it does in the future,
+            # we may want a special path.
+            import eagerpy as ep
+
+            save_return_index = self.return_index
+            self.return_index = True
+            self.return_distance = False
+            self.params["enable_autodiff"] = False
+            try:
+                newX = ep.astensor(X)
+                if self.params["implementation"] != "keops" or (
+                    not isinstance(newX, ep.PyTorchTensor) and not isinstance(newX, ep.NumPyTensor)
+                ):
+                    newX = newX.numpy()
+                else:
+                    newX = newX.raw
+                neighbors = self.transform(newX)
+            finally:
+                self.return_index = save_return_index
+                self.return_distance = True
+                self.params["enable_autodiff"] = True
+            # We can implement more later as needed
+            assert self.metric == "minkowski"
+            p = self.params["p"]
+            Y = ep.astensor(self.ref_points)
+            neighbor_pts = Y[
+                neighbors,
+            ]
+            diff = neighbor_pts - X[:, None, :]
+            if isinstance(diff, ep.PyTorchTensor):
+                # https://github.com/jonasrauber/eagerpy/issues/6
+                distances = ep.astensor(diff.raw.norm(p, -1))
+            else:
+                distances = diff.norms.lp(p, -1)
+            if self.return_index:
+                return neighbors, distances.raw
+            else:
+                return distances.raw
+
+        metric = self.metric
+        k = self.k
+
+        if metric == "precomputed":
+            # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural.
+            if self.return_index:
+                n_jobs = self.params.get("n_jobs", 1)
+                # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?!
+                if n_jobs == 1:
+                    neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
+                    if self.params.get("sort_results", True):
+                        X = numpy.take_along_axis(X, neighbors, axis=-1)
+                        ngb_order = numpy.argsort(X, axis=-1)
+                        neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+                    else:
+                        ngb_order = neighbors
+                    if self.return_distance:
+                        distances = numpy.take_along_axis(X, ngb_order, axis=-1)
+                        return neighbors, distances
+                    else:
+                        return neighbors
+                else:
+                    from joblib import Parallel, delayed, effective_n_jobs
+                    from sklearn.utils import gen_even_slices
+
+                    slices = gen_even_slices(len(X), effective_n_jobs(-1))
+                    parallel = Parallel(backend="threading", n_jobs=-1)
+                    if self.params.get("sort_results", True):
+
+                        def func(M):
+                            neighbors = numpy.argpartition(M, k - 1)[:, 0:k]
+                            Y = numpy.take_along_axis(M, neighbors, axis=-1)
+                            ngb_order = numpy.argsort(Y, axis=-1)
+                            return numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+
+                    else:
+
+                        def func(M):
+                            return numpy.argpartition(M, k - 1)[:, 0:k]
+
+                    neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+                    if self.return_distance:
+                        distances = numpy.take_along_axis(X, neighbors, axis=-1)
+                        return neighbors, distances
+                    else:
+                        return neighbors
+            if self.return_distance:
+                n_jobs = self.params.get("n_jobs", 1)
+                if n_jobs == 1:
+                    distances = numpy.partition(X, k - 1)[:, 0:k]
+                    if self.params.get("sort_results"):
+                        # partition is not guaranteed to sort the lower half, although it often does
+                        distances.sort(axis=-1)
+                else:
+                    from joblib import Parallel, delayed, effective_n_jobs
+                    from sklearn.utils import gen_even_slices
+
+                    if self.params.get("sort_results"):
+
+                        def func(M):
+                            # Not partitioning in place, because we should not modify the user's array?
+                            r = numpy.partition(M, k - 1)[:, 0:k]
+                            r.sort(axis=-1)
+                            return r
+
+                    else:
+                        func = lambda M: numpy.partition(M, k - 1)[:, 0:k]
+                    slices = gen_even_slices(len(X), effective_n_jobs(-1))
+                    parallel = Parallel(backend="threading", n_jobs=-1)
+                    distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+                return distances
+            return None
+
+        if self.params["implementation"] == "hnsw":
+            ef = self.params.get("ef")
+            if ef is not None:
+                self.graph.set_ef(ef)
+            neighbors, distances = self.graph.knn_query(X, k, num_threads=self.params["num_threads"])
+            # The k nearest neighbors are always sorted. I couldn't find it in the doc, but the code calls searchKnn,
+            # which returns a priority_queue, and then fills the return array backwards with top/pop on the queue.
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, numpy.sqrt(distances)
+                else:
+                    return neighbors
+            if self.return_distance:
+                return numpy.sqrt(distances)
+            return None
+
+        if self.params["implementation"] == "keops":
+            import torch
+            from pykeops.torch import LazyTensor
+
+            # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+            XX = torch.as_tensor(X, dtype=torch.float32)
+            if X is self.ref_points:
+                YY = XX
+            else:
+                YY = torch.as_tensor(self.ref_points, dtype=torch.float32)
+            p = self.params["p"]
+            if p == numpy.inf:
+                # Requires pykeops 1.4 or later
+                mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1)
+            elif p == 2:  # Any even integer?
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1)
+            else:
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1)
+
+            if self.return_index:
+                if self.return_distance:
+                    distances, neighbors = mat.Kmin_argKmin(k, dim=1)
+                    if p != numpy.inf:
+                        distances = distances ** (1.0 / p)
+                    return neighbors, distances
+                else:
+                    neighbors = mat.argKmin(k, dim=1)
+                    return neighbors
+            if self.return_distance:
+                distances = mat.Kmin(k, dim=1)
+                if p != numpy.inf:
+                    distances = distances ** (1.0 / p)
+                return distances
+            return None
+
+        if self.params["implementation"] == "ckdtree":
+            qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}}
+            distances, neighbors = self.kdtree.query(X, k=self.k, **qargs)
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, distances
+                else:
+                    return neighbors
+            if self.return_distance:
+                return distances
+            return None
+
+        assert self.params["implementation"] == "sklearn"
+        if self.return_distance:
+            distances, neighbors = self.nn.kneighbors(X, return_distance=True)
+            if self.return_index:
+                return neighbors, distances
+            else:
+                return distances
+        if self.return_index:
+            neighbors = self.nn.kneighbors(X, return_distance=False)
+            return neighbors
+        return None
diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py
new file mode 100755
index 00000000..859189fa
--- /dev/null
+++ b/src/python/test/test_dtm.py
@@ -0,0 +1,68 @@
+""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+    Author(s):       Marc Glisse
+
+    Copyright (C) 2020 Inria
+
+    Modification(s):
+      - YYYY/MM Author: Description of the modification
+"""
+
+from gudhi.point_cloud.dtm import DistanceToMeasure
+import numpy
+import pytest
+import torch
+
+
+def test_dtm_compare_euclidean():
+    pts = numpy.random.rand(1000, 4)
+    k = 3
+    dtm = DistanceToMeasure(k, implementation="ckdtree")
+    r0 = dtm.fit_transform(pts)
+    dtm = DistanceToMeasure(k, implementation="sklearn")
+    r1 = dtm.fit_transform(pts)
+    assert r1 == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, implementation="sklearn", algorithm="brute")
+    r2 = dtm.fit_transform(pts)
+    assert r2 == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, implementation="hnsw")
+    r3 = dtm.fit_transform(pts)
+    assert r3 == pytest.approx(r0)
+    from scipy.spatial.distance import cdist
+
+    d = cdist(pts, pts)
+    dtm = DistanceToMeasure(k, metric="precomputed")
+    r4 = dtm.fit_transform(d)
+    assert r4 == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, metric="precomputed", n_jobs=2)
+    r4b = dtm.fit_transform(d)
+    assert r4b == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, implementation="keops")
+    r5 = dtm.fit_transform(pts)
+    assert r5 == pytest.approx(r0)
+    pts2 = torch.tensor(pts, requires_grad=True)
+    assert pts2.grad is None
+    dtm = DistanceToMeasure(k, implementation="keops", enable_autodiff=True)
+    r6 = dtm.fit_transform(pts2)
+    assert r6.detach().numpy() == pytest.approx(r0)
+    r6.sum().backward()
+    assert not torch.isnan(pts2.grad).any()
+    pts2 = torch.tensor(pts, requires_grad=True)
+    assert pts2.grad is None
+    dtm = DistanceToMeasure(k, implementation="ckdtree", enable_autodiff=True)
+    r7 = dtm.fit_transform(pts2)
+    assert r7.detach().numpy() == pytest.approx(r0)
+    r7.sum().backward()
+    assert not torch.isnan(pts2.grad).any()
+
+
+def test_dtm_precomputed():
+    dist = numpy.array([[1.0, 3, 8], [1, 5, 5], [0, 2, 3]])
+    dtm = DistanceToMeasure(2, q=1, metric="neighbors")
+    r = dtm.fit_transform(dist)
+    assert r == pytest.approx([2.0, 3, 1])
+
+    dist = numpy.array([[2.0, 2], [0, 1], [3, 4]])
+    dtm = DistanceToMeasure(2, q=2, metric="neighbors")
+    r = dtm.fit_transform(dist)
+    assert r == pytest.approx([2.0, 0.707, 3.5355], rel=0.01)
diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py
new file mode 100755
index 00000000..a87ec212
--- /dev/null
+++ b/src/python/test/test_knn.py
@@ -0,0 +1,130 @@
+""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+    Author(s):       Marc Glisse
+
+    Copyright (C) 2020 Inria
+
+    Modification(s):
+      - YYYY/MM Author: Description of the modification
+"""
+
+from gudhi.point_cloud.knn import KNearestNeighbors
+import numpy as np
+import pytest
+
+
+def test_knn_explicit():  # KNearestNeighbors on 4 base points / 3 query points, checked against hand-computed results
+    base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]])
+    query = np.array([[1.0, 1], [2, 2], [4, 4]])
+    knn = KNearestNeighbors(2, metric="manhattan", return_distance=True, return_index=True)
+    knn.fit(base)
+    r = knn.transform(query)
+    assert r[0] == pytest.approx(np.array([[0, 1], [1, 0], [3, 2]]))  # r[0]: expected indices into `base`
+    assert r[1] == pytest.approx(np.array([[0.0, 1], [1, 2], [1, 2]]))  # r[1]: expected manhattan distances
+    # Chebyshev metric, distances only: no explicit implementation, then keops, then keops with enable_autodiff=True.
+    knn = KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False)
+    knn.fit(base)
+    r = knn.transform(query)
+    assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]]))
+    r = (
+        KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops")
+        .fit(base)
+        .transform(query)
+    )
+    assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]]))
+    r = (
+        KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops", enable_autodiff=True)
+        .fit(base)
+        .transform(query)
+    )
+    assert r == pytest.approx(np.array([[0.0, 1], [1, 1], [1, 2]]))
+    # Minkowski metric with p=3, indices only: no explicit implementation, then keops.
+    knn = KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True)
+    knn.fit(base)
+    r = knn.transform(query)
+    assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]])
+    r = (
+        KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True, implementation="keops")
+        .fit(base)
+        .transform(query)
+    )
+    assert np.array_equal(r, [[0, 1], [1, 0], [3, 2]])
+    # Precomputed metric: fit_transform is given the 3x3 matrix `dist` itself instead of point coordinates.
+    dist = np.array([[0.0, 3, 8], [1, 0, 5], [1, 2, 0]])
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False)
+    r = knn.fit_transform(dist)
+    assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]])  # column positions of the 2 smallest entries of each row
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, sort_results=True)
+    r = knn.fit_transform(dist)
+    assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]])
+    assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]])  # the 2 smallest entries of each row, in increasing order
+    # Second time in parallel: the same two precomputed checks repeated with n_jobs=2
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False, n_jobs=2, sort_results=True)
+    r = knn.fit_transform(dist)
+    assert np.array_equal(r, [[0, 1], [1, 0], [2, 0]])
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, n_jobs=2)
+    r = knn.fit_transform(dist)
+    assert np.array_equal(r[0], [[0, 1], [1, 0], [2, 0]])
+    assert np.array_equal(r[1], [[0, 3], [0, 1], [0, 1]])
+
+
+def test_knn_compare():  # the ckdtree, sklearn, hnsw and keops implementations must agree on the same base/query points
+    base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]])
+    query = np.array([[1.0, 1], [2, 2], [4, 4]])
+    r0 = (
+        KNearestNeighbors(2, implementation="ckdtree", return_index=True, return_distance=False)
+        .fit(base)
+        .transform(query)
+    )
+    r1 = (
+        KNearestNeighbors(2, implementation="sklearn", return_index=True, return_distance=False)
+        .fit(base)
+        .transform(query)
+    )
+    r2 = (
+        KNearestNeighbors(2, implementation="hnsw", return_index=True, return_distance=False).fit(base).transform(query)
+    )
+    r3 = (
+        KNearestNeighbors(2, implementation="keops", return_index=True, return_distance=False)
+        .fit(base)
+        .transform(query)
+    )
+    assert np.array_equal(r0, r1) and np.array_equal(r0, r2) and np.array_equal(r0, r3)  # index arrays must be identical
+    # Same comparison with return_distance=True: r[0] is compared exactly, r[1] approximately.
+    r0 = (
+        KNearestNeighbors(2, implementation="ckdtree", return_index=True, return_distance=True)
+        .fit(base)
+        .transform(query)
+    )
+    r1 = (
+        KNearestNeighbors(2, implementation="sklearn", return_index=True, return_distance=True)
+        .fit(base)
+        .transform(query)
+    )
+    r2 = KNearestNeighbors(2, implementation="hnsw", return_index=True, return_distance=True).fit(base).transform(query)
+    r3 = (
+        KNearestNeighbors(2, implementation="keops", return_index=True, return_distance=True).fit(base).transform(query)
+    )
+    assert np.array_equal(r0[0], r1[0]) and np.array_equal(r0[0], r2[0]) and np.array_equal(r0[0], r3[0])
+    d0 = pytest.approx(r0[1])  # the ckdtree result is the reference for the approximate comparison
+    assert r1[1] == d0 and r2[1] == d0 and r3[1] == d0
+
+
+def test_knn_nop():
+    # Not a very useful configuration: with return_index=False and return_distance=False each call below must return None.
+    p = np.array([[0.0]])
+    assert None is KNearestNeighbors(
+        k=1, return_index=False, return_distance=False, implementation="sklearn"
+    ).fit_transform(p)
+    assert None is KNearestNeighbors(
+        k=1, return_index=False, return_distance=False, implementation="ckdtree"
+    ).fit_transform(p)
+    assert None is KNearestNeighbors(
+        k=1, return_index=False, return_distance=False, implementation="hnsw", ef=5
+    ).fit_transform(p)
+    assert None is KNearestNeighbors(
+        k=1, return_index=False, return_distance=False, implementation="keops"
+    ).fit_transform(p)
+    assert None is KNearestNeighbors(
+        k=1, return_index=False, return_distance=False, metric="precomputed"
+    ).fit_transform(p)
author	Marc Glisse <marc.glisse@inria.fr>	2020-04-20 18:02:20 +0200
committer	GitHub <noreply@github.com>	2020-04-20 18:02:20 +0200
commit	93cd1240ef65d8883ec624e6e393c09969bf4d6f (patch)
tree	1b6f5d79350bdcbfb4ceae5fd534ca4e558f4137
parent	6a397d32ad4e771aab7d8e2da88e4b857258d126 (diff)
parent	9ef7ba65367ab2ff92bf66b1b8166c5990530b76 (diff)