Merge remote-tracking branch 'origin/master' into wass-autodiff

author: Marc Glisse <marc.glisse@inria.fr> 2020-04-20 18:41:59 +0200
committer: Marc Glisse <marc.glisse@inria.fr> 2020-04-20 18:41:59 +0200
commit: 0393fdd3da2b5e403757c0f3418919c81ccbdd76 (patch)
tree: 008b752c6069b165efee50cb928adc8267343101 /src
parent: 1086b8cad7c1ea2a02742dfc44aef036a674f5d3 (diff)
parent: 93cd1240ef65d8883ec624e6e393c09969bf4d6f (diff)
10 files changed, 671 insertions, 5 deletions
diff --git a/src/cmake/modules/GUDHI_third_party_libraries.cmake b/src/cmake/modules/GUDHI_third_party_libraries.cmake
index 2d010483..0abe66b7 100644
--- a/src/cmake/modules/GUDHI_third_party_libraries.cmake
+++ b/src/cmake/modules/GUDHI_third_party_libraries.cmake
@@ -150,6 +150,25 @@ function( find_python_module PYTHON_MODULE_NAME )
   endif()
 endfunction( find_python_module )
 
+# For modules that do not define module.__version__: only sets <UPPERCASE_MODULE_NAME>_FOUND (TRUE/FALSE) in the caller's scope
+function( find_python_module_no_version PYTHON_MODULE_NAME )
+  string(TOUPPER "${PYTHON_MODULE_NAME}" PYTHON_MODULE_NAME_UP)
+  execute_process(
+          COMMAND "${PYTHON_EXECUTABLE}" -c "import ${PYTHON_MODULE_NAME}"
+          RESULT_VARIABLE PYTHON_MODULE_RESULT
+          OUTPUT_QUIET
+          ERROR_VARIABLE PYTHON_MODULE_ERROR)
+  if(PYTHON_MODULE_RESULT EQUAL 0)
+    message ("++ Python module ${PYTHON_MODULE_NAME} found")
+    set(${PYTHON_MODULE_NAME_UP}_FOUND TRUE PARENT_SCOPE)
+  else()
+    message ("PYTHON_MODULE_NAME = ${PYTHON_MODULE_NAME}
+     - PYTHON_MODULE_RESULT = ${PYTHON_MODULE_RESULT}
+     - PYTHON_MODULE_ERROR = ${PYTHON_MODULE_ERROR}")
+    set(${PYTHON_MODULE_NAME_UP}_FOUND FALSE PARENT_SCOPE)
+  endif()
+endfunction( find_python_module_no_version )
+
 if( PYTHONINTERP_FOUND )
   find_python_module("cython")
   find_python_module("pytest")
@@ -160,6 +179,10 @@ if( PYTHONINTERP_FOUND )
   find_python_module("sklearn")
   find_python_module("ot")
   find_python_module("pybind11")
+  find_python_module("torch")
+  find_python_module("pykeops")
+  find_python_module("eagerpy")
+  find_python_module_no_version("hnswlib")
 endif()
 
 if(NOT GUDHI_PYTHON_PATH)
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index a91ca30a..10dcd161 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -78,6 +78,19 @@ if(PYTHONINTERP_FOUND)
     if(OT_FOUND)
       add_gudhi_debug_info("POT version ${OT_VERSION}")
     endif()
+    if(HNSWLIB_FOUND)
+      # Does not have a version number...
+      add_gudhi_debug_info("HNSWlib found")
+    endif()
+    if(TORCH_FOUND)
+      add_gudhi_debug_info("PyTorch version ${TORCH_VERSION}")
+    endif()
+    if(PYKEOPS_FOUND)
+      add_gudhi_debug_info("PyKeOps version ${PYKEOPS_VERSION}")
+    endif()
+    if(EAGERPY_FOUND)
+      add_gudhi_debug_info("EagerPy version ${EAGERPY_VERSION}")
+    endif()
 
     set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_RESULT_OF_USE_DECLTYPE', ")
     set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_ALL_NO_LIB', ")
@@ -400,6 +413,12 @@ if(PYTHONINTERP_FOUND)
     # Time Delay
     add_gudhi_py_test(test_time_delay)
 
+    # DTM
+    if(SCIPY_FOUND AND SKLEARN_FOUND AND TORCH_FOUND AND HNSWLIB_FOUND AND PYKEOPS_FOUND AND EAGERPY_FOUND)
+      add_gudhi_py_test(test_knn)
+      add_gudhi_py_test(test_dtm)
+    endif()
+
     # Documentation generation is available through sphinx - requires all modules
     if(SPHINX_PATH)
       if(MATPLOTLIB_FOUND)
diff --git a/src/python/doc/point_cloud.rst b/src/python/doc/point_cloud.rst
index c0d4b303..192f70db 100644
--- a/src/python/doc/point_cloud.rst
+++ b/src/python/doc/point_cloud.rst
@@ -21,10 +21,25 @@ Subsampling
    :special-members:
    :show-inheritance:
 
-TimeDelayEmbedding
-------------------
+Time Delay Embedding
+--------------------
 
 .. autoclass:: gudhi.point_cloud.timedelay.TimeDelayEmbedding
    :members:
    :special-members: __call__
 
+K nearest neighbors
+-------------------
+
+.. automodule:: gudhi.point_cloud.knn
+   :members:
+   :undoc-members:
+   :special-members: __init__
+
+Distance to measure
+-------------------
+
+.. automodule:: gudhi.point_cloud.dtm
+   :members:
+   :undoc-members:
+   :special-members: __init__
diff --git a/src/python/doc/point_cloud_sum.inc b/src/python/doc/point_cloud_sum.inc
index 0a159680..d4761aba 100644
--- a/src/python/doc/point_cloud_sum.inc
+++ b/src/python/doc/point_cloud_sum.inc
@@ -2,11 +2,11 @@
    :widths: 30 40 30
 
    +----------------------------------------------------------------+------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------+
-   | | :math:`(x_1, x_2, \ldots, x_d)`                              | Utilities to process point clouds: read from file, subsample, etc.     | :Author: Vincent Rouvreau                                                                                                   |
-   | | :math:`(y_1, y_2, \ldots, y_d)`                              |                                                                        |                                                                                                                             |
+   | | :math:`(x_1, x_2, \ldots, x_d)`                              | Utilities to process point clouds: read from file, subsample,          | :Authors: Vincent Rouvreau, Marc Glisse, Masatoshi Takenouchi                                                               |
+   | | :math:`(y_1, y_2, \ldots, y_d)`                              | find neighbors, embed time series in higher dimension, etc.            |                                                                                                                             |
    |                                                                |                                                                        | :Since: GUDHI 2.0.0                                                                                                         |
    |                                                                |                                                                        |                                                                                                                             |
-   |                                                                |                                                                        | :License: MIT (`GPL v3 </licensing/>`_)                                                                                     |
+   |                                                                |                                                                        | :License: MIT (`GPL v3 </licensing/>`_, BSD-3-Clause, Apache-2.0)                                                           |
    |                                                                | Parts of this package require CGAL.                                    |                                                                                                                             |
    |                                                                |                                                                        | :Requires: `Eigen <installation.html#eigen>`__ :math:`\geq` 3.1.0 and `CGAL <installation.html#cgal>`__ :math:`\geq` 4.11.0 |
    |                                                                |                                                                        |                                                                                                                             |
diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py
new file mode 100644
index 00000000..13e16d24
--- /dev/null
+++ b/src/python/gudhi/point_cloud/dtm.py
@@ -0,0 +1,70 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s):       Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+from .knn import KNearestNeighbors
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class DistanceToMeasure:
+    """
+    Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`.
+    """
+
+    def __init__(self, k, q=2, **kwargs):
+        """
+        Args:
+            k (int): number of neighbors (possibly including the point itself).
+            q (float): order used to compute the distance to measure. Defaults to 2.
+            kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that
+                metric="neighbors" means that :func:`transform` expects an array with the distances
+                to the k nearest neighbors.
+        """
+        self.k = k
+        self.q = q
+        self.params = kwargs
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)  # fit on X, then evaluate the distance to measure at the points of X; y is unused
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for mass points.
+        """
+        if self.params.setdefault("metric", "euclidean") != "neighbors":  # setdefault also stores the default metric, which transform() reads
+            self.knn = KNearestNeighbors(
+                self.k, return_index=False, return_distance=True, sort_results=False, **self.params
+            )
+            self.knn.fit(X)
+        return self
+
+    def transform(self, X):
+        """
+        Args:
+            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed",
+                or distances to the k nearest neighbors if metric is "neighbors" (if the array has more
+                than k columns, the remaining ones are ignored).
+
+        Returns:
+            numpy.array: a 1-d array with, for each point of X, its distance to the measure defined
+            by the argument of :func:`fit`.
+        """
+        if self.params["metric"] == "neighbors":
+            distances = X[:, : self.k]  # keep only the first k columns
+        else:
+            distances = self.knn.transform(X)
+        distances = distances ** self.q  # d_i ** q for each neighbor
+        dtm = distances.sum(-1) / self.k  # sum over the last axis, divided by k
+        dtm = dtm ** (1.0 / self.q)  # q-th root of the previous line
+        # We compute too many powers: 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller.
+        # TODO: add an option to skip the final root?
+        return dtm
diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py
new file mode 100644
index 00000000..4017e498
--- /dev/null
+++ b/src/python/gudhi/point_cloud/knn.py
@@ -0,0 +1,323 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s):       Marc Glisse
+#
+# Copyright (C) 2020 Inria
+#
+# Modification(s):
+#   - YYYY/MM Author: Description of the modification
+
+import numpy
+
+# TODO: https://github.com/facebookresearch/faiss
+
+__author__ = "Marc Glisse"
+__copyright__ = "Copyright (C) 2020 Inria"
+__license__ = "MIT"
+
+
+class KNearestNeighbors:
+    """
+    Class wrapping several implementations for computing the k nearest neighbors in a point set.
+    """
+
+    def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs):
+        """
+        Args:
+            k (int): number of neighbors (possibly including the point itself).
+            return_index (bool): if True, return the index of each neighbor.
+            return_distance (bool): if True, return the distance to each neighbor.
+            implementation (str): choice of the library that does the real work.
+
+                * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes large (10+) but the number of points remains low (less than a million). Only "minkowski" and its aliases are supported.
+                * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported.
+                * 'sklearn' for scikit-learn's NearestNeighbors. Note that this provides in particular an option algorithm="brute".
+                * 'hnsw' for hnswlib.Index. It can be very fast but does not provide guarantees. Only supports "euclidean" for now.
+                * None will try to select a sensible one (scipy if possible, scikit-learn otherwise).
+            metric (str): see `sklearn.neighbors.NearestNeighbors`.
+            eps (float): relative error when computing nearest neighbors with the cKDTree.
+            p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2.
+            n_jobs (int): number of jobs to schedule for parallel processing of nearest neighbors on the CPU.
+                If -1 is given all processors are used. Default: 1.
+            sort_results (bool): if True, then distances and indices of each point are
+                sorted on return, so that the first column contains the closest points.
+                Otherwise, neighbors are returned in an arbitrary order. Defaults to True.
+            enable_autodiff (bool): if the input is a torch.tensor, jax.numpy.ndarray or tensorflow.Tensor, this
+                instructs the function to compute distances in a way that works with automatic differentiation.
+                This is experimental and not supported for all metrics. Defaults to False.
+            kwargs: additional parameters are forwarded to the backends.
+        """
+        self.k = k
+        self.return_index = return_index
+        self.return_distance = return_distance
+        self.metric = metric
+        self.params = kwargs
+        # canonicalize: "euclidean", "manhattan" and "chebyshev" become metric "minkowski" with p = 2, 1 and inf
+        if metric == "euclidean":
+            self.params["p"] = 2
+            self.metric = "minkowski"
+        elif metric == "manhattan":
+            self.params["p"] = 1
+            self.metric = "minkowski"
+        elif metric == "chebyshev":
+            self.params["p"] = numpy.inf
+            self.metric = "minkowski"
+        elif metric == "minkowski":
+            self.params["p"] = kwargs.get("p", 2)
+        if self.params.get("implementation") in {"keops", "ckdtree"}:
+            assert self.metric == "minkowski"
+        if self.params.get("implementation") == "hnsw":
+            assert self.metric == "minkowski" and self.params["p"] == 2
+        if not self.params.get("implementation"):  # no backend requested (missing or None): pick one
+            if self.metric == "minkowski":
+                self.params["implementation"] = "ckdtree"
+            else:
+                self.params["implementation"] = "sklearn"
+        if not return_distance:  # the autodiff path of transform() only recomputes distances
+            self.params["enable_autodiff"] = False
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)  # X is used both as reference points and as query points; y is unused
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for reference points (also kept as self.ref_points). Returns self.
+        """
+        self.ref_points = X
+        if self.params.get("enable_autodiff", False):
+            import eagerpy as ep
+
+            X = ep.astensor(X)
+            if self.params["implementation"] != "keops" or not isinstance(X, ep.PyTorchTensor):
+                # I don't know a clever way to reuse a GPU tensor from tensorflow in pytorch
+                # without copying to/from the CPU.
+                X = X.numpy()
+        if self.params["implementation"] == "ckdtree":
+            # sklearn could handle this, but it is much slower
+            from scipy.spatial import cKDTree
+
+            self.kdtree = cKDTree(X)
+
+        if self.params["implementation"] == "sklearn" and self.metric != "precomputed":
+            # FIXME: sklearn badly handles "precomputed"
+            from sklearn.neighbors import NearestNeighbors
+
+            nargs = {
+                k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}
+            }
+            self.nn = NearestNeighbors(n_neighbors=self.k, metric=self.metric, **nargs)  # keyword: positional use is rejected by recent scikit-learn
+            self.nn.fit(X)
+
+        if self.params["implementation"] == "hnsw":
+            import hnswlib
+
+            self.graph = hnswlib.Index("l2", len(X[0]))  # Actually returns squared distances
+            self.graph.init_index(
+                len(X), **{k: v for k, v in self.params.items() if k in {"ef_construction", "M", "random_seed"}}
+            )
+            n = self.params.get("num_threads")
+            if n is None:
+                n = self.params.get("n_jobs", 1)
+                self.params["num_threads"] = n  # remembered for knn_query in transform()
+            self.graph.add_items(X, num_threads=n)
+
+        return self
+
+    def transform(self, X):
+        """
+        Args:
+            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed".
+
+        Returns:
+            numpy.array: if return_index, an array of shape (len(X), k) with the indices (in the argument
+            of :func:`fit`) of the k nearest neighbors to the points of X. If return_distance, an array of the
+            same shape with the distances to those neighbors. If both, a tuple with the two arrays, in this order.
+        """
+        if self.params.get("enable_autodiff", False):
+            # pykeops does not support autodiff for kmin yet, but when it does in the future,
+            # we may want a special path.
+            import eagerpy as ep
+
+            save_return_index = self.return_index
+            self.return_index = True
+            self.return_distance = False
+            self.params["enable_autodiff"] = False
+            try:
+                newX = ep.astensor(X)
+                if self.params["implementation"] != "keops" or (
+                    not isinstance(newX, ep.PyTorchTensor) and not isinstance(newX, ep.NumPyTensor)
+                ):
+                    newX = newX.numpy()
+                else:
+                    newX = newX.raw
+                neighbors = self.transform(newX)
+            finally:
+                self.return_index = save_return_index
+                self.return_distance = True
+                self.params["enable_autodiff"] = True
+            # We can implement more later as needed
+            assert self.metric == "minkowski"
+            p = self.params["p"]
+            Y = ep.astensor(self.ref_points)
+            neighbor_pts = Y[
+                neighbors,
+            ]
+            diff = neighbor_pts - X[:, None, :]
+            if isinstance(diff, ep.PyTorchTensor):
+                # https://github.com/jonasrauber/eagerpy/issues/6
+                distances = ep.astensor(diff.raw.norm(p, -1))
+            else:
+                distances = diff.norms.lp(p, -1)
+            if self.return_index:
+                return neighbors, distances.raw
+            else:
+                return distances.raw
+
+        metric = self.metric
+        k = self.k
+
+        if metric == "precomputed":
+            # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural.
+            if self.return_index:
+                n_jobs = self.params.get("n_jobs", 1)
+                # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?!
+                if n_jobs == 1:
+                    neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
+                    if self.params.get("sort_results", True):
+                        X = numpy.take_along_axis(X, neighbors, axis=-1)
+                        ngb_order = numpy.argsort(X, axis=-1)
+                        neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+                    else:
+                        ngb_order = neighbors
+                    if self.return_distance:
+                        distances = numpy.take_along_axis(X, ngb_order, axis=-1)
+                        return neighbors, distances
+                    else:
+                        return neighbors
+                else:
+                    from joblib import Parallel, delayed, effective_n_jobs
+                    from sklearn.utils import gen_even_slices
+
+                    slices = gen_even_slices(len(X), effective_n_jobs(n_jobs))
+                    parallel = Parallel(backend="threading", n_jobs=n_jobs)
+                    if self.params.get("sort_results", True):
+
+                        def func(M):
+                            neighbors = numpy.argpartition(M, k - 1)[:, 0:k]
+                            Y = numpy.take_along_axis(M, neighbors, axis=-1)
+                            ngb_order = numpy.argsort(Y, axis=-1)
+                            return numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+
+                    else:
+
+                        def func(M):
+                            return numpy.argpartition(M, k - 1)[:, 0:k]
+
+                    neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+                    if self.return_distance:
+                        distances = numpy.take_along_axis(X, neighbors, axis=-1)
+                        return neighbors, distances
+                    else:
+                        return neighbors
+            if self.return_distance:
+                n_jobs = self.params.get("n_jobs", 1)
+                if n_jobs == 1:
+                    distances = numpy.partition(X, k - 1)[:, 0:k]
+                    if self.params.get("sort_results", True):
+                        # partition is not guaranteed to sort the lower half, although it often does
+                        distances.sort(axis=-1)
+                else:
+                    from joblib import Parallel, delayed, effective_n_jobs
+                    from sklearn.utils import gen_even_slices
+
+                    if self.params.get("sort_results", True):
+
+                        def func(M):
+                            # Not partitioning in place, because we should not modify the user's array?
+                            r = numpy.partition(M, k - 1)[:, 0:k]
+                            r.sort(axis=-1)
+                            return r
+
+                    else:
+                        func = lambda M: numpy.partition(M, k - 1)[:, 0:k]
+                    slices = gen_even_slices(len(X), effective_n_jobs(n_jobs))
+                    parallel = Parallel(backend="threading", n_jobs=n_jobs)
+                    distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+                return distances
+            return None
+
+        if self.params["implementation"] == "hnsw":
+            ef = self.params.get("ef")
+            if ef is not None:
+                self.graph.set_ef(ef)
+            neighbors, distances = self.graph.knn_query(X, k, num_threads=self.params["num_threads"])
+            # The k nearest neighbors are always sorted. I couldn't find it in the doc, but the code calls searchKnn,
+            # which returns a priority_queue, and then fills the return array backwards with top/pop on the queue.
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, numpy.sqrt(distances)
+                else:
+                    return neighbors
+            if self.return_distance:
+                return numpy.sqrt(distances)
+            return None
+
+        if self.params["implementation"] == "keops":
+            import torch
+            from pykeops.torch import LazyTensor
+
+            # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+            XX = torch.as_tensor(X, dtype=torch.float32)
+            if X is self.ref_points:
+                YY = XX
+            else:
+                YY = torch.as_tensor(self.ref_points, dtype=torch.float32)
+            p = self.params["p"]
+            if p == numpy.inf:
+                # Requires pykeops 1.4 or later
+                mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1)
+            elif p == 2:  # Any even integer?
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1)
+            else:
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1)
+
+            if self.return_index:
+                if self.return_distance:
+                    distances, neighbors = mat.Kmin_argKmin(k, dim=1)
+                    if p != numpy.inf:
+                        distances = distances ** (1.0 / p)
+                    return neighbors, distances
+                else:
+                    neighbors = mat.argKmin(k, dim=1)
+                    return neighbors
+            if self.return_distance:
+                distances = mat.Kmin(k, dim=1)
+                if p != numpy.inf:
+                    distances = distances ** (1.0 / p)
+                return distances
+            return None
+
+        if self.params["implementation"] == "ckdtree":
+            qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}}
+            distances, neighbors = self.kdtree.query(X, k=self.k, **qargs)
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, distances
+                else:
+                    return neighbors
+            if self.return_distance:
+                return distances
+            return None
+
+        assert self.params["implementation"] == "sklearn"
+        if self.return_distance:
+            distances, neighbors = self.nn.kneighbors(X, return_distance=True)
+            if self.return_index:
+                return neighbors, distances
+            else:
+                return distances
+        if self.return_index:
+            neighbors = self.nn.kneighbors(X, return_distance=False)
+            return neighbors
+        return None
diff --git a/src/python/gudhi/wasserstein/wasserstein.py b/src/python/gudhi/wasserstein/wasserstein.py
index f0c82962..5b61d176 100644
--- a/src/python/gudhi/wasserstein/wasserstein.py
+++ b/src/python/gudhi/wasserstein/wasserstein.py
@@ -15,6 +15,17 @@ try:
 except ImportError:
     print("POT (Python Optimal Transport) package is not installed. Try to run $ conda install -c conda-forge pot ; or $ pip install POT")
 
+
+# Currently unused, but Théo says it is likely to be used again.
+def _proj_on_diag(X):
+    '''
+    :param X: (n x 2) array encoding the points of a persistence diagram.
+    :returns: (n x 2) array encoding the (respective orthogonal) projections of the points onto the diagonal
+    '''
+    Z = (X[:,0] + X[:,1]) / 2.  # mean of the two coordinates, used for both columns of the result
+    return np.array([Z , Z]).T
+
+
 def _dist_to_diag(X, internal_p):
     '''
     :param X: (n x 2) array encoding the points of a persistent diagram.
diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py
new file mode 100755
index 00000000..859189fa
--- /dev/null
+++ b/src/python/test/test_dtm.py
@@ -0,0 +1,68 @@
+""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+    Author(s):       Marc Glisse
+
+    Copyright (C) 2020 Inria
+
+    Modification(s):
+      - YYYY/MM Author: Description of the modification
+"""
+
+from gudhi.point_cloud.dtm import DistanceToMeasure
+import numpy
+import pytest
+import torch
+
+
+def test_dtm_compare_euclidean():
+    pts = numpy.random.rand(1000, 4)
+    k = 3
+    dtm = DistanceToMeasure(k, implementation="ckdtree")
+    r0 = dtm.fit_transform(pts)  # reference result: every other configuration below is compared to it
+    dtm = DistanceToMeasure(k, implementation="sklearn")
+    r1 = dtm.fit_transform(pts)
+    assert r1 == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, implementation="sklearn", algorithm="brute")
+    r2 = dtm.fit_transform(pts)
+    assert r2 == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, implementation="hnsw")
+    r3 = dtm.fit_transform(pts)
+    assert r3 == pytest.approx(r0)
+    from scipy.spatial.distance import cdist
+
+    d = cdist(pts, pts)  # pairwise distance matrix, input for metric="precomputed"
+    dtm = DistanceToMeasure(k, metric="precomputed")
+    r4 = dtm.fit_transform(d)
+    assert r4 == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, metric="precomputed", n_jobs=2)  # same, through the joblib code path
+    r4b = dtm.fit_transform(d)
+    assert r4b == pytest.approx(r0)
+    dtm = DistanceToMeasure(k, implementation="keops")
+    r5 = dtm.fit_transform(pts)
+    assert r5 == pytest.approx(r0)
+    pts2 = torch.tensor(pts, requires_grad=True)
+    assert pts2.grad is None
+    dtm = DistanceToMeasure(k, implementation="keops", enable_autodiff=True)
+    r6 = dtm.fit_transform(pts2)
+    assert r6.detach().numpy() == pytest.approx(r0)
+    r6.sum().backward()  # a gradient must reach the input points, without NaN
+    assert not torch.isnan(pts2.grad).any()
+    pts2 = torch.tensor(pts, requires_grad=True)
+    assert pts2.grad is None
+    dtm = DistanceToMeasure(k, implementation="ckdtree", enable_autodiff=True)
+    r7 = dtm.fit_transform(pts2)
+    assert r7.detach().numpy() == pytest.approx(r0)
+    r7.sum().backward()  # same gradient check with the ckdtree backend
+    assert not torch.isnan(pts2.grad).any()
+
+
+def test_dtm_precomputed():
+    dist = numpy.array([[1.0, 3, 8], [1, 5, 5], [0, 2, 3]])  # rows of neighbor distances; only the first k=2 columns are used
+    dtm = DistanceToMeasure(2, q=1, metric="neighbors")
+    r = dtm.fit_transform(dist)
+    assert r == pytest.approx([2.0, 3, 1])  # q=1: mean of the first two columns of each row
+
+    dist = numpy.array([[2.0, 2], [0, 1], [3, 4]])
+    dtm = DistanceToMeasure(2, q=2, metric="neighbors")
+    r = dtm.fit_transform(dist)
+    assert r == pytest.approx([2.0, 0.707, 3.5355], rel=0.01)  # q=2: sqrt of the mean of the squared distances
diff --git a/src/python/test/test_knn.py b/src/python/test/test_knn.py
new file mode 100755
index 00000000..a87ec212
--- /dev/null
+++ b/src/python/test/test_knn.py
@@ -0,0 +1,130 @@
+""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+    Author(s):       Marc Glisse
+
+    Copyright (C) 2020 Inria
+
+    Modification(s):
+      - YYYY/MM Author: Description of the modification
+"""
+
+from gudhi.point_cloud.knn import KNearestNeighbors
+import numpy as np
+import pytest
+
+
+def test_knn_explicit():
+    """Check KNearestNeighbors on a tiny 2D example with hand-written expected results.
+
+    Covers the manhattan, chebyshev, minkowski (p=3) and precomputed metrics.
+    """
+    base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]])
+    query = np.array([[1.0, 1], [2, 2], [4, 4]])
+
+    # Manhattan, indices and distances: the result unpacks as (indices, distances).
+    knn = KNearestNeighbors(2, metric="manhattan", return_distance=True, return_index=True)
+    knn.fit(base)
+    indices, distances = knn.transform(query)
+    assert indices == pytest.approx(np.array([[0, 1], [1, 0], [3, 2]]))
+    assert distances == pytest.approx(np.array([[0.0, 1], [1, 2], [1, 2]]))
+
+    # Chebyshev, distances only: implementation unspecified, keops, then keops with enable_autodiff=True.
+    cheb_distances = np.array([[0.0, 1], [1, 1], [1, 2]])
+    knn = KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False)
+    knn.fit(base)
+    assert knn.transform(query) == pytest.approx(cheb_distances)
+    knn = KNearestNeighbors(2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops")
+    assert knn.fit(base).transform(query) == pytest.approx(cheb_distances)
+    knn = KNearestNeighbors(
+        2, metric="chebyshev", return_distance=True, return_index=False, implementation="keops", enable_autodiff=True
+    )
+    assert knn.fit(base).transform(query) == pytest.approx(cheb_distances)
+
+    # Minkowski with p=3, indices only: implementation unspecified, then keops.
+    expected_indices = [[0, 1], [1, 0], [3, 2]]
+    knn = KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True)
+    knn.fit(base)
+    assert np.array_equal(knn.transform(query), expected_indices)
+    knn = KNearestNeighbors(2, metric="minkowski", p=3, return_distance=False, return_index=True, implementation="keops")
+    assert np.array_equal(knn.fit(base).transform(query), expected_indices)
+
+    # Precomputed metric: the 3x3 array is handed to fit_transform directly (presumably pairwise distances).
+    dist = np.array([[0.0, 3, 8], [1, 0, 5], [1, 2, 0]])
+    precomputed_indices = [[0, 1], [1, 0], [2, 0]]
+    precomputed_distances = [[0, 3], [0, 1], [0, 1]]
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False)
+    assert np.array_equal(knn.fit_transform(dist), precomputed_indices)
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, sort_results=True)
+    indices, distances = knn.fit_transform(dist)
+    assert np.array_equal(indices, precomputed_indices)
+    assert np.array_equal(distances, precomputed_distances)
+
+    # Second time in parallel
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=False, n_jobs=2, sort_results=True)
+    assert np.array_equal(knn.fit_transform(dist), precomputed_indices)
+    knn = KNearestNeighbors(2, metric="precomputed", return_index=True, return_distance=True, n_jobs=2)
+    indices, distances = knn.fit_transform(dist)
+    assert np.array_equal(indices, precomputed_indices)
+    assert np.array_equal(distances, precomputed_distances)
+
+
+def test_knn_compare():
+    """Check that the sklearn, hnsw and keops implementations agree with ckdtree.
+
+    Each implementation is fitted on the same four 2D points and queried with the
+    same three points (k=2): first for indices only, then for indices and distances.
+    """
+    base = np.array([[1.0, 1], [1, 2], [4, 2], [4, 3]])
+    query = np.array([[1.0, 1], [2, 2], [4, 4]])
+    implementations = ("ckdtree", "sklearn", "hnsw", "keops")
+
+    # Indices only: compute every result first, then compare each one exactly to ckdtree's.
+    index_only = {}
+    for name in implementations:
+        knn = KNearestNeighbors(
+            2, implementation=name, return_index=True, return_distance=False
+        )
+        index_only[name] = knn.fit(base).transform(query)
+    reference = index_only["ckdtree"]
+    assert np.array_equal(reference, index_only["sklearn"])
+    assert np.array_equal(reference, index_only["hnsw"])
+    assert np.array_equal(reference, index_only["keops"])
+
+    # Indices and distances: each result is an (indices, distances) pair.
+    with_distance = {}
+    for name in implementations:
+        knn = KNearestNeighbors(
+            2, implementation=name, return_index=True, return_distance=True
+        )
+        with_distance[name] = knn.fit(base).transform(query)
+    ref_indices, ref_distances = with_distance["ckdtree"]
+
+    # Indices must match exactly.
+    assert np.array_equal(ref_indices, with_distance["sklearn"][0])
+    assert np.array_equal(ref_indices, with_distance["hnsw"][0])
+    assert np.array_equal(ref_indices, with_distance["keops"][0])
+    # Distances only need to match approximately.
+    d0 = pytest.approx(ref_distances)
+    assert with_distance["sklearn"][1] == d0
+    assert with_distance["hnsw"][1] == d0
+    assert with_distance["keops"][1] == d0
+
+
+def test_knn_nop():
+    """Check that fit_transform returns None when nothing is requested.
+
+    Every configuration below sets return_index=False and return_distance=False.
+    """
+    # This doesn't look super useful...
+    p = np.array([[0.0]])
+    # Extra constructor arguments for each configuration under test.
+    configurations = (
+        {"implementation": "sklearn"},
+        {"implementation": "ckdtree"},
+        {"implementation": "hnsw", "ef": 5},
+        {"implementation": "keops"},
+        {"metric": "precomputed"},
+    )
+    for extra in configurations:
+        knn = KNearestNeighbors(k=1, return_index=False, return_distance=False, **extra)
+        assert knn.fit_transform(p) is None
diff --git a/src/python/test/test_wasserstein_distance.py b/src/python/test/test_wasserstein_distance.py
index c6d6b346..6bfcb2ee 100755
--- a/src/python/test/test_wasserstein_distance.py
+++ b/src/python/test/test_wasserstein_distance.py
@@ -8,6 +8,7 @@
       - YYYY/MM Author: Description of the modification
 """
 
+from gudhi.wasserstein.wasserstein import _proj_on_diag
 from gudhi.wasserstein import wasserstein_distance as pot
 from gudhi.hera import wasserstein_distance as hera
 import numpy as np
@@ -17,6 +18,12 @@ __author__ = "Theo Lacombe"
 __copyright__ = "Copyright (C) 2019 Inria"
 __license__ = "MIT"
 
+def test_proj_on_diag():
+    """Expected rows are ((b+d)/2, (b+d)/2) for each input (b, d); an empty diagram stays empty."""
+    expected = [[1., 1.], [1.5, 1.5], [4., 4.]]
+    assert np.array_equal(_proj_on_diag(np.array([[1., 1.], [1., 2.], [3., 5.]])), expected)
+    assert np.array_equal(_proj_on_diag(np.empty((0, 2))), np.empty((0, 2)))
+
 def _basic_wasserstein(wasserstein_distance, delta, test_infinity=True, test_matching=True):
     diag1 = np.array([[2.7, 3.7], [9.6, 14.0], [34.2, 34.974]])
     diag2 = np.array([[2.8, 4.45], [9.5, 14.1]])
author	Marc Glisse <marc.glisse@inria.fr>	2020-04-20 18:41:59 +0200
committer	Marc Glisse <marc.glisse@inria.fr>	2020-04-20 18:41:59 +0200
commit	0393fdd3da2b5e403757c0f3418919c81ccbdd76 (patch)
tree	008b752c6069b165efee50cb928adc8267343101 /src
parent	1086b8cad7c1ea2a02742dfc44aef036a674f5d3 (diff)
parent	93cd1240ef65d8883ec624e6e393c09969bf4d6f (diff)