From c8c942c43643131a7ef9899826a7095e497150fe Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Thu, 26 Mar 2020 22:10:26 +0100
Subject: cmake

---
 .../modules/GUDHI_third_party_libraries.cmake |   3 +
 src/python/CMakeLists.txt                     |  14 ++
 src/python/gudhi/point_cloud/dtm.py           |  40 +++++
 src/python/gudhi/point_cloud/knn.py           | 193 +++++++++++++++++++++
 src/python/test/test_dtm.py                   |  32 ++++
 5 files changed, 282 insertions(+)
 create mode 100644 src/python/gudhi/point_cloud/dtm.py
 create mode 100644 src/python/gudhi/point_cloud/knn.py
 create mode 100755 src/python/test/test_dtm.py

diff --git a/src/cmake/modules/GUDHI_third_party_libraries.cmake b/src/cmake/modules/GUDHI_third_party_libraries.cmake
index 2d010483..c2039674 100644
--- a/src/cmake/modules/GUDHI_third_party_libraries.cmake
+++ b/src/cmake/modules/GUDHI_third_party_libraries.cmake
@@ -160,6 +160,9 @@ if( PYTHONINTERP_FOUND )
   find_python_module("sklearn")
   find_python_module("ot")
   find_python_module("pybind11")
+  find_python_module("torch")
+  find_python_module("hnswlib")
+  find_python_module("pykeops")
 endif()
 
 if(NOT GUDHI_PYTHON_PATH)
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index f00966a5..d26d3e6e 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -78,6 +78,15 @@ if(PYTHONINTERP_FOUND)
     if(OT_FOUND)
       add_gudhi_debug_info("POT version ${OT_VERSION}")
     endif()
+    if(HNSWLIB_FOUND)
+      add_gudhi_debug_info("HNSWlib version ${HNSWLIB_VERSION}")
+    endif()
+    if(TORCH_FOUND)
+      add_gudhi_debug_info("PyTorch version ${TORCH_VERSION}")
+    endif()
+    if(PYKEOPS_FOUND)
+      add_gudhi_debug_info("PyKeOps version ${PYKEOPS_VERSION}")
+    endif()
 
     set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_RESULT_OF_USE_DECLTYPE', ")
     set(GUDHI_PYTHON_EXTRA_COMPILE_ARGS "${GUDHI_PYTHON_EXTRA_COMPILE_ARGS}'-DBOOST_ALL_NO_LIB', ")
@@ -399,6 +408,11 @@ if(PYTHONINTERP_FOUND)
     # Time Delay
     add_gudhi_py_test(test_time_delay)
 
+    # DTM
+    if(SCIPY_FOUND AND SKLEARN_FOUND AND TORCH_FOUND AND HNSWLIB_FOUND AND PYKEOPS_FOUND)
+      add_gudhi_py_test(test_dtm)
+    endif()
+
     # Documentation generation is available through sphinx - requires all modules
     if(SPHINX_PATH)
       if(MATPLOTLIB_FOUND)
diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py
new file mode 100644
index 00000000..08f9ea60
--- /dev/null
+++ b/src/python/gudhi/point_cloud/dtm.py
@@ -0,0 +1,40 @@
+from .knn import KNN
+
+
+class DTM:
+    def __init__(self, k, q=2, **kwargs):
+        """
+        Args:
+            q (float): order used to compute the distance to measure. Defaults to 2.
+            kwargs: Same parameters as KNN, except that metric="neighbors" means that transform() expects an array with the distances to the k nearest neighbors.
+        """
+        self.k = k
+        self.q = q
+        self.params = kwargs
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for mass points
+        """
+        if self.params.setdefault("metric", "euclidean") != "neighbors":
+            self.knn = KNN(self.k, return_index=False, return_distance=True, **self.params)
+            self.knn.fit(X)
+        return self
+
+    def transform(self, X):
+        """
+        Args:
+            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed", or distances to the k nearest neighbors if metric is "neighbors" (if the array has more than k columns, the remaining ones are ignored).
+        """
+        if self.params["metric"] == "neighbors":
+            distances = X[:, : self.k]
+        else:
+            distances = self.knn.transform(X)
+        distances = distances ** self.q
+        dtm = distances.sum(-1) / self.k
+        dtm = dtm ** (1.0 / self.q)
+        return dtm
diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py
new file mode 100644
index 00000000..57078f1e
--- /dev/null
+++ b/src/python/gudhi/point_cloud/knn.py
@@ -0,0 +1,193 @@
+import numpy
+
+
+class KNN:
+    def __init__(self, k, return_index=True, return_distance=False, metric="euclidean", **kwargs):
+        """
+        Args:
+            k (int): number of neighbors (including the point itself).
+            return_index (bool): if True, return the index of each neighbor.
+            return_distance (bool): if True, return the distance to each neighbor.
+            implementation (str): Choice of the library that does the real work.
+
+                * 'keops' for a brute-force, CUDA implementation through pykeops. Useful when the dimension becomes
+                large (10+) but the number of points remains low (less than a million).
+                Only "minkowski" and its aliases are supported.
+                * 'ckdtree' for scipy's cKDTree. Only "minkowski" and its aliases are supported.
+                * 'sklearn' for scikit-learn's NearestNeighbors.
+                Note that this provides in particular an option algorithm="brute".
+                * 'hnsw' for hnswlib.Index. It is very fast but does not provide guarantees.
+                Only supports "euclidean" for now.
+                * None will try to select a sensible one (scipy if possible, scikit-learn otherwise).
+            metric (str): see `sklearn.neighbors.NearestNeighbors`.
+            eps (float): relative error when computing nearest neighbors with the cKDTree.
+            p (float): norm L^p on input points (including numpy.inf) if metric is "minkowski". Defaults to 2.
+            n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU.
+                If -1 is given all processors are used. Default: 1.
+
+            Additional parameters are forwarded to the backends.
+        """
+        self.k = k
+        self.return_index = return_index
+        self.return_distance = return_distance
+        self.metric = metric
+        self.params = kwargs
+        # canonicalize
+        if metric == "euclidean":
+            self.params["p"] = 2
+            self.metric = "minkowski"
+        elif metric == "manhattan":
+            self.params["p"] = 1
+            self.metric = "minkowski"
+        elif metric == "chebyshev":
+            self.params["p"] = numpy.inf
+            self.metric = "minkowski"
+        elif metric == "minkowski":
+            self.params["p"] = kwargs.get("p", 2)
+        if self.params.get("implementation") in {"keops", "ckdtree"}:
+            assert self.metric == "minkowski"
+        if self.params.get("implementation") == "hnsw":
+            assert self.metric == "minkowski" and self.params["p"] == 2
+        if not self.params.get("implementation"):
+            if self.metric == "minkowski":
+                self.params["implementation"] = "ckdtree"
+            else:
+                self.params["implementation"] = "sklearn"
+
+    def fit_transform(self, X, y=None):
+        return self.fit(X).transform(X)
+
+    def fit(self, X, y=None):
+        """
+        Args:
+            X (numpy.array): coordinates for reference points
+        """
+        self.ref_points = X
+        if self.params.get("implementation") == "ckdtree":
+            # sklearn could handle this, but it is much slower
+            from scipy.spatial import cKDTree
+            self.kdtree = cKDTree(X)
+
+        if self.params.get("implementation") == "sklearn" and self.metric != "precomputed":
+            # FIXME: sklearn badly handles "precomputed"
+            from sklearn.neighbors import NearestNeighbors
+
+            nargs = {k: v for k, v in self.params.items() if k in {"p", "n_jobs", "metric_params", "algorithm", "leaf_size"}}
+            self.nn = NearestNeighbors(n_neighbors=self.k, metric=self.metric, **nargs)
+            self.nn.fit(X)
+
+        if self.params.get("implementation") == "hnsw":
+            import hnswlib
+            self.graph = hnswlib.Index("l2", len(X[0]))  # Actually returns squared distances
+            self.graph.init_index(len(X), **{k: v for k, v in self.params.items() if k in {"ef_construction", "M", "random_seed"}})
+            n = self.params.get("num_threads")
+            if n is None:
+                n = self.params.get("n_jobs", 1)
+                self.params["num_threads"] = n
+            self.graph.add_items(X, num_threads=n)
+
+        return self
+
+    def transform(self, X):
+        """
+        Args:
+            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed"
+        """
+        metric = self.metric
+        k = self.k
+
+        if metric == "precomputed":
+            # scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural.
+            X = numpy.array(X)
+            if self.return_index:
+                neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
+                distances = numpy.take_along_axis(X, neighbors, axis=-1)
+                ngb_order = numpy.argsort(distances, axis=-1)
+                neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+                if self.return_distance:
+                    distances = numpy.take_along_axis(distances, ngb_order, axis=-1)
+                    return neighbors, distances
+                else:
+                    return neighbors
+            if self.return_distance:
+                distances = numpy.partition(X, k - 1)[:, 0:k]
+                # partition is not guaranteed to sort the lower half, although it often does
+                distances.sort(axis=-1)
+                return distances
+            return None
+
+        if self.params.get("implementation") == "hnsw":
+            ef = self.params.get("ef")
+            if ef is not None:
+                self.graph.set_ef(ef)
+            neighbors, distances = self.graph.knn_query(X, k, num_threads=self.params["num_threads"])
+            # The k nearest neighbors are always sorted. I couldn't find it in the doc, but the code calls searchKnn,
+            # which returns a priority_queue, and then fills the return array backwards with top/pop on the queue.
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, numpy.sqrt(distances)
+                else:
+                    return neighbors
+            if self.return_distance:
+                return numpy.sqrt(distances)
+            return None
+
+        if self.params.get("implementation") == "keops":
+            import torch
+            from pykeops.torch import LazyTensor
+
+            # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+            XX = torch.tensor(X, dtype=torch.float32)
+            if X is self.ref_points:
+                YY = XX
+            else:
+                YY = torch.tensor(self.ref_points, dtype=torch.float32)
+
+            p = self.params["p"]
+            if p == numpy.inf:
+                # Requires a version of pykeops strictly more recent than 1.3
+                mat = (LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs().max(-1)
+            elif p == 2:  # Any even integer?
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])) ** p).sum(-1)
+            else:
+                mat = ((LazyTensor(XX[:, None, :]) - LazyTensor(YY[None, :, :])).abs() ** p).sum(-1)
+
+            if self.return_index:
+                if self.return_distance:
+                    distances, neighbors = mat.Kmin_argKmin(k, dim=1)
+                    if p != numpy.inf:
+                        distances = distances ** (1.0 / p)
+                    return neighbors, distances
+                else:
+                    neighbors = mat.argKmin(k, dim=1)
+                    return neighbors
+            if self.return_distance:
+                distances = mat.Kmin(k, dim=1)
+                if p != numpy.inf:
+                    distances = distances ** (1.0 / p)
+                return distances
+            return None
+        # FIXME: convert everything back to numpy arrays or not?
+
+        if hasattr(self, "kdtree"):
+            qargs = {key: val for key, val in self.params.items() if key in {"p", "eps", "n_jobs"}}
+            distances, neighbors = self.kdtree.query(X, k=self.k, **qargs)
+            if self.return_index:
+                if self.return_distance:
+                    return neighbors, distances
+                else:
+                    return neighbors
+            if self.return_distance:
+                return distances
+            return None
+
+        if self.return_distance:
+            distances, neighbors = self.nn.kneighbors(X, return_distance=True)
+            if self.return_index:
+                return neighbors, distances
+            else:
+                return distances
+        if self.return_index:
+            neighbors = self.nn.kneighbors(X, return_distance=False)
+            return neighbors
+        return None
diff --git a/src/python/test/test_dtm.py b/src/python/test/test_dtm.py
new file mode 100755
index 00000000..57fdd131
--- /dev/null
+++ b/src/python/test/test_dtm.py
@@ -0,0 +1,32 @@
+""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+    Author(s):       Marc Glisse
+
+    Copyright (C) 2020 Inria
+
+    Modification(s):
+      - YYYY/MM Author: Description of the modification
+"""
+
+from gudhi.point_cloud.dtm import DTM
+import numpy
+
+
+def test_dtm_euclidean():
+    pts = numpy.random.rand(1000, 4)
+    k = 3
+    dtm = DTM(k, implementation="ckdtree")
+    print(dtm.fit_transform(pts))
+    dtm = DTM(k, implementation="sklearn")
+    print(dtm.fit_transform(pts))
+    dtm = DTM(k, implementation="sklearn", algorithm="brute")
+    print(dtm.fit_transform(pts))
+    dtm = DTM(k, implementation="hnsw")
+    print(dtm.fit_transform(pts))
+    from scipy.spatial.distance import cdist
+    d = cdist(pts, pts)
+    dtm = DTM(k, metric="precomputed")
+    print(dtm.fit_transform(d))
+    dtm = DTM(k, implementation="keops")
+    print(dtm.fit_transform(pts))
+
-- 
cgit v1.2.3
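
For reference, a minimal usage sketch of the API added by this patch (not part of the commit itself; the random point cloud and the parameter values simply mirror test_dtm.py):

    import numpy
    from gudhi.point_cloud.dtm import DTM
    from gudhi.point_cloud.knn import KNN

    pts = numpy.random.rand(1000, 4)

    # Distance to measure of each point, computed from its 3 nearest neighbors
    # (Euclidean metric by default, scipy cKDTree backend).
    dtm = DTM(3, implementation="ckdtree")
    values = dtm.fit_transform(pts)

    # Indices and distances of the 3 nearest neighbors of every point
    # (scikit-learn backend).
    knn = KNN(3, return_index=True, return_distance=True, implementation="sklearn")
    neighbors, distances = knn.fit_transform(pts)

Note that with implementation="keops" the results come back as torch tensors rather than numpy arrays, as hinted by the FIXME in knn.py.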