author    Marc Glisse <marc.glisse@inria.fr>    2020-05-21 23:50:12 +0200
committer Marc Glisse <marc.glisse@inria.fr>    2020-05-21 23:50:12 +0200
commit    cdc289bb4b95943a01d9dc787255b5c3b8bd79ae (patch)
tree      f5d1122c4ae94fa62f84eec7078b2c797251ead3 /src/python/gudhi/clustering
parent    8cd34b4f0a6f5ffe8cfc16d2bb5856e5f6400216 (diff)
no input_type
Diffstat (limited to 'src/python/gudhi/clustering')
-rw-r--r--  src/python/gudhi/clustering/tomato.py  |  74
1 file changed, 44 insertions(+), 30 deletions(-)
diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
index 75296ab3..a05b9d32 100644
--- a/src/python/gudhi/clustering/tomato.py
+++ b/src/python/gudhi/clustering/tomato.py
@@ -30,7 +30,7 @@ class Tomato:
children_: ndarray of shape (n_leaves_-1,2)
The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i
params_: dict
- Parameters like input_type, metric, etc
+ Parameters like metric, etc
"""
# Not documented for now, because I am not sure how useful it is.
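
A quick illustration of the children_ layout described above; the array below is made up for the example, not taken from the library:

# Hypothetical merge tree with n_leaves_ = 4: values below n_leaves_ are leaves,
# larger values refer to the node created at an earlier iteration.
import numpy
n_leaves_ = 4
children_ = numpy.array([[0, 1], [2, 3], [4, 5]])
for i, (a, b) in enumerate(children_):
    print("iteration", i, "merges", a, "and", b, "into node", n_leaves_ + i)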
@@ -39,9 +39,7 @@ class Tomato:
def __init__(
self,
- # FIXME: fold input_type into metric
- input_type="points",
- metric=None,
+ metric="minkowski",
graph_type="knn",
density_type="logDTM",
n_clusters=None,
@@ -54,27 +52,25 @@ class Tomato:
Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later.
Args:
- input_type (str): 'points', 'distance_matrix' or 'neighbors'.
- metric (None|Callable): If None, use Minkowski of parameter p.
- graph_type (str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'.
+ metric (str|Callable): Describes how to interpret the argument of `fit()`. Defaults to Minkowski of parameter p.
+ graph_type (str): 'manual', 'knn' or 'radius'. Ignored if metric is 'neighbors'.
density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'.
kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity.
k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10.
k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k.
r (float): size of a neighborhood if graph_type is 'radius'.
- eps (float): approximation factor when computing nearest neighbors (currently ignored with a GPU).
- gpu (bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million).
+ eps (float): approximation factor when computing neighbors with a kd-tree (ignored in many cases).
n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get the maximal number of clusters.
merge_threshold (float): minimum prominence of a cluster so it doesn't get merged.
symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false.
- p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2.
+ p (float): norm L^p on input points. Defaults to 2.
q (float): order used to compute the distance to measure. Defaults to dim. Beware that when the dimension is large, this can easily cause overflows.
dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the dimension, or 2 when the dimension cannot be read from the input (metric is "neighbors" or "precomputed").
- n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1.
+ n_jobs (int): Number of jobs to schedule for parallel processing on the CPU. If -1 is given all processors are used. Default: 1.
+ params: extra parameters are passed to :class:`~gudhi.point_cloud.knn.KNearestNeighbors` and :class:`~gudhi.point_cloud.dtm.DTMDensity`.
"""
# Should metric='precomputed' mean input_type='distance_matrix'?
# Should we be able to pass metric='minkowski' (what None does currently)?
- self.input_type_ = input_type
self.metric_ = metric
self.graph_type_ = graph_type
self.density_type_ = density_type
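
For context, a hedged usage sketch of the constructor after this change, based only on the docstring above (the parameter values are illustrative):

from gudhi.clustering.tomato import Tomato

# Default Minkowski metric on point coordinates.
t1 = Tomato(graph_type="knn", density_type="logDTM", k=10)
# Any other scikit-learn metric string (or a callable) is converted to a distance matrix internally.
t2 = Tomato(metric="cosine", graph_type="knn", density_type="logDTM")
# The caller will provide a full distance matrix to fit().
t3 = Tomato(metric="precomputed", graph_type="radius", r=0.5, density_type="DTM")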
@@ -86,9 +82,10 @@ class Tomato:
raise ValueError("Cannot specify both a merge threshold and a number of clusters")
def fit(self, X, y=None, weights=None):
+ #FIXME: Iterable -> Sequence?
"""
Args:
- X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance_matrix (full, not just a triangle), or list of neighbors for each point (points are represented by their index, starting from 0), according to input_type.
+ X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if metric is "neighbors".
weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point
"""
# TODO: First detect if this is a new call with the same data (only threshold changed?)
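
The three interpretations of X described in the updated docstring, as a hedged sketch (the data and parameter values are illustrative only):

import numpy
from scipy.spatial.distance import cdist
from gudhi.clustering.tomato import Tomato

pts = numpy.random.rand(30, 2)

# Default point metric: X holds (n, d) coordinates.
Tomato(density_type="DTM", k=5, n_clusters=2).fit(pts)

# metric="precomputed": X is a full (n, n) distance matrix, not just a triangle.
Tomato(metric="precomputed", graph_type="radius", r=0.3, density_type="manual").fit(
    cdist(pts, pts), weights=numpy.ones(len(pts)))

# metric="neighbors": X lists each point's neighbors by index; the density must then be manual.
ngb = [[0, 1], [1, 0, 2], [2, 1]]
Tomato(metric="neighbors", graph_type="manual", density_type="manual").fit(
    ngb, weights=numpy.array([1.0, 2.0, 1.5]))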
@@ -100,22 +97,22 @@ class Tomato:
if density_type == "manual":
raise ValueError("If density_type is 'manual', you must provide weights to fit()")
- input_type = self.input_type_
- if input_type == "points":
+ metric = self.metric_
+ if metric not in ["precomputed", "neighbors"]:
self.points_ = X
# FIXME: restrict this strongly
- if input_type == "points" and self.metric_:
+ if metric not in ["precomputed", "neighbors", "minkowski", "euclidean"]:
from sklearn.metrics import pairwise_distances
- X = pairwise_distances(X, metric=self.metric_, n_jobs=self.params_.get("n_jobs"))
- input_type = "distance_matrix"
+ X = pairwise_distances(X, metric=metric, n_jobs=self.params_.get("n_jobs"))
+ metric = "precomputed"
need_knn = 0
need_knn_ngb = False
need_knn_dist = False
if self.graph_type_ == "knn":
- k_graph = self.params_["k"]
+ k_graph = self.params_.get("k", 10) # FIXME: What if X has fewer than 10 points?
need_knn = k_graph
need_knn_ngb = True
if self.density_type_ in ["DTM", "logDTM"]:
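
A standalone sketch of the fallback in this hunk: any metric that is not handled natively is expanded into a full distance matrix with scikit-learn and then treated as "precomputed" (the data and metric below are illustrative):

import numpy
from sklearn.metrics import pairwise_distances

X = numpy.random.rand(10, 3)
metric = "cosine"  # neither "minkowski"/"euclidean" nor one of the special values

if metric not in ["precomputed", "neighbors", "minkowski", "euclidean"]:
    X = pairwise_distances(X, metric=metric, n_jobs=1)
    metric = "precomputed"  # downstream code now sees an (n, n) distance matrix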
@@ -138,35 +135,53 @@ class Tomato:
elif need_knn_dist:
knn_dist = knn
if self.density_type_ in ["DTM", "logDTM"]:
- if self.metric_ in ["neighbors", "precomputed"]:
+ if metric in ["neighbors", "precomputed"]:
dim = self.params_.get("dim", 2)
else:
- dim = len(X[0]) # FIXME for distance matrix
+ dim = len(X[0])
q = self.params_.get("q", dim)
weights = DTMDensity(k=k_DTM, metric="neighbors", dim=dim, q=q).fit_transform(knn_dist)
if self.density_type_ == "logDTM":
weights = numpy.log(weights)
- if input_type == "distance_matrix" and self.graph_type_ == "radius":
- # TODO: parallelize
- X = numpy.array(X)
+ if metric == "precomputed" and self.graph_type_ == "radius":
+ # TODO: parallelize?
+ X = numpy.asarray(X)
r = self.params_["r"]
self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X]
- if input_type == "neighbors":
+ if self.graph_type_ == "radius" and metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]:
+ from scipy.spatial import cKDTree
+ t = cKDTree(X)
+ # TODO: handle "l1" and "l2" aliases?
+ p = self.params_.get("p")
+ if metric == "euclidean":
+ assert p is None or p == 2, "p=" + str(p) + " is not consistent with metric='euclidean'"
+ p = 2
+ elif metric == "manhattan":
+ assert p is None or p == 1, "p=" + str(p) + " is not consistent with metric='manhattan'"
+ p = 1
+ elif metric == "chebyshev":
+ assert p is None or p == numpy.inf, "p=" + str(p) + " is not consistent with metric='chebyshev'"
+ p = numpy.inf
+ elif p is None:
+ p = 2 # the default
+ eps = self.params_.get("eps", 0)
+ self.neighbors_ = t.query_ball_tree(t, r=self.params_["r"], p=p, eps=eps)
+
+ if metric == "neighbors":
self.neighbors_ = X
assert density_type == "manual"
if self.density_type_ in {"KDE", "logKDE"}:
- # FIXME: replace most assert with raise ValueError("blabla")
- # assert input_type == "points"
+ assert metric not in ["neighbors", "precomputed"], "Scikit-learn's KernelDensity requires point coordinates"
kde_params = self.params_.get("kde_params", dict())
from sklearn.neighbors import KernelDensity
weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_)
if self.density_type_ == "KDE":
weights = numpy.exp(weights)
- # TODO: do it at the C++ level and/or in parallel if this is too slow
+ # TODO: do it at the C++ level and/or in parallel if this is too slow?
if self.params_.get("symmetrize_graph"):
self.neighbors_ = [set(line) for line in self.neighbors_]
for i, line in enumerate(self.neighbors_):
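
For reference, a self-contained sketch of the radius-graph construction added in this hunk, calling scipy's cKDTree the same way the new code does (the point cloud and the values of r, p and eps are illustrative):

import numpy
from scipy.spatial import cKDTree

X = numpy.random.rand(50, 2)
r, p, eps = 0.2, 2, 0  # radius, Minkowski exponent, kd-tree approximation factor

t = cKDTree(X)
# One list of neighbor indices per point; each point is within distance 0 of itself, so it is included.
neighbors = t.query_ball_tree(t, r=r, p=p, eps=eps)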
@@ -267,8 +282,7 @@ if __name__ == "__main__":
a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)]
a = numpy.random.rand(500, 2)
t = Tomato(
- input_type="points",
- metric="Euclidean",
+ metric="euclidean",
graph_type="knn",
density_type="DTM",
n_clusters=2,