author     Marc Glisse <marc.glisse@inria.fr>  2020-02-26 22:03:18 +0100
committer  Marc Glisse <marc.glisse@inria.fr>  2020-02-26 22:03:18 +0100
commit     07c7e5841c961869b927875bbca91d10287f9fab (patch)
tree       d245deef34d9a20a6eef36a1747276e445d530e5 /src/python/gudhi/clustering
parent     5ffe71dab84d9cbcaccd55324a73018b6915e886 (diff)
Initial commit of tomato
Diffstat (limited to 'src/python/gudhi/clustering')
-rw-r--r--  src/python/gudhi/clustering/__init__.py    0
-rw-r--r--  src/python/gudhi/clustering/_tomato.cc   271
-rw-r--r--  src/python/gudhi/clustering/tomato.py    327
3 files changed, 598 insertions, 0 deletions
diff --git a/src/python/gudhi/clustering/__init__.py b/src/python/gudhi/clustering/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/python/gudhi/clustering/__init__.py
diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
new file mode 100644
index 00000000..1c896174
--- /dev/null
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -0,0 +1,271 @@
+// g++ -O3 -Wall -shared -fPIC `python3 -m pybind11 --includes` _tomato.cc -o _tomato`python3-config --extension-suffix`
+#include <boost/container/flat_map.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/property_map/property_map.hpp>
+#include <boost/property_map/transform_value_property_map.hpp>
+#include <boost/property_map/vector_property_map.hpp>
+#include <boost/property_map/function_property_map.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <vector>
+#include <unordered_map>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <iostream>
+
+namespace py=pybind11;
+
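+// getint() lets the generic code below accept a neighbor index either as a plain C++
+// integer or as a Python object (the second overload goes through pybind11's cast<int>()).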
+template<class T, class=std::enable_if_t<std::is_integral<T>::value>>
+int getint(int i){return i;}
+template<class T, class=decltype(std::declval<T>().template cast<int>())>
+int getint(T i){return i.template cast<int>();}
+
+// Raw clusters are clusters obtained through single-linkage, no merging.
+
+typedef int Point_index;
+typedef int Cluster_index;
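+// A Merge records that raw clusters `first` and `second` met at a point whose density is
+// `persist` below the density at the younger of the two cluster peaks (its prominence).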
+struct Merge { Cluster_index first, second; double persist; };
+
+template<class Neighbors, class Density, class Order, class ROrder>
+auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& density, Order const& order, ROrder const& rorder){
+ // point index --> index of raw cluster it belongs to
+ std::vector<Cluster_index> raw_cluster; raw_cluster.reserve(num_points);
+ // cluster index --> index of top point in the cluster
+ Cluster_index n_raw_clusters = 0; // current number of raw clusters seen
+ //
+ std::vector<Merge> merges;
+ struct Data { Cluster_index parent; int rank; Point_index max; }; // information on a cluster
+ std::vector<Data> ds_base;
+ // boost::vector_property_map does resize(size+1) for every new element, don't use it
+ auto ds_data=boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n)->Data&{return ds_base[n];});
+ auto ds_parent=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_data);
+ auto ds_rank=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_data);
+ boost::disjoint_sets<decltype(ds_rank),decltype(ds_parent)> ds(ds_rank,ds_parent); // on the clusters, not directly the points
+ std::vector<std::array<double,2>> persistence; // diagram (finite points)
+ boost::container::flat_map<Cluster_index,Cluster_index> adj_clusters; // first: the merged cluster, second: the raw cluster
+ // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only insert in the vector if merged is absent from the set
+
+ for(Point_index i = 0; i < num_points; ++i){
+ // auto&& ngb = neighbors[order[i]];
+ // TODO: have a specialization where we don't need python types and py::cast
+ // TODO: move py::cast and getint into Neighbors
+ py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work
+ adj_clusters.clear();
+ Point_index j = i; // highest neighbor
+ //for(Point_index k : ngb)
+ for(auto k_py : ngb)
+ {
+ Point_index k = rorder[getint(k_py)];
+ if(k >= i || k < 0) // ???
+ continue;
+ if(k < j)
+ j = k;
+ Cluster_index rk=raw_cluster[k];
+ adj_clusters.emplace(ds.find_set(rk),rk);
+ // does not insert if ck=ds.find_set(rk) already seen
+ // which rk we keep from those with the same ck is arbitrary
+ }
+ assert((Point_index)raw_cluster.size()==i);
+ if(i==j){ // local maximum -> new cluster
+ Cluster_index c = n_raw_clusters++;
+ ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access
+ ds.make_set(c);
+ ds_base[c].max=i; // max
+ raw_cluster.push_back(c);
+ continue;
+ }else{ // add i to the cluster of j
+ assert(j<i);
+ raw_cluster.push_back(raw_cluster[j]);
+ // FIXME: we are adding point i to the raw cluster of j, but that one might not be in adj_clusters, so we may merge clusters A and B through a point of C. It is strange, but I don't know if it can really cause problems. We could just not set j at all and use arbitrarily the first element of adj_clusters.
+ }
+ // possibly merge clusters
+ // we could sort, in case there are several merges, but that doesn't seem so useful
+ Cluster_index rj = raw_cluster[j];
+ Cluster_index cj = ds.find_set(rj);
+ Cluster_index orig_cj = cj;
+ for(auto ckk : adj_clusters){
+ Cluster_index rk = ckk.second;
+ Cluster_index ck = ckk.first;
+ if(ck==orig_cj)
+ continue;
+ assert(ck==ds.find_set(rk));
+ Point_index j = ds_base[cj].max;
+ Point_index k = ds_base[ck].max;
+ Point_index young = std::max(j,k);
+ Point_index old = std::min(j,k);
+ auto d_young = density[order[young]];
+ auto d_i = density[order[i]];
+ assert(d_young >= d_i);
+ // Always merge (the non-hierarchical algorithm would only conditionally merge here).
+ persistence.push_back({d_young, d_i});
+ assert(ds.find_set(rj) != ds.find_set(rk));
+ ds.link(cj,ck);
+ cj = ds.find_set(cj);
+ ds_base[cj].max=old; // just one parent, no need for find_set
+ // record the raw clusters, we don't know what will have already been merged.
+ merges.push_back({rj, rk, d_young-d_i});
+ }
+ }
+ {
+ boost::counting_iterator<int> b(0), e(ds_base.size());
+ ds.compress_sets(b, e);
+ // Now we stop using find_sets and look at the parent directly
+ // rank is reused to rename clusters contiguously 0, 1, etc
+ }
+ // Maximum for each connected component
+ std::vector<double> max_cc;
+ //for(Point_index i = 0; i < num_points; ++i){
+ // if(ds_base[ds_base[raw_cluster[i]].parent].max == i)
+ // max_cc.push_back(density[order[i]]);
+ //}
+ for(Cluster_index i = 0; i < n_raw_clusters; ++i){
+ if(ds_base[i].parent == i)
+ max_cc.push_back(density[order[ds_base[i].max]]);
+ }
+ assert((Cluster_index)(merges.size()+max_cc.size())==n_raw_clusters);
+
+ // TODO: create a "noise" cluster, merging all those not prominent enough?
+ //int nb_clusters=0;
+ //for(int i=0;i<(int)ds_base.size();++i){
+ // if(ds_parent[i]!=i) continue;
+ // ds_data[i].rank=nb_clusters++;
+ //}
+
+ // Replay the merges, in increasing order of prominence, to build the hierarchy
+ std::sort(merges.begin(), merges.end(), [](Merge const&a, Merge const&b){return a.persist < b.persist;});
+ std::vector<std::array<Cluster_index,2>> children; children.reserve(merges.size());
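+ // children[m] stores the current names of the two subtrees merged at step m; the new
+ // internal node is named n_raw_clusters + m (see ds_bas[...].name = i below), which is
+ // the same convention as the children_ attribute documented in tomato.py.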
+ {
+ struct Dat { Cluster_index parent; int rank; Cluster_index name; };
+ std::vector<Dat> ds_bas(2*n_raw_clusters-1);
+ Cluster_index i;
+ auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
+ auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
+ auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
+ boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
+ for(i=0;i<n_raw_clusters;++i) { ds.make_set(i); ds_bas[i].name=i; }
+ for(Merge const& m : merges){
+ Cluster_index j = ds.find_set(m.first);
+ Cluster_index k = ds.find_set(m.second);
+ assert(j!=k);
+ children.push_back({ds_bas[j].name,ds_bas[k].name});
+ ds.make_set(i);
+ ds.link(i,j);
+ ds.link(i,k);
+ ds_bas[ds.find_set(i)].name=i;
+ ++i;
+ }
+ }
+
+ std::vector<Cluster_index> raw_cluster_ordered(num_points);
+ for(int i=0; i<num_points; ++i) raw_cluster_ordered[i]=raw_cluster[rorder[i]];
+ // return raw_cluster, children, persistence
+ // TODO avoid copies: https://github.com/pybind/pybind11/issues/1042
+ return py::make_tuple(
+ py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()),
+ py::array(children.size(), children.data()),
+ py::array(persistence.size(), persistence.data()),
+ py::array(max_cc.size(), max_cc.data()));
+}
+
+auto merge(py::array_t<Cluster_index, py::array::c_style> children, Cluster_index n_leaves, Cluster_index n_final){
+ // Should this really be an error?
+ if(n_final > n_leaves)
+ throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters");
+ py::buffer_info cbuf = children.request();
+ if((cbuf.ndim!=2 || cbuf.shape[1]!=2) && (cbuf.ndim!=1 || cbuf.shape[0]!=0))
+ throw std::runtime_error("internal error: children have to be (n,2) or empty");
+ const int n_merges=cbuf.shape[0];
+ Cluster_index*d=(Cluster_index*)cbuf.ptr;
+ // Should this really be an error?
+ //std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n';
+ if(n_merges + n_final < n_leaves)
+ throw std::runtime_error(std::string("The number of clusters required ")+std::to_string(n_final)+" is smaller than the number of connected components "+std::to_string(n_leaves-n_merges));
+
+ struct Dat { Cluster_index parent; int rank; int name; };
+ std::vector<Dat> ds_bas(2*n_leaves-1);
+ auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
+ auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
+ auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
+ boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
+ Cluster_index i;
+ for(i=0;i<n_leaves;++i) { ds.make_set(i); ds_bas[i].name=-1; }
+ for(Cluster_index m=0; m < n_leaves-n_final; ++m){
+ Cluster_index j = ds.find_set(d[2*m]);
+ Cluster_index k = ds.find_set(d[2*m+1]);
+ assert(j!=k);
+ ds.make_set(i);
+ ds.link(i,j);
+ ds.link(i,k);
+ ++i;
+ }
+ Cluster_index next_cluster_name = 0;
+ std::vector<Cluster_index> ret; ret.reserve(n_leaves);
+ for(Cluster_index j=0;j<n_leaves;++j){
+ Cluster_index k = ds.find_set(j);
+ if(ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
+ ret.push_back(ds_bas[k].name);
+ }
+ return py::array(ret.size(), ret.data());
+}
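+// Example (hypothetical input): with children = {{0,1},{3,2}}, n_leaves = 3 and n_final = 2,
+// only the first of the two recorded merges is replayed, so leaves 0 and 1 end up in one
+// cluster and leaf 2 in another; the labels returned are renamed contiguously, e.g. {0,0,1}.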
+
+// Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
+auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density){
+ // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
+ // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to []
+ // (I _think_ handle is ok and we don't need object here)
+ py::buffer_info wbuf = density.request();
+ if(wbuf.ndim!=1)
+ throw std::runtime_error("density must be 1D");
+ const int n=wbuf.shape[0];
+ double*d=(double*)wbuf.ptr;
+ // Vector { 0, 1, ..., n-1 }
+ std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n));
+ // Permutation of the indices to get points in decreasing order of density
+ std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j){ return d[i] > d[j]; });
+ // Inverse permutation
+ std::vector<Point_index> rorder(n);
+ for(Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
+ // Used as:
+ // order[i] is the index of the point with i-th largest density
+ // rorder[i] is the rank of the i-th point in order of decreasing density
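+ // e.g. (hypothetical values) with d = {0.2, 0.9, 0.5}: order = {1, 2, 0} and rorder = {2, 0, 1}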
+ // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the output)
+ return tomato(n, ngb, d, order, rorder);
+}
+#if 0
+struct Vec {
+ int const* base;
+ int k;
+ int const* begin()const{return base;}
+ int const* end()const{return base+k;}
+ Vec operator*()const{return *this;}
+};
+auto plaf(py::array_t<int, py::array::c_style | py::array::forcecast> ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density, double threshold){
+ py::buffer_info nbuf = ngb.request();
+ if(nbuf.ndim!=2)
+ throw std::runtime_error("neighbors must be 2D");
+ const int n=nbuf.shape[0];
+ const int k=nbuf.shape[1];
+ int*nptr=(int*)nbuf.ptr;
+ auto neighbors=boost::adaptors::transform(boost::irange(0,n),[=](int i){return Vec{nptr+i*k,k};});
+ py::buffer_info wbuf = density.request();
+ if(wbuf.ndim!=1)
+ throw std::runtime_error("density must be 1D");
+ if(n!=wbuf.shape[0])
+ throw std::runtime_error("incompatible sizes");
+ double*d=(double*)wbuf.ptr;
+ return tomato(n,neighbors,d);
+}
+#endif
+PYBIND11_MODULE(_tomato, m) {
+ m.doc() = "Internals of tomato clustering";
+ m.def("doit", &plouf, "does the clustering");
+ //m.def("doit2", &plaf, "does the clustering faster");
+ m.def("merge", &merge, "merge clusters");
+}
+
+// https://github.com/pybind/pybind11/issues/1042 to convert a vector to a numpy array
+//
+// py::isinstance<py::array_t<std::int32_t>> (or py::isinstance<py::array> and check the dtype) and flags&c_style
+// or an overload (dropping forcecast?)
+// also handle the case where we don't get one big array, but each neighbor list is an array?
diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
new file mode 100644
index 00000000..e4430dbd
--- /dev/null
+++ b/src/python/gudhi/clustering/tomato.py
@@ -0,0 +1,327 @@
+import numpy
+from ._tomato import *
+
+# The fit/predict interface is not so well suited...
+class Tomato:
+ """
+ Clustering
+
+ This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point. A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural to provide your own. In particular, we do not provide anything specific to cluster pixels on images yet.
+
+ Attributes
+ ----------
+ n_clusters_: int
+ The number of clusters. Writing to it automatically adjusts labels_.
+ merge_threshold_: float
+ minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts labels_.
+ n_leaves_: int
+ number of leaves in the hierarchical tree
+ leaf_labels_: ndarray of shape (n_samples)
+ cluster labels for each point, at the very bottom of the hierarchy
+ labels_: ndarray of shape (n_samples)
+ cluster labels for each point
+ diagram_: ndarray of shape (n_leaves_,2)
+ persistence diagram (only the finite points)
+ max_density_per_cc_: ndarray of shape (n_connected_components)
+ maximum of the density function on each connected component
+ children_: ndarray of shape (n_leaves_-1,2)
+ The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i
+ params_: dict
+ Parameters like input_type, metric, etc
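+
+ A minimal usage sketch (mirroring the example at the bottom of this file; the parameter
+ values are only illustrative, not defaults):
+
+ t = Tomato(input_type='points', graph_type='knn', density_type='DTM', n_clusters=2, k=4)
+ t.fit(points)  # points: an (n_samples, dim) array
+ print(t.labels_)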
+ """
+ def __init__(self, input_type='points', metric=None, graph_type=None, density_type='manual', n_clusters=None, merge_threshold=None, eliminate_threshold=None, **params):
+ """
+ Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later.
+
+ Parameters
+ ----------
+ input_type(str): 'points', 'distance_matrix' or 'neighbors'.
+ metric(str or callable): FIXME ???
+ graph_type(str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'.
+ density_type(str): 'manual', 'DTM', 'logDTM' or 'kde'.
+ kde_params(dict): if density_type is 'kde', additional parameters passed directly to sklearn.neighbors.KernelDensity.
+ k(int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10.
+ k_DTM(int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k.
+ r(float): size of a neighborhood if graph_type is 'radius'
+ eps(float): approximation factor when computing nearest neighbors without a GPU
+ gpu(bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million).
+ n_clusters(int): number of clusters requested. Defaults to ???
+ merge_threshold(float): minimum prominence of a cluster so it doesn't get merged.
+ eliminate_threshold(float): minimum height of a cluster so it doesn't get eliminated
+ symmetrize_graph(bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false.
+ p(float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2.
+ p_DTM(float): order used to compute the distance to measure. Defaults to 2.
+ n_jobs(int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1.
+ """
+ self.input_type_ = input_type
+ self.metric_ = metric
+ self.graph_type_ = graph_type
+ self.density_type_ = density_type
+ self.params_ = params
+ self.__n_clusters = n_clusters
+ self.__merge_threshold = merge_threshold
+ self.eliminate_threshold_ = eliminate_threshold
+ if n_clusters and merge_threshold:
+ raise ValueError("Cannot specify both a merge threshold and a number of clusters")
+
+ def fit(self, X, y=None, weights=None):
+ """
+ Parameters
+ ----------
+ X(?): points or distance_matrix or list of neighbors
+ weights(ndarray of shape (n_samples)): if density_type == 'manual', a density estimate at each point
+ """
+ # TODO: First detect if this is a new call with the same data (only threshold changed?)
+ # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters.
+ if weights is not None:
+ density_type = 'manual'
+ else:
+ density_type = self.density_type_
+ if density_type == 'manual':
+ raise ValueError("If density_type is 'manual', you must provide weights to fit()")
+
+ if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'radius':
+ X = numpy.array(X)
+ r = self.params_['r']
+ self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X]
+
+ if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'knn':
+ k = self.params_['k']
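+ # numpy.argpartition puts the indices of the k smallest distances of each row in the
+ # first k columns (in no particular order), which is all the knn graph needs.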
+ self.neighbors_ = numpy.argpartition(X, k-1)[:,0:k]
+
+ if self.input_type_ == 'neighbors':
+ self.neighbors_ = X
+ assert density_type == 'manual'
+
+ if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ in {'DTM', 'logDTM'}:
+ self.points_ = X
+ q = self.params_.get('p_DTM', 2)
+ p = self.params_.get('p', 2)
+ k = self.params_.get('k', 10)
+ k_DTM = self.params_.get('k_DTM', k)
+ kmax = max(k, k_DTM)
+ if self.params_.get('gpu'):
+ import torch
+ from pykeops.torch import LazyTensor
+ # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+ XX = torch.tensor(self.points_, dtype=torch.float32)
+ if p == numpy.inf:
+ # Requires a version of pykeops strictly more recent than 1.3
+ dd, nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin_argKmin(kmax, dim=1)
+ elif p == 2: # Any even integer?
+ dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin_argKmin(kmax, dim=1)
+ else:
+ dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin_argKmin(kmax, dim=1)
+ if k < kmax:
+ nn = nn[:, 0:k]
+ if k_DTM < kmax:
+ dd = dd[:, 0:k_DTM]
+ assert q != numpy.inf # for now
+ if p != numpy.inf:
+ qp = float(q) / p
+ else:
+ qp = q
+ if qp != 1:
+ dd = dd**qp
+ weights = dd.sum(-1)
+ # **1/q is a waste of time, whether we take another **-.25 or a log
+
+ # Back to the CPU. Not sure this is necessary, or the right way to do it.
+ weights = numpy.array(weights)
+ self.neighbors_ = numpy.array(nn)
+ else: # CPU
+ from scipy.spatial import cKDTree
+ kdtree = cKDTree(self.points_)
+ qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}}
+ dd, self.neighbors_ = kdtree.query(self.points_, k=kmax, p=p, **qargs)
+ if k < kmax:
+ self.neighbors_ = self.neighbors_[:, 0:k]
+ if k_DTM < kmax:
+ dd = dd[:, 0:k_DTM]
+ #weights = numpy.linalg.norm(dd, axis=1, ord=q)
+ weights = (dd**q).sum(-1)
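+ # Under the usual empirical definition DTM(x)^q = (1/k_DTM) * sum of dist(x, x_i)^q over
+ # the k_DTM nearest neighbors, weights now equals k_DTM * DTM(x)^q; the 1/q root is
+ # deliberately not taken (see the similar comment in the GPU branch above).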
+
+ # TODO: check the formula in Fred's paper
+ if self.density_type_ == 'DTM':
+ weights = weights ** (-.25 / q)
+ else:
+ weights = -numpy.log(weights)
+
+ if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ not in {'DTM', 'logDTM'}:
+ self.points_ = X
+ p = self.params_.get('p', 2)
+ k = self.params_.get('k', 10)
+ if self.params_.get('gpu'):
+ import torch
+ from pykeops.torch import LazyTensor
+ # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+ XX = torch.tensor(self.points_, dtype=torch.float32)
+ if p == numpy.inf:
+ # Requires a version of pykeops strictly more recent than 1.3
+ nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).argKmin(k, dim=1)
+ elif p == 2: # Any even integer?
+ nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).argKmin(k, dim=1)
+ else:
+ nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).argKmin(k, dim=1)
+ # Back to the CPU. Not sure this is necessary, or the right way to do it.
+ self.neighbors_ = numpy.array(nn)
+ else: # CPU
+ from scipy.spatial import cKDTree
+ kdtree = cKDTree(self.points_)
+ # FIXME 'p'
+ qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}}
+ _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs)
+
+ if self.input_type_ == 'points' and self.graph_type_ != 'knn' and self.density_type_ in {'DTM', 'logDTM'}:
+ self.points_ = X
+ q = self.params_.get('p_DTM', 2)
+ p = self.params_.get('p', 2)
+ k = self.params_.get('k', 10)
+ k_DTM = self.params_.get('k_DTM', k)
+ if self.params_.get('gpu'):
+ import torch
+ from pykeops.torch import LazyTensor
+ # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+ XX = torch.tensor(self.points_, dtype=torch.float32)
+ if p == numpy.inf:
+ assert False # Not supported???
+ dd = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin(k_DTM, dim=1)
+ elif p == 2: # Any even integer?
+ dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin(k_DTM, dim=1)
+ else:
+ dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin(k_DTM, dim=1)
+ assert q != numpy.inf # for now
+ if p != numpy.inf:
+ qp = float(q) / p
+ else:
+ qp = q
+ if qp != 1:
+ dd = dd**qp
+ weights = dd.sum(-1)
+ # **1/q is a waste of time, whether we take another **-.25 or a log
+
+ # Back to the CPU. Not sure this is necessary, or the right way to do it.
+ weights = numpy.array(weights)
+ else: # CPU
+ from scipy.spatial import cKDTree
+ kdtree = cKDTree(self.points_)
+ qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}}
+ dd, _ = kdtree.query(self.points_, k=k_DTM, p=p, **qargs)
+ #weights = numpy.linalg.norm(dd, axis=1, ord=q)
+ weights = (dd**q).sum(-1)
+
+ # TODO: check the formula in Fred's paper
+ if self.density_type_ == 'DTM':
+ weights = weights ** (-.25 / q)
+ else:
+ weights = -numpy.log(weights)
+
+ if self.input_type_ == 'distance_matrix' and self.density_type_ in {'DTM', 'logDTM'}:
+ q = self.params_.get('p_DTM', 2)
+ X = numpy.array(X)
+ k = self.params_.get('k_DTM')
+ if not k:
+ k = self.params_.get('k', 10)
+ weights = (numpy.partition(X, k-1)[:, 0:k]**q).sum(-1)
+ # TODO: check the formula in Fred's paper
+ if self.density_type_ == 'DTM':
+ weights = weights ** (-.25 / q)
+ else:
+ weights = -numpy.log(weights)
+
+ if self.density_type_ == 'kde':
+ # FIXME: replace most assert with raise ValueError("blabla")
+ assert self.input_type_ == 'points'
+ kde_params = self.params_.get('kde_params', dict())
+ from sklearn.neighbors import KernelDensity
+ weights = KernelDensity(**kde_params).fit(X).score_samples(X)
+
+ if self.params_.get('symmetrize_graph'):
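+ # Add reverse edges so the neighborhood relation becomes symmetric: if i lists j as a
+ # neighbor, i is also inserted into j's neighbor set (self-loops are dropped).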
+ self.neighbors_ = [set(line) for line in self.neighbors_]
+ for i,line in enumerate(self.neighbors_):
+ line.discard(i)
+ for j in line:
+ self.neighbors_[j].add(i)
+
+ self.weights_=weights # TODO remove
+ self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = doit(list(self.neighbors_), weights)
+ self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_)
+ assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_)
+ if self.__n_clusters:
+ renaming = merge(self.children_, self.n_leaves_, self.__n_clusters)
+ self.labels_ = renaming[self.leaf_labels_]
+ else:
+ self.labels_ = self.leaf_labels_
+ self.__n_clusters = self.n_leaves_
+
+ def fit_predict(self, X, y=None):
+ self.fit(X)
+ return self.labels_
+
+ # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k?
+ def plot_diagram(self):
+ import matplotlib.pyplot as plt
+ plt.plot(self.diagram_[:,0],self.diagram_[:,1],'ro')
+ l = self.diagram_[:,1].min()
+ r = max(self.diagram_[:,0].max(), self.max_density_per_cc_.max())
+ plt.plot([l,r],[l,r])
+ plt.plot(self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape,1.1*l-.1*r), 'o', color='green')
+ plt.show()
+
+# def predict(self, X):
+# # X had better be the same as in fit()
+# return labels_
+
+ # Use set_params instead?
+ @property
+ def n_clusters_(self):
+ return self.__n_clusters
+ @n_clusters_.setter
+ def n_clusters_(self, n_clusters):
+ if n_clusters == self.__n_clusters:
+ return
+ self.__n_clusters = n_clusters
+ self.__merge_threshold = None
+ if hasattr(self, 'leaf_labels_'):
+ renaming = merge(self.children_, self.n_leaves_, self.__n_clusters)
+ self.labels_ = renaming[self.leaf_labels_]
+
+ @property
+ def merge_threshold_(self):
+ return self.__merge_threshold
+ @merge_threshold_.setter
+ def merge_threshold_(self, merge_threshold):
+ if merge_threshold == self.__merge_threshold:
+ return
+ if hasattr(self, 'leaf_labels_'):
+ self.n_clusters_ = numpy.count_nonzero(self.diagram_[:,0] - self.diagram_[:,1] > merge_threshold) + len(self.max_density_per_cc_)
+ else:
+ self.__n_clusters = None
+ self.__merge_threshold = merge_threshold
+
+
+if __name__ == '__main__':
+ import sys
+ a=[(1,2),(1.1,1.9),(.9,1.8),(10,0),(10.1,.05),(10.2,-.1),(5.4,0)]
+ a=numpy.random.rand(500,2)
+ t=Tomato(input_type='points', metric='Euclidean', graph_type='knn', density_type='DTM', n_clusters=2, k=4, n_jobs=-1, eps=.05)
+ t.fit(a)
+ #print("neighbors\n",t.neighbors_)
+ #print()
+ #print("weights\n",t.weights_)
+ #print()
+ #print("diagram\n",t.diagram_)
+ #print()
+ print("max\n",t.max_density_per_cc_, file=sys.stderr)
+ #print()
+ print("leaf labels\n",t.leaf_labels_)
+ #print()
+ print("labels\n",t.labels_)
+ #print()
+ print("children\n",t.children_)
+ #print()
+ t.n_clusters_ = 2
+ print("labels\n",t.labels_)
+ t.plot_diagram()