author    Marc Glisse <marc.glisse@inria.fr>  2020-02-26 23:24:42 +0100
committer Marc Glisse <marc.glisse@inria.fr>  2020-02-26 23:24:42 +0100
commit    e5e0f9a9e96389eadc9e9c4bc493b88abcb6f89a (patch)
tree      44dda3c7a0cf6992d8caaed1cf2ee7559b06975d /src/python/gudhi/clustering
parent    53579deb2d551752b503b7b76ac04885ec354470 (diff)
formatting
Diffstat (limited to 'src/python/gudhi/clustering')
-rw-r--r--  src/python/gudhi/clustering/_tomato.cc | 247
-rw-r--r--  src/python/gudhi/clustering/tomato.py  | 262
2 files changed, 299 insertions(+), 210 deletions(-)
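
This commit is a pure reformatting pass over the two files below: no behavior change is intended, only whitespace and line wrapping. The message says just "formatting", so the exact tools are an assumption; the resulting C++ layout is consistent with clang-format and the Python layout with black at a 120-character line length. A hedged sketch of how such a pass might be reproduced:

    import subprocess

    # Hypothetical reproduction of this formatting pass; clang-format and black
    # (line length 120) are inferred from the resulting layout, not named
    # anywhere in the commit.
    subprocess.run(["clang-format", "-i", "src/python/gudhi/clustering/_tomato.cc"], check=True)
    subprocess.run(["black", "--line-length", "120", "src/python/gudhi/clustering/tomato.py"], check=True)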
diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
index 746c4254..87bd62e9 100644
--- a/src/python/gudhi/clustering/_tomato.cc
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -14,100 +14,117 @@
#include <pybind11/numpy.h>
#include <iostream>
-namespace py=pybind11;
+namespace py = pybind11;
-template<class T, class=std::enable_if_t<std::is_integral<T>::value>>
-int getint(int i){return i;}
+template <class T, class = std::enable_if_t<std::is_integral<T>::value>>
+int getint(int i) {
+ return i;
+}
// Gcc-8 has a bug that breaks this version, fixed in gcc-9
// template<class T, class=decltype(std::declval<T>().template cast<int>())>
// int getint(T i){return i.template cast<int>();}
-template<class T>
-auto getint(T i)->decltype(i.template cast<int>()){return i.template cast<int>();}
+template <class T>
+auto getint(T i) -> decltype(i.template cast<int>()) {
+ return i.template cast<int>();
+}
// Raw clusters are clusters obtained through single-linkage, no merging.
typedef int Point_index;
typedef int Cluster_index;
-struct Merge { Cluster_index first, second; double persist; };
+struct Merge {
+ Cluster_index first, second;
+ double persist;
+};
-template<class Neighbors, class Density, class Order, class ROrder>
-auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& density, Order const& order, ROrder const& rorder){
+template <class Neighbors, class Density, class Order, class ROrder>
+auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
+ ROrder const& rorder) {
// point index --> index of raw cluster it belongs to
- std::vector<Cluster_index> raw_cluster; raw_cluster.reserve(num_points);
+ std::vector<Cluster_index> raw_cluster;
+ raw_cluster.reserve(num_points);
// cluster index --> index of top point in the cluster
- Cluster_index n_raw_clusters = 0; // current number of raw clusters seen
+ Cluster_index n_raw_clusters = 0; // current number of raw clusters seen
//
std::vector<Merge> merges;
- struct Data { Cluster_index parent; int rank; Point_index max; }; // information on a cluster
+ struct Data {
+ Cluster_index parent;
+ int rank;
+ Point_index max;
+ }; // information on a cluster
std::vector<Data> ds_base;
// boost::vector_property_map does resize(size+1) for every new element, don't use it
- auto ds_data=boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n)->Data&{return ds_base[n];});
- auto ds_parent=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_data);
- auto ds_rank=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_data);
- boost::disjoint_sets<decltype(ds_rank),decltype(ds_parent)> ds(ds_rank,ds_parent); // on the clusters, not directly the points
- std::vector<std::array<double,2>> persistence; // diagram (finite points)
- boost::container::flat_map<Cluster_index,Cluster_index> adj_clusters; // first: the merged cluster, second: the raw cluster
- // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only insert in the vector if merged is absent from the set
+ auto ds_data =
+ boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n) -> Data& { return ds_base[n]; });
+ auto ds_parent =
+ boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_data);
+ auto ds_rank = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_data);
+ boost::disjoint_sets<decltype(ds_rank), decltype(ds_parent)> ds(
+ ds_rank, ds_parent); // on the clusters, not directly the points
+ std::vector<std::array<double, 2>> persistence; // diagram (finite points)
+ boost::container::flat_map<Cluster_index, Cluster_index>
+ adj_clusters; // first: the merged cluster, second: the raw cluster
+ // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only
+ // insert in the vector if merged is absent from the set
- for(Point_index i = 0; i < num_points; ++i){
+ for (Point_index i = 0; i < num_points; ++i) {
// auto&& ngb = neighbors[order[i]];
// TODO: have a specialization where we don't need python types and py::cast
// TODO: move py::cast and getint into Neighbors
- py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work
+ py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work
adj_clusters.clear();
- Point_index j = i; // highest neighbor
- //for(Point_index k : ngb)
- for(auto k_py : ngb)
- {
+ Point_index j = i; // highest neighbor
+ // for(Point_index k : ngb)
+ for (auto k_py : ngb) {
Point_index k = rorder[getint(k_py)];
- if(k >= i || k < 0) // ???
- continue;
- if(k < j)
- j = k;
- Cluster_index rk=raw_cluster[k];
- adj_clusters.emplace(ds.find_set(rk),rk);
+ if (k >= i || k < 0) // ???
+ continue;
+ if (k < j) j = k;
+ Cluster_index rk = raw_cluster[k];
+ adj_clusters.emplace(ds.find_set(rk), rk);
// does not insert if ck=ds.find_set(rk) already seen
// which rk we keep from those with the same ck is arbitrary
}
- assert((Point_index)raw_cluster.size()==i);
- if(i==j){ // local maximum -> new cluster
+ assert((Point_index)raw_cluster.size() == i);
+ if (i == j) { // local maximum -> new cluster
Cluster_index c = n_raw_clusters++;
- ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access
+ ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access
ds.make_set(c);
- ds_base[c].max=i; // max
+ ds_base[c].max = i; // max
raw_cluster.push_back(c);
continue;
- }else{ // add i to the cluster of j
- assert(j<i);
+ } else { // add i to the cluster of j
+ assert(j < i);
raw_cluster.push_back(raw_cluster[j]);
- // FIXME: we are adding point i to the raw cluster of j, but that one might not be in adj_clusters, so we may merge clusters A and B through a point of C. It is strange, but I don't know if it can really cause problems. We could just not set j at all and use arbitrarily the first element of adj_clusters.
+ // FIXME: we are adding point i to the raw cluster of j, but that one might not be in adj_clusters, so we may
+ // merge clusters A and B through a point of C. It is strange, but I don't know if it can really cause problems.
+ // We could just not set j at all and use arbitrarily the first element of adj_clusters.
}
// possibly merge clusters
// we could sort, in case there are several merges, but that doesn't seem so useful
Cluster_index rj = raw_cluster[j];
Cluster_index cj = ds.find_set(rj);
Cluster_index orig_cj = cj;
- for(auto ckk : adj_clusters){
+ for (auto ckk : adj_clusters) {
Cluster_index rk = ckk.second;
Cluster_index ck = ckk.first;
- if(ck==orig_cj)
- continue;
- assert(ck==ds.find_set(rk));
+ if (ck == orig_cj) continue;
+ assert(ck == ds.find_set(rk));
Point_index j = ds_base[cj].max;
Point_index k = ds_base[ck].max;
- Point_index young = std::max(j,k);
- Point_index old = std::min(j,k);
+ Point_index young = std::max(j, k);
+ Point_index old = std::min(j, k);
auto d_young = density[order[young]];
auto d_i = density[order[i]];
assert(d_young >= d_i);
// Always merge (the non-hierarchical algorithm would only conditionally merge here)
persistence.push_back({d_young, d_i});
assert(ds.find_set(rj) != ds.find_set(rk));
- ds.link(cj,ck);
+ ds.link(cj, ck);
cj = ds.find_set(cj);
- ds_base[cj].max=old; // just one parent, no need for find_set
+ ds_base[cj].max = old; // just one parent, no need for find_set
// record the raw clusters, we don't know what will have already been merged.
- merges.push_back({rj, rk, d_young-d_i});
+ merges.push_back({rj, rk, d_young - d_i});
}
}
{
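
The loop above is the core ToMATo pass: points are visited in decreasing order of density, each point is attached to its highest-density neighbor (single linkage), and whenever a point's neighborhood touches several existing clusters they are merged, recording the persistence pair (d_young, d_i). A minimal pure-Python sketch of that pass, assuming points come pre-sorted by decreasing density (the C++ handles arbitrary input order via order/rorder) and using a simplified union-find without the rank heuristic:

    def tomato_pass(neighbors, density):
        # Sketch of the main loop of tomato() in _tomato.cc. Assumes density is
        # non-increasing, i.e. index == rank; the real code permutes via order/rorder.
        parent = []     # union-find parent, one entry per raw cluster
        max_point = []  # index of the highest-density point of each cluster
        def find(c):
            while parent[c] != c:
                parent[c] = parent[parent[c]]  # path halving
                c = parent[c]
            return c
        raw_cluster, merges, persistence = [], [], []
        for i in range(len(density)):
            nbs = [k for k in neighbors[i] if 0 <= k < i]  # already-seen (denser) neighbors
            if not nbs:  # local maximum of the density: start a new raw cluster
                parent.append(len(parent))
                max_point.append(i)
                raw_cluster.append(parent[-1])
                continue
            raw_cluster.append(raw_cluster[min(nbs)])  # climb to the highest neighbor
            cj = find(raw_cluster[-1])
            for ck in {find(raw_cluster[k]) for k in nbs} - {cj}:
                young = max(max_point[cj], max_point[ck])  # lower of the two peaks
                persistence.append((density[young], density[i]))  # finite diagram point
                merges.append((cj, ck, density[young] - density[i]))  # prominence
                max_point[cj] = min(max_point[cj], max_point[ck])
                parent[ck] = cj  # simplified link; the C++ uses union by rank
        return raw_cluster, merges, persistence

The C++ additionally records the raw clusters (not their current roots) in merges, and reorders raw_cluster back to the input order before returning.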
@@ -118,121 +135,137 @@ auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& de
}
// Maximum for each connected component
std::vector<double> max_cc;
- //for(Point_index i = 0; i < num_points; ++i){
+ // for(Point_index i = 0; i < num_points; ++i){
// if(ds_base[ds_base[raw_cluster[i]].parent].max == i)
// max_cc.push_back(density[order[i]]);
//}
- for(Cluster_index i = 0; i < n_raw_clusters; ++i){
- if(ds_base[i].parent == i)
- max_cc.push_back(density[order[ds_base[i].max]]);
+ for (Cluster_index i = 0; i < n_raw_clusters; ++i) {
+ if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]);
}
- assert((Cluster_index)(merges.size()+max_cc.size())==n_raw_clusters);
+ assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters);
// TODO: create a "noise" cluster, merging all those not prominent enough?
- //int nb_clusters=0;
- //for(int i=0;i<(int)ds_base.size();++i){
+ // int nb_clusters=0;
+ // for(int i=0;i<(int)ds_base.size();++i){
// if(ds_parent[i]!=i) continue;
// ds_data[i].rank=nb_clusters++;
//}
// Replay the merges, in increasing order of prominence, to build the hierarchy
- std::sort(merges.begin(), merges.end(), [](Merge const&a, Merge const&b){return a.persist < b.persist;});
- std::vector<std::array<Cluster_index,2>> children; children.reserve(merges.size());
+ std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; });
+ std::vector<std::array<Cluster_index, 2>> children;
+ children.reserve(merges.size());
{
- struct Dat { Cluster_index parent; int rank; Cluster_index name; };
- std::vector<Dat> ds_bas(2*n_raw_clusters-1);
+ struct Dat {
+ Cluster_index parent;
+ int rank;
+ Cluster_index name;
+ };
+ std::vector<Dat> ds_bas(2 * n_raw_clusters - 1);
Cluster_index i;
- auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
- auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
- auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
- boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
- for(i=0;i<n_raw_clusters;++i) { ds.make_set(i); ds_bas[i].name=i; }
- for(Merge const& m : merges){
+ auto ds_dat =
+ boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+ auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+ auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+ boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
+ for (i = 0; i < n_raw_clusters; ++i) {
+ ds.make_set(i);
+ ds_bas[i].name = i;
+ }
+ for (Merge const& m : merges) {
Cluster_index j = ds.find_set(m.first);
Cluster_index k = ds.find_set(m.second);
- assert(j!=k);
- children.push_back({ds_bas[j].name,ds_bas[k].name});
+ assert(j != k);
+ children.push_back({ds_bas[j].name, ds_bas[k].name});
ds.make_set(i);
- ds.link(i,j);
- ds.link(i,k);
- ds_bas[ds.find_set(i)].name=i;
+ ds.link(i, j);
+ ds.link(i, k);
+ ds_bas[ds.find_set(i)].name = i;
++i;
}
}
std::vector<Cluster_index> raw_cluster_ordered(num_points);
- for(int i=0; i<num_points; ++i) raw_cluster_ordered[i]=raw_cluster[rorder[i]];
+ for (int i = 0; i < num_points; ++i) raw_cluster_ordered[i] = raw_cluster[rorder[i]];
// return raw_cluster, children, persistence
// TODO avoid copies: https://github.com/pybind/pybind11/issues/1042
- return py::make_tuple(
- py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()),
- py::array(children.size(), children.data()),
- py::array(persistence.size(), persistence.data()),
- py::array(max_cc.size(), max_cc.data()));
+ return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()),
+ py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()),
+ py::array(max_cc.size(), max_cc.data()));
}
-auto merge(py::array_t<Cluster_index, py::array::c_style> children, Cluster_index n_leaves, Cluster_index n_final){
+auto merge(py::array_t<Cluster_index, py::array::c_style> children, Cluster_index n_leaves, Cluster_index n_final) {
// Should this really be an error?
- if(n_final > n_leaves)
+ if (n_final > n_leaves)
throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters");
py::buffer_info cbuf = children.request();
- if((cbuf.ndim!=2 || cbuf.shape[1]!=2) && (cbuf.ndim!=1 || cbuf.shape[0]!=0))
+ if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0))
throw std::runtime_error("internal error: children have to be (n,2) or empty");
- const int n_merges=cbuf.shape[0];
- Cluster_index*d=(Cluster_index*)cbuf.ptr;
+ const int n_merges = cbuf.shape[0];
+ Cluster_index* d = (Cluster_index*)cbuf.ptr;
// Should this really be an error?
- //std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n';
- if(n_merges + n_final < n_leaves)
- throw std::runtime_error(std::string("The number of clusters required ")+std::to_string(n_final)+" is smaller than the number of connected components "+std::to_string(n_leaves-n_merges));
+ // std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n';
+ if (n_merges + n_final < n_leaves)
+ throw std::runtime_error(std::string("The number of clusters required ") + std::to_string(n_final) +
+ " is smaller than the number of connected components " +
+ std::to_string(n_leaves - n_merges));
- struct Dat { Cluster_index parent; int rank; int name; };
- std::vector<Dat> ds_bas(2*n_leaves-1);
- auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
- auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
- auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
- boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
+ struct Dat {
+ Cluster_index parent;
+ int rank;
+ int name;
+ };
+ std::vector<Dat> ds_bas(2 * n_leaves - 1);
+ auto ds_dat = boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+ auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+ auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+ boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
Cluster_index i;
- for(i=0;i<n_leaves;++i) { ds.make_set(i); ds_bas[i].name=-1; }
- for(Cluster_index m=0; m < n_leaves-n_final; ++m){
- Cluster_index j = ds.find_set(d[2*m]);
- Cluster_index k = ds.find_set(d[2*m+1]);
- assert(j!=k);
+ for (i = 0; i < n_leaves; ++i) {
ds.make_set(i);
- ds.link(i,j);
- ds.link(i,k);
+ ds_bas[i].name = -1;
+ }
+ for (Cluster_index m = 0; m < n_leaves - n_final; ++m) {
+ Cluster_index j = ds.find_set(d[2 * m]);
+ Cluster_index k = ds.find_set(d[2 * m + 1]);
+ assert(j != k);
+ ds.make_set(i);
+ ds.link(i, j);
+ ds.link(i, k);
++i;
}
Cluster_index next_cluster_name = 0;
- std::vector<Cluster_index> ret; ret.reserve(n_leaves);
- for(Cluster_index j=0;j<n_leaves;++j){
+ std::vector<Cluster_index> ret;
+ ret.reserve(n_leaves);
+ for (Cluster_index j = 0; j < n_leaves; ++j) {
Cluster_index k = ds.find_set(j);
- if(ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
+ if (ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
ret.push_back(ds_bas[k].name);
}
return py::array(ret.size(), ret.data());
}
// Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
-auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density){
+auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
// used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
// use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to []
// (I _think_ handle is ok and we don't need object here)
py::buffer_info wbuf = density.request();
- if(wbuf.ndim!=1)
- throw std::runtime_error("density must be 1D");
- const int n=wbuf.shape[0];
- double*d=(double*)wbuf.ptr;
+ if (wbuf.ndim != 1) throw std::runtime_error("density must be 1D");
+ const int n = wbuf.shape[0];
+ double* d = (double*)wbuf.ptr;
// Vector { 0, 1, ..., n-1 }
std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n));
// Permutation of the indices to get points in decreasing order of density
- std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j){ return d[i] > d[j]; });
+ std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j) { return d[i] > d[j]; });
// Inverse permutation
std::vector<Point_index> rorder(n);
- for(Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
+ for (Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
// Used as:
// order[i] is the index of the point with i-th largest density
// rorder[i] is the rank of the i-th point in order of decreasing density
- // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the output)
+ // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the
+ // output)
return tomato(n, ngb, d, order, rorder);
}
#if 0
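
plouf() builds two permutations before calling tomato(): order, sorting indices by decreasing density, and its inverse rorder. The same pair can be sketched in NumPy terms, with a stand-in density array:

    import numpy as np

    d = np.random.rand(7)              # stand-in densities
    order = np.argsort(-d)             # order[i]: index of the point with i-th largest density
    rorder = np.empty_like(order)
    rorder[order] = np.arange(len(d))  # rorder[j]: rank of point j by decreasing density
    assert (order[rorder] == np.arange(len(d))).all()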
@@ -263,7 +296,7 @@ auto plaf(py::array_t<int, py::array::c_style | py::array::forcecast> ngb, py::a
PYBIND11_MODULE(_tomato, m) {
m.doc() = "Internals of tomato clustering";
m.def("doit", &plouf, "does the clustering");
- //m.def("doit2", &plaf, "does the clustering faster");
+ // m.def("doit2", &plaf, "does the clustering faster");
m.def("merge", &merge, "merge clusters");
}
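
merge(children, n_leaves, n_final) replays the n_leaves - n_final least prominent merges of the dendrogram (children comes sorted by increasing prominence from tomato()) and then assigns compact labels to the surviving sets. A rough Python sketch of the same replay:

    def replay_merges(children, n_leaves, n_final):
        # Sketch of merge() in _tomato.cc. children is a list of (a, b) pairs;
        # internal dendrogram nodes are numbered n_leaves, n_leaves + 1, ...
        parent = list(range(2 * n_leaves - 1))
        def find(c):
            while parent[c] != c:
                parent[c] = parent[parent[c]]
                c = parent[c]
            return c
        nxt = n_leaves
        for a, b in children[: n_leaves - n_final]:
            parent[find(a)] = parent[find(b)] = nxt  # merge both sides into a new node
            nxt += 1
        names = {}  # root -> compact label, in order of first appearance
        return [names.setdefault(find(j), len(names)) for j in range(n_leaves)]

tomato.py uses the result as renaming[self.leaf_labels_] to relabel every point.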
diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
index 467afe0e..19d6600f 100644
--- a/src/python/gudhi/clustering/tomato.py
+++ b/src/python/gudhi/clustering/tomato.py
@@ -29,7 +29,18 @@ class Tomato:
params_: dict
Parameters like input_type, metric, etc
"""
- def __init__(self, input_type='points', metric=None, graph_type=None, density_type='manual', n_clusters=None, merge_threshold=None, eliminate_threshold=None, **params):
+
+ def __init__(
+ self,
+ input_type="points",
+ metric=None,
+ graph_type=None,
+ density_type="manual",
+ n_clusters=None,
+ merge_threshold=None,
+ eliminate_threshold=None,
+ **params
+ ):
"""
Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later.
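
A short usage sketch of what this docstring describes, with a stand-in point cloud X; changing these attributes after fit() only replays merges, it does not recompute the neighborhood graph or the density:

    t = Tomato(graph_type="knn", density_type="DTM", k=10)
    t.fit(X)                  # X: any (n, d) point cloud, assumed here
    t.n_clusters_ = 3         # cheap: replays merges and relabels the points
    t.merge_threshold_ = 0.5  # alternative: cut the dendrogram at a prominence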
@@ -72,179 +83,207 @@ class Tomato:
# TODO: First detect if this is a new call with the same data (only threshold changed?)
# TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters.
if weights:
- density_type = 'manual'
+ density_type = "manual"
else:
density_type = self.density_type_
- assert density_type != 'manual'
- if density_type == 'manual':
+ assert density_type != "manual"
+ if density_type == "manual":
raise ValueError("If density_type is 'manual', you must provide weights to fit()")
- if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'radius':
+ if self.input_type_ == "distance_matrix" and self.graph_type_ == "radius":
X = numpy.array(X)
- r = self.params_['r']
+ r = self.params_["r"]
self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X]  # flatnonzero: an index array, not a tuple
- if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'knn':
- k = self.params_['k']
- self.neighbors_ = numpy.argpartition(X, k-1)[:,0:k]
+ if self.input_type_ == "distance_matrix" and self.graph_type_ == "knn":
+ k = self.params_["k"]
+ self.neighbors_ = numpy.argpartition(X, k - 1)[:, 0:k]
- if self.input_type_ == 'neighbors':
+ if self.input_type_ == "neighbors":
self.neighbors_ = X
- assert density_type == 'manual'
+ assert density_type == "manual"
- if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ in {'DTM', 'logDTM'}:
+ if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}:
self.points_ = X
- q = self.params_.get('p_DTM', 2)
- p = self.params_.get('p', 2)
- k = self.params_.get('k', 10)
- k_DTM = self.params_.get('k_DTM', k)
+ q = self.params_.get("p_DTM", 2)
+ p = self.params_.get("p", 2)
+ k = self.params_.get("k", 10)
+ k_DTM = self.params_.get("k_DTM", k)
kmax = max(k, k_DTM)
- if self.params_.get('gpu'):
+ if self.params_.get("gpu"):
import torch
from pykeops.torch import LazyTensor
+
# 'float64' is slow except on super expensive GPUs. Allow it with some param?
XX = torch.tensor(self.points_, dtype=torch.float32)
if p == numpy.inf:
# Requires a version of pykeops strictly more recent than 1.3
- dd, nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin_argKmin(kmax, dim=1)
- elif p == 2: # Any even integer?
- dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin_argKmin(kmax, dim=1)
+ dd, nn = (
+ (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :]))
+ .abs()
+ .max(-1)
+ .Kmin_argKmin(kmax, dim=1)
+ )
+ elif p == 2: # Any even integer?
+ dd, nn = (
+ ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p)
+ .sum(-1)
+ .Kmin_argKmin(kmax, dim=1)
+ )
else:
- dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin_argKmin(kmax, dim=1)
+ dd, nn = (
+ ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p)
+ .sum(-1)
+ .Kmin_argKmin(kmax, dim=1)
+ )
if k < kmax:
nn = nn[:, 0:k]
if k_DTM < kmax:
dd = dd[:, 0:k_DTM]
- assert q != numpy.inf # for now
+ assert q != numpy.inf # for now
if p != numpy.inf:
qp = float(q) / p
else:
qp = q
if qp != 1:
- dd = dd**qp
+ dd = dd ** qp
weights = dd.sum(-1)
# **1/q is a waste of time, whether we take another **-.25 or a log
# Back to the CPU. Not sure this is necessary, or the right way to do it.
weights = numpy.array(weights)
self.neighbors_ = numpy.array(nn)
- else: # CPU
+ else: # CPU
from scipy.spatial import cKDTree
+
kdtree = cKDTree(self.points_)
- qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}}
+ qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}}
dd, self.neighbors_ = kdtree.query(self.points_, k=kmax, p=p, **qargs)
if k < kmax:
self.neighbors_ = self.neighbors_[:, 0:k]
if k_DTM < kmax:
dd = dd[:, 0:k_DTM]
- #weights = numpy.linalg.norm(dd, axis=1, ord=q)
- weights = (dd**q).sum(-1)
+ # weights = numpy.linalg.norm(dd, axis=1, ord=q)
+ weights = (dd ** q).sum(-1)
# TODO: check the formula in Fred's paper
- if self.density_type_ == 'DTM':
- weights = weights ** (-.25 / q)
+ if self.density_type_ == "DTM":
+ weights = weights ** (-0.25 / q)
else:
weights = -numpy.log(weights)
- if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ not in {'DTM', 'logDTM'}:
+ if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}:
self.points_ = X
- p = self.params_.get('p', 2)
- k = self.params_.get('k', 10)
- if self.params_.get('gpu'):
+ p = self.params_.get("p", 2)
+ k = self.params_.get("k", 10)
+ if self.params_.get("gpu"):
import torch
from pykeops.torch import LazyTensor
+
# 'float64' is slow except on super expensive GPUs. Allow it with some param?
XX = torch.tensor(self.points_, dtype=torch.float32)
if p == numpy.inf:
# Requires a version of pykeops strictly more recent than 1.3
- nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).argKmin(k, dim=1)
- elif p == 2: # Any even integer?
- nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).argKmin(k, dim=1)
+ nn = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).argKmin(k, dim=1)
+ elif p == 2: # Any even integer?
+ nn = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).argKmin(k, dim=1)
else:
- nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).argKmin(k, dim=1)
+ nn = (
+ ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p).sum(-1).argKmin(k, dim=1)
+ )
# Back to the CPU. Not sure this is necessary, or the right way to do it.
self.neighbors_ = numpy.array(nn)
- else: # CPU
+ else: # CPU
from scipy.spatial import cKDTree
+
kdtree = cKDTree(self.points_)
# FIXME 'p'
- qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}}
+ qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}}
_, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs)
- if self.input_type_ == 'points' and self.graph_type_ != 'knn' and self.density_type_ in {'DTM', 'logDTM'}:
+ if self.input_type_ == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}:
self.points_ = X
- q = self.params_.get('p_DTM', 2)
- p = self.params_.get('p', 2)
- k = self.params_.get('k', 10)
- k_DTM = self.params_.get('k_DTM', k)
- if self.params_.get('gpu'):
+ q = self.params_.get("p_DTM", 2)
+ p = self.params_.get("p", 2)
+ k = self.params_.get("k", 10)
+ k_DTM = self.params_.get("k_DTM", k)
+ if self.params_.get("gpu"):
import torch
from pykeops.torch import LazyTensor
+
# 'float64' is slow except on super expensive GPUs. Allow it with some param?
XX = torch.tensor(self.points_, dtype=torch.float32)
if p == numpy.inf:
- assert False # Not supported???
- dd = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin(k_DTM, dim=1)
- elif p == 2: # Any even integer?
- dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin(k_DTM, dim=1)
+ assert False # Not supported???
+ dd = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).Kmin(k_DTM, dim=1)
+ elif p == 2: # Any even integer?
+ dd = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).Kmin(k_DTM, dim=1)
else:
- dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin(k_DTM, dim=1)
- assert q != numpy.inf # for now
+ dd = (
+ ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p)
+ .sum(-1)
+ .Kmin(k_DTM, dim=1)
+ )
+ assert q != numpy.inf # for now
if p != numpy.inf:
qp = float(q) / p
else:
qp = q
if qp != 1:
- dd = dd**qp
+ dd = dd ** qp
weights = dd.sum(-1)
# **1/q is a waste of time, whether we take another **-.25 or a log
# Back to the CPU. Not sure this is necessary, or the right way to do it.
weights = numpy.array(weights)
- else: # CPU
+ else: # CPU
from scipy.spatial import cKDTree
+
kdtree = cKDTree(self.points_)
- qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}}
+ qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}}
dd, _ = kdtree.query(self.points_, k=k_DTM, p=p, **qargs)
- #weights = numpy.linalg.norm(dd, axis=1, ord=q)
- weights = (dd**q).sum(-1)
+ # weights = numpy.linalg.norm(dd, axis=1, ord=q)
+ weights = (dd ** q).sum(-1)
# TODO: check the formula in Fred's paper
- if self.density_type_ == 'DTM':
- weights = weights ** (-.25 / q)
+ if self.density_type_ == "DTM":
+ weights = weights ** (-0.25 / q)
else:
weights = -numpy.log(weights)
- if self.input_type_ == 'distance_matrix' and self.density_type_ in {'DTM', 'logDTM'}:
- q = self.params_.get('p_DTM', 2)
+ if self.input_type_ == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}:
+ q = self.params_.get("p_DTM", 2)
X = numpy.array(X)
- k = self.params_.get('k_DTM')
+ k = self.params_.get("k_DTM")
if not k:
- k = self.params_['k']
- q = self.params_.get('p_DTM', 2)
- weights = (numpy.partition(X)**q, k-1).sum(-1)
+ k = self.params_["k"]
+ q = self.params_.get("p_DTM", 2)
+ weights = (numpy.partition(X, k - 1)[:, 0:k] ** q).sum(-1)  # sum over the k smallest distances
# TODO: check the formula in Fred's paper
- if self.density_type_ == 'DTM':
- weights = weights ** (-.25 / q)
+ if self.density_type_ == "DTM":
+ weights = weights ** (-0.25 / q)
else:
weights = -numpy.log(weights)
- if self.density_type_ == 'kde':
+ if self.density_type_ == "kde":
# FIXME: replace most assert with raise ValueError("blabla")
- assert self.input_type_ == 'points'
- kde_params = self.params_.get('kde_params', dict())
+ assert self.input_type_ == "points"
+ kde_params = self.params_.get("kde_params", dict())
from sklearn.neighbors import KernelDensity
+
weights = KernelDensity(**kde_params).fit(X).score_samples(X)
- if self.params_.get('symmetrize_graph'):
+ if self.params_.get("symmetrize_graph"):
self.neighbors_ = [set(line) for line in self.neighbors_]
- for i,line in enumerate(self.neighbors_):
+ for i, line in enumerate(self.neighbors_):
line.discard(i)
for j in line:
self.neighbors_[j].add(i)
- self.weights_=weights # TODO remove
- self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit(list(self.neighbors_), weights)
+ self.weights_ = weights # TODO remove
+ self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit(
+ list(self.neighbors_), weights
+ )
self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_)
assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_)
if self.__n_clusters:
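
All the DTM branches above compute the same estimate: sum the q-th powers of the distances to the k_DTM nearest neighbors, then turn the sum into a density via **(-0.25/q) ("DTM") or -log ("logDTM"). A CPU-only sketch of that estimate, mirroring the cKDTree branch (the exponent is the one used above, still flagged TODO against Fred's paper):

    import numpy
    from scipy.spatial import cKDTree

    def dtm_density(points, k_DTM=10, q=2, log=False):
        # Sketch of the DTM-style density used above (CPU branch).
        dd, _ = cKDTree(points).query(points, k=k_DTM)  # distances to k_DTM nearest neighbors
        weights = (dd ** q).sum(-1)                     # sum of q-th powers of the distances
        return -numpy.log(weights) if log else weights ** (-0.25 / q)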
@@ -265,65 +304,82 @@ class Tomato:
"""
"""
import matplotlib.pyplot as plt
- plt.plot(self.diagram_[:,0],self.diagram_[:,1],'ro')
- l = self.diagram_[:,1].min()
- r = max(self.diagram_[:,0].max(), self.max_density_per_cc_.max())
- plt.plot([l,r],[l,r])
- plt.plot(self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape,1.1*l-.1*r),'ro',color='green')
+
+ plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro")
+ l = self.diagram_[:, 1].min()
+ r = max(self.diagram_[:, 0].max(), self.max_density_per_cc_.max())
+ plt.plot([l, r], [l, r])
+ plt.plot(
+ self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green"
+ )
plt.show()
-# def predict(self, X):
-# # X had better be the same as in fit()
-# return labels_
+ # def predict(self, X):
+ # # X had better be the same as in fit()
+ # return labels_
# Use set_params instead?
@property
def n_clusters_(self):
return self.__n_clusters
+
@n_clusters_.setter
def n_clusters_(self, n_clusters):
if n_clusters == self.__n_clusters:
return
self.__n_clusters = n_clusters
self.__merge_threshold = None
- if hasattr(self, 'leaf_labels_'):
+ if hasattr(self, "leaf_labels_"):
renaming = tomato.merge(self.children_, self.n_leaves_, self.__n_clusters)
self.labels_ = renaming[self.leaf_labels_]
@property
def merge_threshold_(self):
return self.__merge_threshold
+
@merge_threshold_.setter
def merge_threshold_(self, merge_threshold):
if merge_threshold == self.__merge_threshold:
return
- if hasattr(self, 'leaf_labels_'):
- self.n_clusters_ = numpy.count_nonzero(self.diagram_[1]-self.diagram_[0] > merge_threshold) + len(max_density_per_cc_)
+ if hasattr(self, "leaf_labels_"):
+ self.n_clusters_ = numpy.count_nonzero(
+ self.diagram_[:, 0] - self.diagram_[:, 1] > merge_threshold
+ ) + len(self.max_density_per_cc_)
else:
self.__n_clusters = None
self.__merge_threshold = merge_threshold
-if __name__ == '__main__':
+if __name__ == "__main__":
import sys
- a=[(1,2),(1.1,1.9),(.9,1.8),(10,0),(10.1,.05),(10.2,-.1),(5.4,0)]
- a=numpy.random.rand(500,2)
- t=Tomato(input_type='points', metric='Euclidean', graph_type='knn', density_type='DTM', n_clusters=2, k=4, n_jobs=-1, eps=.05)
+
+ a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)]
+ a = numpy.random.rand(500, 2)
+ t = Tomato(
+ input_type="points",
+ metric="Euclidean",
+ graph_type="knn",
+ density_type="DTM",
+ n_clusters=2,
+ k=4,
+ n_jobs=-1,
+ eps=0.05,
+ )
t.fit(a)
- #print("neighbors\n",t.neighbors_)
- #print()
- #print("weights\n",t.weights_)
- #print()
- #print("diagram\n",t.diagram_)
- #print()
- print("max\n",t.max_density_per_cc_, file=sys.stderr)
- #print()
- print("leaf labels\n",t.leaf_labels_)
- #print()
- print("labels\n",t.labels_)
- #print()
- print("children\n",t.children_)
- #print()
+ # print("neighbors\n",t.neighbors_)
+ # print()
+ # print("weights\n",t.weights_)
+ # print()
+ # print("diagram\n",t.diagram_)
+ # print()
+ print("max\n", t.max_density_per_cc_, file=sys.stderr)
+ # print()
+ print("leaf labels\n", t.leaf_labels_)
+ # print()
+ print("labels\n", t.labels_)
+ # print()
+ print("children\n", t.children_)
+ # print()
t.n_clusters_ = 2
- print("labels\n",t.labels_)
+ print("labels\n", t.labels_)
t.plot_diagram()
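
The demo prints the fitted attributes and draws the persistence diagram; the gap between diagram points suggests how many clusters to keep. A hedged heuristic on top of the fitted attributes, assuming at least two finite diagram points (the gap rule itself is not something tomato.py implements):

    import numpy

    prom = numpy.sort(t.diagram_[:, 0] - t.diagram_[:, 1])[::-1]  # merge prominences, decreasing
    gap = int(numpy.argmax(prom[:-1] - prom[1:]))                 # position of the largest drop
    t.n_clusters_ = gap + 1 + len(t.max_density_per_cc_)          # keep merges above the gap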