13 files changed, 820 insertions, 1 deletions
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 976a8b52..02fe6c50 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -36,6 +36,7 @@ if(PYTHONINTERP_FOUND)
     add_gudhi_debug_info("Pybind11 version ${PYBIND11_VERSION}")
     set(GUDHI_PYTHON_MODULES "${GUDHI_PYTHON_MODULES}'bottleneck', ")
     set(GUDHI_PYTHON_MODULES_EXTRA "${GUDHI_PYTHON_MODULES_EXTRA}'hera', ")
+    set(GUDHI_PYTHON_MODULES_EXTRA "${GUDHI_PYTHON_MODULES_EXTRA}'clustering', ")
   endif()
   if(CYTHON_FOUND)
     set(GUDHI_PYTHON_MODULES "${GUDHI_PYTHON_MODULES}'off_reader', ")
@@ -129,6 +130,7 @@ if(PYTHONINTERP_FOUND)
     set(GUDHI_CYTHON_MODULES "${GUDHI_CYTHON_MODULES}'witness_complex', ")
     set(GUDHI_CYTHON_MODULES "${GUDHI_CYTHON_MODULES}'strong_witness_complex', ")
     set(GUDHI_PYBIND11_MODULES "${GUDHI_PYBIND11_MODULES}'hera', ")
+    set(GUDHI_PYBIND11_MODULES "${GUDHI_PYBIND11_MODULES}'clustering/_tomato', ")
     if (NOT CGAL_VERSION VERSION_LESS 4.11.0)
       set(GUDHI_PYBIND11_MODULES "${GUDHI_PYBIND11_MODULES}'bottleneck', ")
       set(GUDHI_CYTHON_MODULES "${GUDHI_CYTHON_MODULES}'nerve_gic', ")
@@ -232,6 +234,7 @@ if(PYTHONINTERP_FOUND)
     file(COPY "gudhi/representations" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/gudhi/")
     file(COPY "gudhi/wasserstein" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/gudhi")
     file(COPY "gudhi/point_cloud" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/gudhi")
+    file(COPY "gudhi/clustering" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/gudhi" FILES_MATCHING PATTERN "*.py")
 
     add_custom_command(
         OUTPUT gudhi.so
diff --git a/src/python/doc/clustering.inc b/src/python/doc/clustering.inc
new file mode 100644
index 00000000..b845e1a7
--- /dev/null
+++ b/src/python/doc/clustering.inc
@@ -0,0 +1,12 @@
+.. table::
+   :widths: 30 50 20
+
+   +-------------------------+------------------------------------------------------------------------+---------------------------------+
+   |                         | Clustering tools.                                                      | :Author: Marc Glisse            |
+   |                         |                                                                        |                                 |
+   |                         |                                                                        | :Introduced in: GUDHI 3.2.0     |
+   |                         |                                                                        |                                 |
+   |                         |                                                                        | :Copyright: MIT                 |
+   +-------------------------+------------------------------------------------------------------------+---------------------------------+
+   | * :doc:`clustering`                                                                                                                |
+   +-------------------------+----------------------------------------------------------------------------------------------------------+
diff --git a/src/python/doc/clustering.rst b/src/python/doc/clustering.rst
new file mode 100644
index 00000000..dc9de968
--- /dev/null
+++ b/src/python/doc/clustering.rst
@@ -0,0 +1,73 @@
+:orphan:
+
+.. To get rid of WARNING: document isn't included in any toctree
+
+=================
+Clustering manual
+=================
+
+We provide an implementation of ToMATo :cite:`tomato`, a persistence-based clustering algorithm. In short, this algorithm uses a density estimator and a neighborhood graph, starts with a mode-seeking phase (naive hill-climbing) to build initial clusters, and finishes by merging clusters based on their prominence.
+
+The merging phase depends on a parameter, which is the minimum prominence a cluster needs to avoid getting merged into another, bigger cluster. This parameter determines the number of clusters, and for convenience we allow you to choose instead the number of clusters. Decreasing the prominence threshold defines a hierarchy of clusters: if 2 points are in separate clusters when we have k clusters, they are still in different clusters for k+1 clusters.
+
+As a by-product, we produce the persistence diagram of the merge tree of the initial clusters. This is a convenient graphical tool to help decide how many clusters we want.
+
+.. code-block::
+
+   import gudhi
+   f = open(gudhi.__root_source_dir__ + '/data/points/spiral_2d.csv', 'r')
+   import numpy as np
+   data = np.loadtxt(f)
+   import matplotlib.pyplot as plt
+   plt.scatter(data[:,0],data[:,1],marker='.',s=1)
+   plt.show()
+
+.. image:: img/spiral.png
+
+.. code-block::
+
+   from gudhi.clustering.tomato import Tomato
+   t = Tomato()
+   t.fit(data)
+   t.plot_diagram()
+
+.. image:: img/spiral-diag.png
+
+As one can see in `t.n_clusters_`, the algorithm found 6316 initial clusters. The diagram shows their prominence as their distance to the diagonal. There is always one point infinitely far: there is at least one cluster. Among the others, one point seems significantly farther from the diagonal than the others, which indicates that splitting the points into 2 clusters may be a sensible idea.
+
+.. code-block::
+
+   t.n_clusters_=2
+   plt.scatter(data[:,0],data[:,1],marker='.',s=1,c=["r" if i else "b" for i in t.labels_])
+   plt.show()
+
+.. image:: img/spiral-color.png
+
+Of course this is just the result for one set of parameters. We can ask for a different density estimator and a different neighborhood graph computed with different parameters.
+
+.. code-block::
+
+   t = Tomato(density_type='DTM', k=100)
+   t.fit(data)
+   t.plot_diagram()
+
+Makes the number of clusters clearer, and changes a bit the shape of the clusters.
+
+.. image:: img/spiral-diag2.png
+
+A quick look at the corresponding density estimate (`weights_` is not officially supported)
+
+.. code-block::
+
+   plt.scatter(data[:,0],data[:,1],marker='.',s=1,c=t.weights_)
+   plt.show()
+
+.. image:: img/spiral-density.png
+
+The code provides a few density estimators and graph constructions for convenience when first experimenting, but it is actually expected that advanced users provide their own graph and density estimates instead of point coordinates.
+
+Since the algorithm essentially computes basins of attraction, it is also encouraged to use it on functions that do not represent densities at all.
+
+.. autoclass:: gudhi.clustering.tomato.Tomato
+   :members:
+   :special-members: __init__
diff --git a/src/python/doc/img/spiral-color.png b/src/python/doc/img/spiral-color.png
new file mode 100644
index 00000000..21b62dfc
--- /dev/null
+++ b/src/python/doc/img/spiral-color.png
diff --git a/src/python/doc/img/spiral-density.png b/src/python/doc/img/spiral-density.png
new file mode 100644
index 00000000..a36cc882
--- /dev/null
+++ b/src/python/doc/img/spiral-density.png
diff --git a/src/python/doc/img/spiral-diag.png b/src/python/doc/img/spiral-diag.png
new file mode 100644
index 00000000..018ef88d
--- /dev/null
+++ b/src/python/doc/img/spiral-diag.png
diff --git a/src/python/doc/img/spiral-diag2.png b/src/python/doc/img/spiral-diag2.png
new file mode 100644
index 00000000..fc5f2100
--- /dev/null
+++ b/src/python/doc/img/spiral-diag2.png
diff --git a/src/python/doc/img/spiral.png b/src/python/doc/img/spiral.png
new file mode 100644
index 00000000..2d34a523
--- /dev/null
+++ b/src/python/doc/img/spiral.png
diff --git a/src/python/doc/index.rst b/src/python/doc/index.rst
index 13e51047..56c58e0d 100644
--- a/src/python/doc/index.rst
+++ b/src/python/doc/index.rst
@@ -86,3 +86,8 @@ Point cloud utilities
 *********************
 
 .. include:: point_cloud_sum.inc
+
+Clustering
+**********
+
+.. include:: clustering.inc
diff --git a/src/python/gudhi/clustering/__init__.py b/src/python/gudhi/clustering/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/python/gudhi/clustering/__init__.py
diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
new file mode 100644
index 00000000..87bd62e9
--- /dev/null
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -0,0 +1,307 @@
+// g++ -O3 -Wall -shared -fPIC `python3 -m pybind11 --includes` XXX.cpp -o tomato`python3-config --extension-suffix`
+#include <boost/container/flat_map.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/property_map/property_map.hpp>
+#include <boost/property_map/transform_value_property_map.hpp>
+#include <boost/property_map/vector_property_map.hpp>
+#include <boost/property_map/function_property_map.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <vector>
+#include <unordered_map>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <iostream>
+
+namespace py = pybind11;
+
+template <class T, class = std::enable_if_t<std::is_integral<T>::value>>
+int getint(int i) {
+  return i;
+}
+// Gcc-8 has a bug that breaks this version, fixed in gcc-9
+// template<class T, class=decltype(std::declval<T>().template cast<int>())>
+// int getint(T i){return i.template cast<int>();}
+template <class T>
+auto getint(T i) -> decltype(i.template cast<int>()) {
+  return i.template cast<int>();
+}
+
+// Raw clusters are clusters obtained through single-linkage, no merging.
+
+typedef int Point_index;
+typedef int Cluster_index;
+struct Merge {
+  Cluster_index first, second;
+  double persist;
+};
+
+template <class Neighbors, class Density, class Order, class ROrder>
+auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
+            ROrder const& rorder) {
+  // point index --> index of raw cluster it belongs to
+  std::vector<Cluster_index> raw_cluster;
+  raw_cluster.reserve(num_points);
+  // cluster index --> index of top point in the cluster
+  Cluster_index n_raw_clusters = 0;  // current number of raw clusters seen
+  //
+  std::vector<Merge> merges;
+  struct Data {
+    Cluster_index parent;
+    int rank;
+    Point_index max;
+  };  // information on a cluster
+  std::vector<Data> ds_base;
+  // boost::vector_property_map does resize(size+1) for every new element, don't use it
+  auto ds_data =
+      boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n) -> Data& { return ds_base[n]; });
+  auto ds_parent =
+      boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_data);
+  auto ds_rank = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_data);
+  boost::disjoint_sets<decltype(ds_rank), decltype(ds_parent)> ds(
+      ds_rank, ds_parent);                         // on the clusters, not directly the points
+  std::vector<std::array<double, 2>> persistence;  // diagram (finite points)
+  boost::container::flat_map<Cluster_index, Cluster_index>
+      adj_clusters;  // first: the merged cluster, second: the raw cluster
+  // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only
+  // insert in the vector if merged is absent from the set
+
+  for (Point_index i = 0; i < num_points; ++i) {
+    // auto&& ngb = neighbors[order[i]];
+    // TODO: have a specialization where we don't need python types and py::cast
+    // TODO: move py::cast and getint into Neighbors
+    py::object ngb = neighbors[py::cast(order[i])];  // auto&& also seems to work
+    adj_clusters.clear();
+    Point_index j = i;  // highest neighbor
+    // for(Point_index k : ngb)
+    for (auto k_py : ngb) {
+      Point_index k = rorder[getint(k_py)];
+      if (k >= i || k < 0)  // ???
+        continue;
+      if (k < j) j = k;
+      Cluster_index rk = raw_cluster[k];
+      adj_clusters.emplace(ds.find_set(rk), rk);
+      // does not insert if ck=ds.find_set(rk) already seen
+      // which rk we keep from those with the same ck is arbitrary
+    }
+    assert((Point_index)raw_cluster.size() == i);
+    if (i == j) {  // local maximum -> new cluster
+      Cluster_index c = n_raw_clusters++;
+      ds_base.emplace_back();  // could be integrated in ds_data, but then we would check the size for every access
+      ds.make_set(c);
+      ds_base[c].max = i;  // max
+      raw_cluster.push_back(c);
+      continue;
+    } else {  // add i to the cluster of j
+      assert(j < i);
+      raw_cluster.push_back(raw_cluster[j]);
+      // FIXME: we are adding point i to the raw cluster of j, but that one might not be in adj_clusters, so we may
+      // merge clusters A and B through a point of C. It is strange, but I don't know if it can really cause problems.
+      // We could just not set j at all and use arbitrarily the first element of adj_clusters.
+    }
+    // possibly merge clusters
+    // we could sort, in case there are several merges, but that doesn't seem so useful
+    Cluster_index rj = raw_cluster[j];
+    Cluster_index cj = ds.find_set(rj);
+    Cluster_index orig_cj = cj;
+    for (auto ckk : adj_clusters) {
+      Cluster_index rk = ckk.second;
+      Cluster_index ck = ckk.first;
+      if (ck == orig_cj) continue;
+      assert(ck == ds.find_set(rk));
+      Point_index j = ds_base[cj].max;
+      Point_index k = ds_base[ck].max;
+      Point_index young = std::max(j, k);
+      Point_index old = std::min(j, k);
+      auto d_young = density[order[young]];
+      auto d_i = density[order[i]];
+      assert(d_young >= d_i);
+      // Always merge (the non-hierarchical algorithm would only conditionally merge here
+      persistence.push_back({d_young, d_i});
+      assert(ds.find_set(rj) != ds.find_set(rk));
+      ds.link(cj, ck);
+      cj = ds.find_set(cj);
+      ds_base[cj].max = old;  // just one parent, no need for find_set
+      // record the raw clusters, we don't know what will have already been merged.
+      merges.push_back({rj, rk, d_young - d_i});
+    }
+  }
+  {
+    boost::counting_iterator<int> b(0), e(ds_base.size());
+    ds.compress_sets(b, e);
+    // Now we stop using find_sets and look at the parent directly
+    // rank is reused to rename clusters contiguously 0, 1, etc
+  }
+  // Maximum for each connected component
+  std::vector<double> max_cc;
+  // for(Point_index i = 0; i < num_points; ++i){
+  //  if(ds_base[ds_base[raw_cluster[i]].parent].max == i)
+  //    max_cc.push_back(density[order[i]]);
+  //}
+  for (Cluster_index i = 0; i < n_raw_clusters; ++i) {
+    if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]);
+  }
+  assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters);
+
+  // TODO: create a "noise" cluster, merging all those not prominent enough?
+  // int nb_clusters=0;
+  // for(int i=0;i<(int)ds_base.size();++i){
+  //  if(ds_parent[i]!=i) continue;
+  //  ds_data[i].rank=nb_clusters++;
+  //}
+
+  // Replay the merges, in increasing order of prominence, to build the hierarchy
+  std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; });
+  std::vector<std::array<Cluster_index, 2>> children;
+  children.reserve(merges.size());
+  {
+    struct Dat {
+      Cluster_index parent;
+      int rank;
+      Cluster_index name;
+    };
+    std::vector<Dat> ds_bas(2 * n_raw_clusters - 1);
+    Cluster_index i;
+    auto ds_dat =
+        boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+    auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+    auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+    boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
+    for (i = 0; i < n_raw_clusters; ++i) {
+      ds.make_set(i);
+      ds_bas[i].name = i;
+    }
+    for (Merge const& m : merges) {
+      Cluster_index j = ds.find_set(m.first);
+      Cluster_index k = ds.find_set(m.second);
+      assert(j != k);
+      children.push_back({ds_bas[j].name, ds_bas[k].name});
+      ds.make_set(i);
+      ds.link(i, j);
+      ds.link(i, k);
+      ds_bas[ds.find_set(i)].name = i;
+      ++i;
+    }
+  }
+
+  std::vector<Cluster_index> raw_cluster_ordered(num_points);
+  for (int i = 0; i < num_points; ++i) raw_cluster_ordered[i] = raw_cluster[rorder[i]];
+  // return raw_cluster, children, persistence
+  // TODO avoid copies: https://github.com/pybind/pybind11/issues/1042
+  return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()),
+                        py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()),
+                        py::array(max_cc.size(), max_cc.data()));
+}
+
+auto merge(py::array_t<Cluster_index, py::array::c_style> children, Cluster_index n_leaves, Cluster_index n_final) {
+  // Should this really be an error?
+  if (n_final > n_leaves)
+    throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters");
+  py::buffer_info cbuf = children.request();
+  if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0))
+    throw std::runtime_error("internal error: children have to be (n,2) or empty");
+  const int n_merges = cbuf.shape[0];
+  Cluster_index* d = (Cluster_index*)cbuf.ptr;
+  // Should this really be an error?
+  // std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n';
+  if (n_merges + n_final < n_leaves)
+    throw std::runtime_error(std::string("The number of clusters required ") + std::to_string(n_final) +
+                             " is smaller than the number of connected components " +
+                             std::to_string(n_leaves - n_merges));
+
+  struct Dat {
+    Cluster_index parent;
+    int rank;
+    int name;
+  };
+  std::vector<Dat> ds_bas(2 * n_leaves - 1);
+  auto ds_dat = boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+  auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+  auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+  boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
+  Cluster_index i;
+  for (i = 0; i < n_leaves; ++i) {
+    ds.make_set(i);
+    ds_bas[i].name = -1;
+  }
+  for (Cluster_index m = 0; m < n_leaves - n_final; ++m) {
+    Cluster_index j = ds.find_set(d[2 * m]);
+    Cluster_index k = ds.find_set(d[2 * m + 1]);
+    assert(j != k);
+    ds.make_set(i);
+    ds.link(i, j);
+    ds.link(i, k);
+    ++i;
+  }
+  Cluster_index next_cluster_name = 0;
+  std::vector<Cluster_index> ret;
+  ret.reserve(n_leaves);
+  for (Cluster_index j = 0; j < n_leaves; ++j) {
+    Cluster_index k = ds.find_set(j);
+    if (ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
+    ret.push_back(ds_bas[k].name);
+  }
+  return py::array(ret.size(), ret.data());
+}
+
+// Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
+auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
+  // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
+  // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to []
+  // (I _think_ handle is ok and we don't need object here)
+  py::buffer_info wbuf = density.request();
+  if (wbuf.ndim != 1) throw std::runtime_error("density must be 1D");
+  const int n = wbuf.shape[0];
+  double* d = (double*)wbuf.ptr;
+  // Vector { 0, 1, ..., n-1 }
+  std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n));
+  // Permutation of the indices to get points in decreasing order of density
+  std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j) { return d[i] > d[j]; });
+  // Inverse permutation
+  std::vector<Point_index> rorder(n);
+  for (Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
+  // Used as:
+  // order[i] is the index of the point with i-th largest density
+  // rorder[i] is the rank of the i-th point in order of decreasing density
+  // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the
+  // output)
+  return tomato(n, ngb, d, order, rorder);
+}
+#if 0
+struct Vec {
+  int const* base;
+  int k;
+  int const* begin()const{return base;}
+  int const* end()const{return base+k;}
+  Vec operator*()const{return *this;}
+};
+auto plaf(py::array_t<int, py::array::c_style | py::array::forcecast> ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density, double threshold){
+  py::buffer_info nbuf = ngb.request();
+  if(nbuf.ndim!=2)
+    throw std::runtime_error("neighbors must be 2D");
+  const int n=nbuf.shape[0];
+  const int k=nbuf.shape[1];
+  int*nptr=(int*)nbuf.ptr;
+  auto neighbors=boost::adaptors::transform(boost::irange(0,n),[=](int i){return Vec{nptr+i*k,k};});
+  py::buffer_info wbuf = density.request();
+  if(wbuf.ndim!=1)
+    throw std::runtime_error("density must be 1D");
+  if(n!=wbuf.shape[0])
+    throw std::runtime_error("incompatible sizes");
+  double*d=(double*)wbuf.ptr;
+  return tomato(n,neighbors,d);
+}
+#endif
+PYBIND11_MODULE(_tomato, m) {
+  m.doc() = "Internals of tomato clustering";
+  m.def("doit", &plouf, "does the clustering");
+  // m.def("doit2", &plaf, "does the clustering faster");
+  m.def("merge", &merge, "merge clusters");
+}
+
+// https://github.com/pybind/pybind11/issues/1042 pour convertir vector en numpy array
+//
+// py::isinstance<py::array_t<std::int32_t>> (ou py::isinstance<py::array> et tester dtype) et flags&c_style
+// ou overload (en virant forcecast?)
+// aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ?
diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
new file mode 100644
index 00000000..1a2887bc
--- /dev/null
+++ b/src/python/gudhi/clustering/tomato.py
@@ -0,0 +1,419 @@
+import numpy
+from ._tomato import *
+
+# The fit/predict interface is not so well suited...
+# TODO: option for a faster, weaker (probabilistic) knn
+
+
+class Tomato:
+    """
+    Clustering
+
+    This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point. A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural to provide your own. In particular, we do not provide anything specific to cluster pixels on images yet.
+
+    Attributes
+    ----------
+    n_clusters_: int
+        The number of clusters. Writing to it automatically adjusts labels_.
+    merge_threshold_: float
+        minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts labels_.
+    n_leaves_: int
+        number of leaves (unstable clusters) in the hierarchical tree
+    leaf_labels_: ndarray of shape (n_samples)
+        cluster labels for each point, at the very bottom of the hierarchy
+    labels_: ndarray of shape (n_samples)
+        cluster labels for each point, after merging
+    diagram_: ndarray of shape (n_leaves_,2)
+        persistence diagram (only the finite points)
+    children_: ndarray of shape (n_leaves_-1,2)
+        The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i
+    params_: dict
+        Parameters like input_type, metric, etc
+    """
+
+    # Not documented for now, because I am not sure how useful it is.
+    #    max_density_per_cc_: ndarray of shape (n_connected_components)
+    #        maximum of the density function on each connected component
+
+    def __init__(
+        self,
+        input_type="points",
+        metric=None,
+        graph_type="knn",
+        density_type="logDTM",
+        n_clusters=None,
+        merge_threshold=None,
+        #       eliminate_threshold=None,
+        #           eliminate_threshold (float): minimum max weight of a cluster so it doesn't get eliminated
+        **params
+    ):
+        """
+        Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later.
+
+        Args:
+            input_type (str): 'points', 'distance_matrix' or 'neighbors'.
+            metric (None|Callable): If None, use Minkowski of parameter p.
+            graph_type (str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'.
+            density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'.
+            kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity.
+            k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10.
+            k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k.
+            r (float): size of a neighborhood if graph_type is 'radius'.
+            eps (float): approximation factor when computing nearest neighbors (currently ignored with a GPU).
+            gpu (bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million).
+            n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get the maximal number of clusters.
+            merge_threshold (float): minimum prominence of a cluster so it doesn't get merged.
+            symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false.
+            p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2.
+            p_DTM (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'.
+            n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1.
+        """
+        # Should metric='precomputed' mean input_type='distance_matrix'?
+        # Should we be able to pass metric='minkowski' (what None does currently)?
+        self.input_type_ = input_type
+        self.metric_ = metric
+        self.graph_type_ = graph_type
+        self.density_type_ = density_type
+        self.params_ = params
+        self.__n_clusters = n_clusters
+        self.__merge_threshold = merge_threshold
+        # self.eliminate_threshold_ = eliminate_threshold
+        if n_clusters and merge_threshold:
+            raise ValueError("Cannot specify both a merge threshold and a number of clusters")
+
+    def fit(self, X, y=None, weights=None):
+        """
+        Args:
+            X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance_matrix (full, not just a triangle), or list of neighbors for each point (points are represented by their index, starting from 0), according to input_type.
+            weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point
+        """
+        # TODO: First detect if this is a new call with the same data (only threshold changed?)
+        # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters.
+        if weights is not None:
+            density_type = "manual"
+        else:
+            density_type = self.density_type_
+            assert density_type != "manual"
+            if density_type == "manual":
+                raise ValueError("If density_type is 'manual', you must provide weights to fit()")
+
+        input_type = self.input_type_
+        if input_type == "points":
+            self.points_ = X
+        if input_type == "points" and self.metric_:
+            from sklearn.metrics import pairwise_distances
+
+            X = pairwise_distances(X, metric=self.metric_, n_jobs=self.params_.get("n_jobs"))
+            input_type = "distance_matrix"
+
+        if input_type == "distance_matrix" and self.graph_type_ == "radius":
+            X = numpy.array(X)
+            r = self.params_["r"]
+            self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X]
+
+        if input_type == "distance_matrix" and self.graph_type_ == "knn":
+            k = self.params_["k"]
+            self.neighbors_ = numpy.argpartition(X, k - 1)[:, 0:k]
+
+        if input_type == "neighbors":
+            self.neighbors_ = X
+            assert density_type == "manual"
+
+        if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}:
+            q = self.params_.get("p_DTM", len(X[0]))
+            p = self.params_.get("p", 2)
+            k = self.params_.get("k", 10)
+            k_DTM = self.params_.get("k_DTM", k)
+            kmax = max(k, k_DTM)
+            if self.params_.get("gpu"):
+                import torch
+                from pykeops.torch import LazyTensor
+
+                # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+                XX = torch.tensor(self.points_, dtype=torch.float32)
+                if p == numpy.inf:
+                    # Requires a version of pykeops strictly more recent than 1.3
+                    dd, nn = (
+                        (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :]))
+                        .abs()
+                        .max(-1)
+                        .Kmin_argKmin(kmax, dim=1)
+                    )
+                elif p == 2:  # Any even integer?
+                    dd, nn = (
+                        ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p)
+                        .sum(-1)
+                        .Kmin_argKmin(kmax, dim=1)
+                    )
+                else:
+                    dd, nn = (
+                        ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p)
+                        .sum(-1)
+                        .Kmin_argKmin(kmax, dim=1)
+                    )
+                if k < kmax:
+                    nn = nn[:, 0:k]
+                if k_DTM < kmax:
+                    dd = dd[:, 0:k_DTM]
+                assert q != numpy.inf  # for now
+                if p != numpy.inf:
+                    qp = float(q) / p
+                else:
+                    qp = q
+                if qp != 1:
+                    dd = dd ** qp
+                weights = dd.sum(-1)
+
+                # Back to the CPU. Not sure this is necessary, or the right way to do it.
+                weights = numpy.array(weights)
+                self.neighbors_ = numpy.array(nn)
+            else:  # CPU
+                from scipy.spatial import cKDTree
+
+                kdtree = cKDTree(self.points_)
+                qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}}
+                dd, self.neighbors_ = kdtree.query(self.points_, k=kmax, p=p, **qargs)
+                if k < kmax:
+                    self.neighbors_ = self.neighbors_[:, 0:k]
+                if k_DTM < kmax:
+                    dd = dd[:, 0:k_DTM]
+                # weights = numpy.linalg.norm(dd, axis=1, ord=q)
+                weights = (dd ** q).sum(-1)
+
+            if self.density_type_ == "DTM":
+                # We ignore constant factors, which don't matter for
+                # clustering, although they do change thresholds
+                dim = len(self.points_[0])
+                weights = weights ** (-dim / q)
+            else:
+                # We ignore exponents, which become constant factors with log
+                weights = -numpy.log(weights)
+
+        if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}:
+            p = self.params_.get("p", 2)
+            k = self.params_.get("k", 10)
+            if self.params_.get("gpu"):
+                import torch
+                from pykeops.torch import LazyTensor
+
+                # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+                XX = torch.tensor(self.points_, dtype=torch.float32)
+                if p == numpy.inf:
+                    # Requires a version of pykeops strictly more recent than 1.3
+                    nn = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).argKmin(k, dim=1)
+                elif p == 2:  # Any even integer?
+                    nn = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).argKmin(k, dim=1)
+                else:
+                    nn = (
+                        ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p).sum(-1).argKmin(k, dim=1)
+                    )
+                # Back to the CPU. Not sure this is necessary, or the right way to do it.
+                self.neighbors_ = numpy.array(nn)
+            else:  # CPU
+                from scipy.spatial import cKDTree
+
+                kdtree = cKDTree(self.points_)
+                # FIXME 'p'
+                qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}}
+                _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs)
+
+        if input_type == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}:
+            q = self.params_.get("p_DTM", len(X[0]))
+            p = self.params_.get("p", 2)
+            k = self.params_.get("k", 10)
+            k_DTM = self.params_.get("k_DTM", k)
+            if self.params_.get("gpu"):
+                import torch
+                from pykeops.torch import LazyTensor
+
+                # 'float64' is slow except on super expensive GPUs. Allow it with some param?
+                XX = torch.tensor(self.points_, dtype=torch.float32)
+                if p == numpy.inf:
+                    dd = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).Kmin(k_DTM, dim=1)
+                elif p == 2:  # Any even integer?
+                    dd = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).Kmin(k_DTM, dim=1)
+                else:
+                    dd = (
+                        ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p)
+                        .sum(-1)
+                        .Kmin(k_DTM, dim=1)
+                    )
+                assert q != numpy.inf  # for now
+                if p != numpy.inf:
+                    qp = float(q) / p
+                else:
+                    qp = q
+                if qp != 1:
+                    dd = dd ** qp
+                weights = dd.sum(-1)
+                # **1/q is a waste of time, whether we take another **-.25 or a log
+
+                # Back to the CPU. Not sure this is necessary, or the right way to do it.
+                weights = numpy.array(weights)
+            else:  # CPU
+                from scipy.spatial import cKDTree
+
+                kdtree = cKDTree(self.points_)
+                qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}}
+                dd, _ = kdtree.query(self.points_, k=k_DTM, p=p, **qargs)
+                # weights = numpy.linalg.norm(dd, axis=1, ord=q)
+                weights = (dd ** q).sum(-1)
+
+            if self.density_type_ == "DTM":
+                dim = len(self.points_[0])
+                weights = weights ** (-dim / q)
+            else:
+                weights = -numpy.log(weights)
+
+        if input_type == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}:
+            q = self.params_.get("p_DTM", 2)
+            X = numpy.array(X)
+            k = self.params_.get("k_DTM")
+            if not k:
+                k = self.params_["k"]
+            weights = (numpy.partition(X, k - 1)[:, 0:k] ** q).sum(-1)
+            if self.density_type_ == "DTM":
+                try:
+                    dim = len(self.points_[0])
+                except AttributeError:
+                    dim = 2
+                weights = weights ** (-dim / q)
+            else:
+                weights = -numpy.log(weights)
+
+        if self.density_type_ in {"KDE", "logKDE"}:
+            # FIXME: replace most assert with raise ValueError("blabla")
+            # assert input_type == "points"
+            kde_params = self.params_.get("kde_params", dict())
+            from sklearn.neighbors import KernelDensity
+            weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_)
+            if self.density_type_ == "KDE":
+                weights = numpy.exp(weights)
+
+        if self.params_.get("symmetrize_graph"):
+            self.neighbors_ = [set(line) for line in self.neighbors_]
+            for i, line in enumerate(self.neighbors_):
+                line.discard(i)
+                for j in line:
+                    self.neighbors_[j].add(i)
+
+        self.weights_ = weights  # TODO remove
+        self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = doit(
+            list(self.neighbors_), weights
+        )
+        self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_)
+        assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_)
+        if self.__merge_threshold:
+            assert not self.__n_clusters
+            self.__n_clusters = numpy.count_nonzero(
+                self.diagram_[:, 0] - self.diagram_[:, 1] > self.__merge_threshold
+            ) + len(self.max_density_per_cc_)
+        if self.__n_clusters:
+            renaming = merge(self.children_, self.n_leaves_, self.__n_clusters)
+            self.labels_ = renaming[self.leaf_labels_]
+        else:
+            self.labels_ = self.leaf_labels_
+            self.__n_clusters = self.n_leaves_
+        return self
+
+    def fit_predict(self, X, y=None, weights=None):
+        """
+        Equivalent to fit(), and returns the `labels_`.
+        """
+        return self.fit(X, y, weights).labels_
+
+    # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k?
+    def plot_diagram(self):
+        """
+        """
+        import matplotlib.pyplot as plt
+
+        if self.diagram_.size > 0:
+            plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro")
+            l = self.diagram_[:, 1].min()
+            r = max(self.diagram_[:, 0].max(), self.max_density_per_cc_.max())
+        else:
+            l = self.max_density_per_cc_.min()
+            r = self.max_density_per_cc_.max()
+            if l == r:
+                if l > 0:
+                    l, r = .9 * l, 1.1 * r
+                elif l < 0:
+                    l, r = 1.1 * l, .9 * r
+                else:
+                    l, r = -1., 1.
+        plt.plot([l, r], [l, r])
+        plt.plot(
+            self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green"
+        )
+        plt.show()
+
+    #    def predict(self, X):
+    #        # X had better be the same as in fit()
+    #        return self.labels_
+
+    # Use set_params instead?
+    @property
+    def n_clusters_(self):
+        return self.__n_clusters
+
+    @n_clusters_.setter
+    def n_clusters_(self, n_clusters):
+        if n_clusters == self.__n_clusters:
+            return
+        self.__n_clusters = n_clusters
+        self.__merge_threshold = None
+        if hasattr(self, "leaf_labels_"):
+            renaming = merge(self.children_, self.n_leaves_, self.__n_clusters)
+            self.labels_ = renaming[self.leaf_labels_]
+
+    @property
+    def merge_threshold_(self):
+        return self.__merge_threshold
+
+    @merge_threshold_.setter
+    def merge_threshold_(self, merge_threshold):
+        if merge_threshold == self.__merge_threshold:
+            return
+        if hasattr(self, "leaf_labels_"):
+            self.n_clusters_ = numpy.count_nonzero(self.diagram_[:, 0] - self.diagram_[:, 1] > merge_threshold) + len(
+                self.max_density_per_cc_
+            )
+        else:
+            self.__n_clusters = None
+        self.__merge_threshold = merge_threshold
+
+
+if __name__ == "__main__":
+    import sys
+
+    a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)]
+    a = numpy.random.rand(500, 2)
+    t = Tomato(
+        input_type="points",
+        metric="Euclidean",
+        graph_type="knn",
+        density_type="DTM",
+        n_clusters=2,
+        k=4,
+        n_jobs=-1,
+        eps=0.05,
+    )
+    t.fit(a)
+    # print("neighbors\n",t.neighbors_)
+    # print()
+    # print("weights\n",t.weights_)
+    # print()
+    # print("diagram\n",t.diagram_)
+    # print()
+    print("max\n", t.max_density_per_cc_, file=sys.stderr)
+    # print()
+    print("leaf labels\n", t.leaf_labels_)
+    # print()
+    print("labels\n", t.labels_)
+    # print()
+    print("children\n", t.children_)
+    # print()
+    t.n_clusters_ = 2
+    print("labels\n", t.labels_)
+    t.plot_diagram()
diff --git a/src/python/setup.py.in b/src/python/setup.py.in
index b9f4e3f0..5e26079a 100644
--- a/src/python/setup.py.in
+++ b/src/python/setup.py.in
@@ -51,7 +51,7 @@ for module in pybind11_modules:
     if module == 'hera':
         my_include_dirs = ['@HERA_WASSERSTEIN_INCLUDE_DIR@'] + my_include_dirs
     ext_modules.append(Extension(
-        'gudhi.' + module,
+        'gudhi.' + module.replace('/', '.'),
         sources = [source_dir + module + '.cc'],
         language = 'c++',
         include_dirs = my_include_dirs,