src/python/gudhi/clustering/_tomato.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277

/*    This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
 *    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
 *    Author(s):       Marc Glisse
 *
 *    Copyright (C) 2020 Inria
 *
 *    Modification(s):
 *      - YYYY/MM Author: Description of the modification
 */

#include <boost/container/flat_map.hpp>
#include <boost/pending/disjoint_sets.hpp>
#include <boost/property_map/property_map.hpp>
#include <boost/property_map/transform_value_property_map.hpp>
#include <boost/property_map/vector_property_map.hpp>
#include <boost/property_map/function_property_map.hpp>
#include <boost/iterator/counting_iterator.hpp>
#include <boost/range/irange.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <vector>
#include <unordered_map>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <iostream>

namespace py = pybind11;

template <class T, class = std::enable_if_t<std::is_integral<T>::value>>
int getint(int i) {
  return i;
}
// Gcc-8 has a bug that breaks this version, fixed in gcc-9
// template<class T, class=decltype(std::declval<T>().template cast<int>())>
// int getint(T i){return i.template cast<int>();}
template <class T>
auto getint(T i) -> decltype(i.template cast<int>()) {
  return i.template cast<int>();
}

// Raw clusters are clusters obtained through single-linkage, no merging.

typedef int Point_index;
typedef int Cluster_index;
struct Merge {
  Cluster_index first, second;
  double persist;
};

template <class Neighbors, class Density, class Order, class ROrder>
auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
            ROrder const& rorder) {
  // point index --> index of raw cluster it belongs to
  std::vector<Cluster_index> raw_cluster;
  raw_cluster.reserve(num_points);
  // cluster index --> index of top point in the cluster
  Cluster_index n_raw_clusters = 0;  // current number of raw clusters seen
  //
  std::vector<Merge> merges;
  struct Data {
    Cluster_index parent;
    int rank;
    Point_index max;
  };  // information on a cluster
  std::vector<Data> ds_base;
  // boost::vector_property_map does resize(size+1) for every new element, don't use it
  auto ds_data =
      boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n) -> Data& { return ds_base[n]; });
  auto ds_parent =
      boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_data);
  auto ds_rank = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_data);
  boost::disjoint_sets<decltype(ds_rank), decltype(ds_parent)> ds(
      ds_rank, ds_parent);                         // on the clusters, not directly the points
  std::vector<std::array<double, 2>> persistence;  // diagram (finite points)
  boost::container::flat_map<Cluster_index, Cluster_index>
      adj_clusters;  // first: the merged cluster, second: the raw cluster
  // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only
  // insert in the vector if merged is absent from the set

  for (Point_index i = 0; i < num_points; ++i) {
    // auto&& ngb = neighbors[order[i]];
    // TODO: have a specialization where we don't need python types and py::cast
    // TODO: move py::cast and getint into Neighbors
    py::object ngb = neighbors[py::cast(order[i])];  // auto&& also seems to work
    adj_clusters.clear();
    Point_index j = i;  // highest neighbor
    // for(Point_index k : ngb)
    for (auto k_py : ngb) {
      Point_index k = rorder[getint(k_py)];
      if (k >= i || k < 0)  // ???
        continue;
      if (k < j) j = k;
      Cluster_index rk = raw_cluster[k];
      adj_clusters.emplace(ds.find_set(rk), rk);
      // does not insert if ck=ds.find_set(rk) already seen
      // which rk we keep from those with the same ck is arbitrary
    }
    assert((Point_index)raw_cluster.size() == i);
    if (i == j) {  // local maximum -> new cluster
      Cluster_index c = n_raw_clusters++;
      ds_base.emplace_back();  // could be integrated in ds_data, but then we would check the size for every access
      ds.make_set(c);
      ds_base[c].max = i;  // max
      raw_cluster.push_back(c);
      continue;
    } else {  // add i to the cluster of j
      assert(j < i);
      raw_cluster.push_back(raw_cluster[j]);
      // FIXME: we are adding point i to the raw cluster of j, but that one might not be in adj_clusters, so we may
      // merge clusters A and B through a point of C. It is strange, but I don't know if it can really cause problems.
      // We could just not set j at all and use arbitrarily the first element of adj_clusters.
    }
    // possibly merge clusters
    // we could sort, in case there are several merges, but that doesn't seem so useful
    Cluster_index rj = raw_cluster[j];
    Cluster_index cj = ds.find_set(rj);
    Cluster_index orig_cj = cj;
    for (auto ckk : adj_clusters) {
      Cluster_index rk = ckk.second;
      Cluster_index ck = ckk.first;
      if (ck == orig_cj) continue;
      assert(ck == ds.find_set(rk));
      Point_index j = ds_base[cj].max;
      Point_index k = ds_base[ck].max;
      Point_index young = std::max(j, k);
      Point_index old = std::min(j, k);
      auto d_young = density[order[young]];
      auto d_i = density[order[i]];
      assert(d_young >= d_i);
      // Always merge (the non-hierarchical algorithm would only conditionally merge here
      persistence.push_back({d_young, d_i});
      assert(ds.find_set(rj) != ds.find_set(rk));
      ds.link(cj, ck);
      cj = ds.find_set(cj);
      ds_base[cj].max = old;  // just one parent, no need for find_set
      // record the raw clusters, we don't know what will have already been merged.
      merges.push_back({rj, rk, d_young - d_i});
    }
  }
  {
    boost::counting_iterator<int> b(0), e(ds_base.size());
    ds.compress_sets(b, e);
    // Now we stop using find_sets and look at the parent directly
    // rank is reused to rename clusters contiguously 0, 1, etc
  }
  // Maximum for each connected component
  std::vector<double> max_cc;
  for (Cluster_index i = 0; i < n_raw_clusters; ++i) {
    if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]);
  }
  assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters);

  // TODO: create a "noise" cluster, merging all those not prominent enough?

  // Replay the merges, in increasing order of prominence, to build the hierarchy
  std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; });
  std::vector<std::array<Cluster_index, 2>> children;
  children.reserve(merges.size());
  {
    struct Dat {
      Cluster_index parent;
      int rank;
      Cluster_index name;
    };
    std::vector<Dat> ds_bas(2 * n_raw_clusters - 1);
    Cluster_index i;
    auto ds_dat =
        boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
    auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
    auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
    boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
    for (i = 0; i < n_raw_clusters; ++i) {
      ds.make_set(i);
      ds_bas[i].name = i;
    }
    for (Merge const& m : merges) {
      Cluster_index j = ds.find_set(m.first);
      Cluster_index k = ds.find_set(m.second);
      assert(j != k);
      children.push_back({ds_bas[j].name, ds_bas[k].name});
      ds.make_set(i);
      ds.link(i, j);
      ds.link(i, k);
      ds_bas[ds.find_set(i)].name = i;
      ++i;
    }
  }

  std::vector<Cluster_index> raw_cluster_ordered(num_points);
  for (int i = 0; i < num_points; ++i) raw_cluster_ordered[i] = raw_cluster[rorder[i]];
  // return raw_cluster, children, persistence
  // TODO avoid copies: https://github.com/pybind/pybind11/issues/1042
  return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()),
                        py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()),
                        py::array(max_cc.size(), max_cc.data()));
}

auto merge(py::array_t<Cluster_index, py::array::c_style> children, Cluster_index n_leaves, Cluster_index n_final) {
  if (n_final > n_leaves) {
    std::cerr << "The number of clusters required " << n_final << " is larger than the number of mini-clusters " << n_leaves << '\n';
    n_final = n_leaves; // or return something special and let Tomato use leaf_labels_?
  }
  py::buffer_info cbuf = children.request();
  if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0))
    throw std::runtime_error("internal error: children have to be (n,2) or empty");
  const int n_merges = cbuf.shape[0];
  Cluster_index* d = (Cluster_index*)cbuf.ptr;
  if (n_merges + n_final < n_leaves) {
    std::cerr << "The number of clusters required " << n_final << " is smaller than the number of connected components " << n_leaves - n_merges << '\n';
    n_final = n_leaves - n_merges;
  }
  struct Dat {
    Cluster_index parent;
    int rank;
    int name;
  };
  std::vector<Dat> ds_bas(2 * n_leaves - 1);
  auto ds_dat = boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
  auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
  auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
  boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
  Cluster_index i;
  for (i = 0; i < n_leaves; ++i) {
    ds.make_set(i);
    ds_bas[i].name = -1;
  }
  for (Cluster_index m = 0; m < n_leaves - n_final; ++m) {
    Cluster_index j = ds.find_set(d[2 * m]);
    Cluster_index k = ds.find_set(d[2 * m + 1]);
    assert(j != k);
    ds.make_set(i);
    ds.link(i, j);
    ds.link(i, k);
    ++i;
  }
  Cluster_index next_cluster_name = 0;
  std::vector<Cluster_index> ret;
  ret.reserve(n_leaves);
  for (Cluster_index j = 0; j < n_leaves; ++j) {
    Cluster_index k = ds.find_set(j);
    if (ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
    ret.push_back(ds_bas[k].name);
  }
  return py::array(ret.size(), ret.data());
}

// TODO: Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
// py::isinstance<py::array_t<std::int32_t>> (ou py::isinstance<py::array> et tester dtype) et flags&c_style
// ou overload (en virant forcecast?)
// aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ?
auto hierarchy(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
  // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
  // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to []
  // (I _think_ handle is ok and we don't need object here)
  py::buffer_info wbuf = density.request();
  if (wbuf.ndim != 1) throw std::runtime_error("density must be 1D");
  const int n = wbuf.shape[0];
  double* d = (double*)wbuf.ptr;
  // Vector { 0, 1, ..., n-1 }
  std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n));
  // Permutation of the indices to get points in decreasing order of density
  std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j) { return d[i] > d[j]; });
  // Inverse permutation
  std::vector<Point_index> rorder(n);
  for (Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
  // Used as:
  // order[i] is the index of the point with i-th largest density
  // rorder[i] is the rank of the i-th point in order of decreasing density
  // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the
  // output)
  return tomato(n, ngb, d, order, rorder);
}

PYBIND11_MODULE(_tomato, m) {
  m.doc() = "Internals of tomato clustering";
  m.def("hierarchy", &hierarchy, "does the clustering");
  m.def("merge", &merge, "merge clusters");
}