diff options
Diffstat (limited to 'src/python/gudhi/clustering')
-rw-r--r--  src/python/gudhi/clustering/__init__.py |   0
-rw-r--r--  src/python/gudhi/clustering/_tomato.cc  | 277
-rw-r--r--  src/python/gudhi/clustering/tomato.py   | 321
3 files changed, 598 insertions, 0 deletions
diff --git a/src/python/gudhi/clustering/__init__.py b/src/python/gudhi/clustering/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/python/gudhi/clustering/__init__.py diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc new file mode 100644 index 00000000..a76a2c3a --- /dev/null +++ b/src/python/gudhi/clustering/_tomato.cc @@ -0,0 +1,277 @@ +/* This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + * See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. + * Author(s): Marc Glisse + * + * Copyright (C) 2020 Inria + * + * Modification(s): + * - YYYY/MM Author: Description of the modification + */ + +#include <boost/container/flat_map.hpp> +#include <boost/pending/disjoint_sets.hpp> +#include <boost/property_map/property_map.hpp> +#include <boost/property_map/transform_value_property_map.hpp> +#include <boost/property_map/vector_property_map.hpp> +#include <boost/property_map/function_property_map.hpp> +#include <boost/iterator/counting_iterator.hpp> +#include <boost/range/irange.hpp> +#include <boost/range/adaptor/transformed.hpp> +#include <vector> +#include <unordered_map> +#include <pybind11/pybind11.h> +#include <pybind11/numpy.h> +#include <iostream> + +namespace py = pybind11; + +template <class T, class = std::enable_if_t<std::is_integral<T>::value>> +int getint(int i) { + return i; +} +// Gcc-8 has a bug that breaks this version, fixed in gcc-9 +// template<class T, class=decltype(std::declval<T>().template cast<int>())> +// int getint(T i){return i.template cast<int>();} +template <class T> +auto getint(T i) -> decltype(i.template cast<int>()) { + return i.template cast<int>(); +} + +// Raw clusters are clusters obtained through single-linkage, no merging. 
+ +typedef int Point_index; +typedef int Cluster_index; +struct Merge { + Cluster_index first, second; + double persist; +}; + +template <class Neighbors, class Density, class Order, class ROrder> +auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order, + ROrder const& rorder) { + // point index --> index of raw cluster it belongs to + std::vector<Cluster_index> raw_cluster; + raw_cluster.reserve(num_points); + // cluster index --> index of top point in the cluster + Cluster_index n_raw_clusters = 0; // current number of raw clusters seen + // + std::vector<Merge> merges; + struct Data { + Cluster_index parent; + int rank; + Point_index max; + }; // information on a cluster + std::vector<Data> ds_base; + // boost::vector_property_map does resize(size+1) for every new element, don't use it + auto ds_data = + boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n) -> Data& { return ds_base[n]; }); + auto ds_parent = + boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_data); + auto ds_rank = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_data); + boost::disjoint_sets<decltype(ds_rank), decltype(ds_parent)> ds( + ds_rank, ds_parent); // on the clusters, not directly the points + std::vector<std::array<double, 2>> persistence; // diagram (finite points) + boost::container::flat_map<Cluster_index, Cluster_index> + adj_clusters; // first: the merged cluster, second: the raw cluster + // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only + // insert in the vector if merged is absent from the set + + for (Point_index i = 0; i < num_points; ++i) { + // auto&& ngb = neighbors[order[i]]; + // TODO: have a specialization where we don't need python types and py::cast + // TODO: move py::cast and getint into Neighbors + py::object ngb = neighbors[py::cast(order[i])]; // 
auto&& also seems to work + adj_clusters.clear(); + Point_index j = i; // highest neighbor + // for(Point_index k : ngb) + for (auto k_py : ngb) { + Point_index k = rorder[getint(k_py)]; + if (k >= i || k < 0) // ??? + continue; + if (k < j) j = k; + Cluster_index rk = raw_cluster[k]; + adj_clusters.emplace(ds.find_set(rk), rk); + // does not insert if ck=ds.find_set(rk) already seen + // which rk we keep from those with the same ck is arbitrary + } + assert((Point_index)raw_cluster.size() == i); + if (i == j) { // local maximum -> new cluster + Cluster_index c = n_raw_clusters++; + ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access + ds.make_set(c); + ds_base[c].max = i; // max + raw_cluster.push_back(c); + continue; + } else { // add i to the cluster of j + assert(j < i); + raw_cluster.push_back(raw_cluster[j]); + // FIXME: we are adding point i to the raw cluster of j, but that one might not be in adj_clusters, so we may + // merge clusters A and B through a point of C. It is strange, but I don't know if it can really cause problems. + // We could just not set j at all and use arbitrarily the first element of adj_clusters. 
+ } + // possibly merge clusters + // we could sort, in case there are several merges, but that doesn't seem so useful + Cluster_index rj = raw_cluster[j]; + Cluster_index cj = ds.find_set(rj); + Cluster_index orig_cj = cj; + for (auto ckk : adj_clusters) { + Cluster_index rk = ckk.second; + Cluster_index ck = ckk.first; + if (ck == orig_cj) continue; + assert(ck == ds.find_set(rk)); + Point_index j = ds_base[cj].max; + Point_index k = ds_base[ck].max; + Point_index young = std::max(j, k); + Point_index old = std::min(j, k); + auto d_young = density[order[young]]; + auto d_i = density[order[i]]; + assert(d_young >= d_i); + // Always merge (the non-hierarchical algorithm would only conditionally merge here + persistence.push_back({d_young, d_i}); + assert(ds.find_set(rj) != ds.find_set(rk)); + ds.link(cj, ck); + cj = ds.find_set(cj); + ds_base[cj].max = old; // just one parent, no need for find_set + // record the raw clusters, we don't know what will have already been merged. + merges.push_back({rj, rk, d_young - d_i}); + } + } + { + boost::counting_iterator<int> b(0), e(ds_base.size()); + ds.compress_sets(b, e); + // Now we stop using find_sets and look at the parent directly + // rank is reused to rename clusters contiguously 0, 1, etc + } + // Maximum for each connected component + std::vector<double> max_cc; + for (Cluster_index i = 0; i < n_raw_clusters; ++i) { + if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]); + } + assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters); + + // TODO: create a "noise" cluster, merging all those not prominent enough? 
+ + // Replay the merges, in increasing order of prominence, to build the hierarchy + std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; }); + std::vector<std::array<Cluster_index, 2>> children; + children.reserve(merges.size()); + { + struct Dat { + Cluster_index parent; + int rank; + Cluster_index name; + }; + std::vector<Dat> ds_bas(2 * n_raw_clusters - 1); + Cluster_index i; + auto ds_dat = + boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; }); + auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat); + auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat); + boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par); + for (i = 0; i < n_raw_clusters; ++i) { + ds.make_set(i); + ds_bas[i].name = i; + } + for (Merge const& m : merges) { + Cluster_index j = ds.find_set(m.first); + Cluster_index k = ds.find_set(m.second); + assert(j != k); + children.push_back({ds_bas[j].name, ds_bas[k].name}); + ds.make_set(i); + ds.link(i, j); + ds.link(i, k); + ds_bas[ds.find_set(i)].name = i; + ++i; + } + } + + std::vector<Cluster_index> raw_cluster_ordered(num_points); + for (int i = 0; i < num_points; ++i) raw_cluster_ordered[i] = raw_cluster[rorder[i]]; + // return raw_cluster, children, persistence + // TODO avoid copies: https://github.com/pybind/pybind11/issues/1042 + return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()), + py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()), + py::array(max_cc.size(), max_cc.data())); +} + +auto merge(py::array_t<Cluster_index, py::array::c_style> children, Cluster_index n_leaves, Cluster_index n_final) { + if (n_final > n_leaves) { + std::cerr << "The number of clusters required " << n_final << " is larger than the number of 
mini-clusters " << n_leaves << '\n'; + n_final = n_leaves; // or return something special and let Tomato use leaf_labels_? + } + py::buffer_info cbuf = children.request(); + if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0)) + throw std::runtime_error("internal error: children have to be (n,2) or empty"); + const int n_merges = cbuf.shape[0]; + Cluster_index* d = (Cluster_index*)cbuf.ptr; + if (n_merges + n_final < n_leaves) { + std::cerr << "The number of clusters required " << n_final << " is smaller than the number of connected components " << n_leaves - n_merges << '\n'; + n_final = n_leaves - n_merges; + } + struct Dat { + Cluster_index parent; + int rank; + int name; + }; + std::vector<Dat> ds_bas(2 * n_leaves - 1); + auto ds_dat = boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; }); + auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat); + auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat); + boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par); + Cluster_index i; + for (i = 0; i < n_leaves; ++i) { + ds.make_set(i); + ds_bas[i].name = -1; + } + for (Cluster_index m = 0; m < n_leaves - n_final; ++m) { + Cluster_index j = ds.find_set(d[2 * m]); + Cluster_index k = ds.find_set(d[2 * m + 1]); + assert(j != k); + ds.make_set(i); + ds.link(i, j); + ds.link(i, k); + ++i; + } + Cluster_index next_cluster_name = 0; + std::vector<Cluster_index> ret; + ret.reserve(n_leaves); + for (Cluster_index j = 0; j < n_leaves; ++j) { + Cluster_index k = ds.find_set(j); + if (ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++; + ret.push_back(ds_bas[k].name); + } + return py::array(ret.size(), ret.data()); +} + +// TODO: Do a special version when ngb is a numpy array, where we can cast to int[k][n] ? 
+// py::isinstance<py::array_t<std::int32_t>> (ou py::isinstance<py::array> et tester dtype) et flags&c_style +// ou overload (en virant forcecast?) +// aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ? +auto hierarchy(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) { + // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted + // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to [] + // (I _think_ handle is ok and we don't need object here) + py::buffer_info wbuf = density.request(); + if (wbuf.ndim != 1) throw std::runtime_error("density must be 1D"); + const int n = wbuf.shape[0]; + double* d = (double*)wbuf.ptr; + // Vector { 0, 1, ..., n-1 } + std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n)); + // Permutation of the indices to get points in decreasing order of density + std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j) { return d[i] > d[j]; }); + // Inverse permutation + std::vector<Point_index> rorder(n); + for (Point_index i : boost::irange(0, n)) rorder[order[i]] = i; + // Used as: + // order[i] is the index of the point with i-th largest density + // rorder[i] is the rank of the i-th point in order of decreasing density + // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the + // output) + return tomato(n, ngb, d, order, rorder); +} + +PYBIND11_MODULE(_tomato, m) { + m.doc() = "Internals of tomato clustering"; + m.def("hierarchy", &hierarchy, "does the clustering"); + m.def("merge", &merge, "merge clusters"); +} diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py new file mode 100644 index 00000000..fbba3cc8 --- /dev/null +++ b/src/python/gudhi/clustering/tomato.py @@ -0,0 +1,321 @@ +# 
# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
# Author(s): Marc Glisse
#
# Copyright (C) 2020 Inria
#
# Modification(s):
#   - YYYY/MM Author: Description of the modification

import numpy
from ..point_cloud.knn import KNearestNeighbors
from ..point_cloud.dtm import DTMDensity
from ._tomato import *

# The fit/predict interface is not so well suited...


class Tomato:
    """
    This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point.
    A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural
    to provide your own.

    :Requires: `SciPy <installation.html#scipy>`_, `Scikit-learn <installation.html#scikit-learn>`_ or others
        (see :class:`~gudhi.point_cloud.knn.KNearestNeighbors`) in function of the options.

    Attributes
    ----------
    n_clusters_: int
        The number of clusters. Writing to it automatically adjusts `labels_`.
    merge_threshold_: float
        minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts `labels_`.
    n_leaves_: int
        number of leaves (unstable clusters) in the hierarchical tree
    leaf_labels_: ndarray of shape (n_samples,)
        cluster labels for each point, at the very bottom of the hierarchy
    labels_: ndarray of shape (n_samples,)
        cluster labels for each point, after merging
    diagram_: ndarray of shape (`n_leaves_`, 2)
        persistence diagram (only the finite points)
    max_weight_per_cc_: ndarray of shape (n_connected_components,)
        maximum of the density function on each connected component. This corresponds to the abscissa of infinite
        points in the diagram
    children_: ndarray of shape (`n_leaves_`-n_connected_components, 2)
        The children of each non-leaf node. Values less than `n_leaves_` correspond to leaves of the tree.
        A node i greater than or equal to `n_leaves_` is a non-leaf node and has children children_[i - `n_leaves_`].
        Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_leaves_` + i
    weights_: ndarray of shape (n_samples,)
        weights of the points, as computed by the density estimator or provided by the user
    params_: dict
        Parameters like metric, etc
    """

    def __init__(
        self,
        graph_type="knn",
        density_type="logDTM",
        n_clusters=None,
        merge_threshold=None,
        # eliminate_threshold=None,
        #   eliminate_threshold (float): minimum max weight of a cluster so it doesn't get eliminated
        **params
    ):
        """
        Args:
            graph_type (str): 'manual', 'knn' or 'radius'. Default is 'knn'.
            density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. When you have many points,
                'KDE' and 'logKDE' tend to be slower. Default is 'logDTM'.
            metric (str|Callable): metric used when calculating the distance between instances in a feature array.
                Defaults to Minkowski of parameter p.
            kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to
                sklearn.neighbors.KernelDensity.
            k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10.
            k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself).
                Defaults to k.
            r (float): size of a neighborhood if graph_type is 'radius'. Also used as default bandwidth in kde_params.
            eps (float): (1+eps) approximation factor when computing distances (ignored in many cases).
            n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get
                the maximal number of clusters.
            merge_threshold (float): minimum prominence of a cluster so it doesn't get merged.
            symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric.
                This can be useful with k-NN for small k. Defaults to false.
            p (float): norm L^p on input points. Defaults to 2.
            q (float): order used to compute the distance to measure. Defaults to dim.
                Beware that when the dimension is large, this can easily cause overflows.
            dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the
                dimension, or 2 when the dimension cannot be read from the input (metric is "precomputed").
            n_jobs (int): Number of jobs to schedule for parallel processing on the CPU.
                If -1 is given all processors are used. Default: 1.
            params: extra parameters are passed to :class:`~gudhi.point_cloud.knn.KNearestNeighbors` and
                :class:`~gudhi.point_cloud.dtm.DTMDensity`.
        """
        # Should metric='precomputed' mean input_type='distance_matrix'?
        # Should we be able to pass metric='minkowski' (what None does currently)?
        self.graph_type_ = graph_type
        self.density_type_ = density_type
        self.params_ = params
        self.__n_clusters = n_clusters
        self.__merge_threshold = merge_threshold
        # self.eliminate_threshold_ = eliminate_threshold
        if n_clusters and merge_threshold:
            raise ValueError("Cannot specify both a merge threshold and a number of clusters")

    def fit(self, X, y=None, weights=None):
        """
        Args:
            X ((n,d)-array of float|(n,n)-array of float|Sequence[Iterable[int]]): coordinates of the points,
                or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors
                for each point (points are represented by their index, starting from 0) if graph_type is "manual".
                The number of points is currently limited to about 2 billion.
            weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point
            y: Not used, present here for API consistency with scikit-learn by convention.
        """
        # TODO: First detect if this is a new call with the same data (only threshold changed?)
        # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not
        # needed. Clear error message for missing or contradictory parameters.
        # `density_type` is the *effective* density mode for this call: explicitly
        # provided weights always take precedence over the configured estimator.
        if weights is not None:
            density_type = "manual"
        else:
            density_type = self.density_type_
            if density_type == "manual":
                raise ValueError("If density_type is 'manual', you must provide weights to fit()")

        if self.graph_type_ == "manual":
            self.neighbors_ = X
            # FIXME: uniformize "message 'option'" vs 'message "option"'
            assert density_type == "manual", 'If graph_type is "manual", density_type must be as well'
        else:
            metric = self.params_.get("metric", "minkowski")
            if metric != "precomputed":
                self.points_ = X

        # Slight complication to avoid computing knn twice.
        need_knn = 0
        need_knn_ngb = False
        need_knn_dist = False
        if self.graph_type_ == "knn":
            k_graph = self.params_.get("k", 10)
            # If X has fewer than k points...
            if k_graph > len(X):
                k_graph = len(X)
            need_knn = k_graph
            need_knn_ngb = True
        # NOTE: use the effective density_type, not self.density_type_, so that
        # user-supplied weights are not silently recomputed and overwritten.
        if density_type in ["DTM", "logDTM"]:
            k = self.params_.get("k", 10)
            k_DTM = self.params_.get("k_DTM", k)
            # If X has fewer than k points...
            if k_DTM > len(X):
                k_DTM = len(X)
            need_knn = max(need_knn, k_DTM)
            need_knn_dist = True
            # if we ask for more neighbors for the graph than the DTM, getting the distances is a slight waste,
            # but it looks negligible
        if need_knn > 0:
            knn_args = dict(self.params_)
            knn_args["k"] = need_knn
            knn = KNearestNeighbors(return_index=need_knn_ngb, return_distance=need_knn_dist, **knn_args).fit_transform(
                X
            )
            if need_knn_ngb:
                if need_knn_dist:
                    self.neighbors_ = knn[0][:, 0:k_graph]
                    knn_dist = knn[1]
                else:
                    self.neighbors_ = knn
            elif need_knn_dist:
                knn_dist = knn
        if density_type in ["DTM", "logDTM"]:
            dim = self.params_.get("dim")
            if dim is None:
                dim = len(X[0]) if metric != "precomputed" else 2
            q = self.params_.get("q", dim)
            weights = DTMDensity(k=k_DTM, metric="neighbors", dim=dim, q=q).fit_transform(knn_dist)
            if density_type == "logDTM":
                weights = numpy.log(weights)

        if self.graph_type_ == "radius":
            if metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]:
                from scipy.spatial import cKDTree

                tree = cKDTree(X)
                # TODO: handle "l1" and "l2" aliases?
                p = self.params_.get("p")
                if metric == "euclidean":
                    assert p is None or p == 2, "p=" + str(p) + " is not consistent with metric='euclidean'"
                    p = 2
                elif metric == "manhattan":
                    assert p is None or p == 1, "p=" + str(p) + " is not consistent with metric='manhattan'"
                    p = 1
                elif metric == "chebyshev":
                    assert p is None or p == numpy.inf, "p=" + str(p) + " is not consistent with metric='chebyshev'"
                    p = numpy.inf
                elif p is None:
                    p = 2  # the default
                eps = self.params_.get("eps", 0)
                self.neighbors_ = tree.query_ball_tree(tree, r=self.params_["r"], p=p, eps=eps)

            # TODO: sklearn's NearestNeighbors.radius_neighbors can handle more metrics efficiently via its BallTree
            # (don't bother with the _graph variant, it just calls radius_neighbors).
            elif metric != "precomputed":
                from sklearn.metrics import pairwise_distances

                X = pairwise_distances(X, metric=metric, n_jobs=self.params_.get("n_jobs"))
                metric = "precomputed"

            if metric == "precomputed":
                # TODO: parallelize? May not be worth it.
                X = numpy.asarray(X)
                r = self.params_["r"]
                self.neighbors_ = [numpy.flatnonzero(row <= r) for row in X]

        if density_type in {"KDE", "logKDE"}:
            # Slow...
            assert (
                self.graph_type_ != "manual" and metric != "precomputed"
            ), "Scikit-learn's KernelDensity requires point coordinates"
            kde_params = dict(self.params_.get("kde_params", dict()))
            kde_params.setdefault("metric", metric)
            r = self.params_.get("r")
            if r is not None:
                kde_params.setdefault("bandwidth", r)
            # Should we default rtol to eps?
            from sklearn.neighbors import KernelDensity

            weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_)
            if density_type == "KDE":
                weights = numpy.exp(weights)

        # TODO: do it at the C++ level and/or in parallel if this is too slow?
        if self.params_.get("symmetrize_graph"):
            self.neighbors_ = [set(line) for line in self.neighbors_]
            for i, line in enumerate(self.neighbors_):
                line.discard(i)
                for j in line:
                    self.neighbors_[j].add(i)

        self.weights_ = weights
        # This is where the main computation happens
        self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = hierarchy(self.neighbors_, weights)
        self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_)
        assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_)
        # TODO: deduplicate this code with the setters below
        if self.__merge_threshold:
            assert not self.__n_clusters
            self.__n_clusters = numpy.count_nonzero(
                self.diagram_[:, 0] - self.diagram_[:, 1] > self.__merge_threshold
            ) + len(self.max_weight_per_cc_)
        if self.__n_clusters:
            # TODO: set corresponding merge_threshold?
            renaming = merge(self.children_, self.n_leaves_, self.__n_clusters)
            self.labels_ = renaming[self.leaf_labels_]
            # In case the user asked for something impossible.
            # TODO: check for impossible situations before calling merge.
            self.__n_clusters = self.labels_.max() + 1
        else:
            self.labels_ = self.leaf_labels_
            self.__n_clusters = self.n_leaves_
        return self

    def fit_predict(self, X, y=None, weights=None):
        """
        Equivalent to fit(), and returns the `labels_`.
        """
        return self.fit(X, y, weights).labels_

    # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the
    # corresponding k?
    def plot_diagram(self):
        """
        Plot the persistence diagram of the hierarchy: one point per merge, plus the
        maxima of the connected components shown below the diagonal.
        """
        import matplotlib.pyplot as plt

        l = self.max_weight_per_cc_.min()
        r = self.max_weight_per_cc_.max()
        if self.diagram_.size > 0:
            plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro")
            l = min(l, self.diagram_[:, 1].min())
            r = max(r, self.diagram_[:, 0].max())
        # Degenerate range (single value): widen it so the diagonal is visible.
        if l == r:
            if l > 0:
                l, r = 0.9 * l, 1.1 * r
            elif l < 0:
                l, r = 1.1 * l, 0.9 * r
            else:
                l, r = -1.0, 1.0
        plt.plot([l, r], [l, r])
        # Use a bare "o" marker: the original "ro" format string hard-coded red and
        # conflicted with color="green" (matplotlib warns and one spec is ignored).
        plt.plot(
            self.max_weight_per_cc_, numpy.full(self.max_weight_per_cc_.shape, 1.1 * l - 0.1 * r), "o", color="green"
        )
        plt.show()

    # Use set_params instead?
+ @property + def n_clusters_(self): + return self.__n_clusters + + @n_clusters_.setter + def n_clusters_(self, n_clusters): + if n_clusters == self.__n_clusters: + return + self.__n_clusters = n_clusters + self.__merge_threshold = None + if hasattr(self, "leaf_labels_"): + renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) + self.labels_ = renaming[self.leaf_labels_] + # In case the user asked for something impossible + self.__n_clusters = self.labels_.max() + 1 + + @property + def merge_threshold_(self): + return self.__merge_threshold + + @merge_threshold_.setter + def merge_threshold_(self, merge_threshold): + if merge_threshold == self.__merge_threshold: + return + if hasattr(self, "leaf_labels_"): + self.n_clusters_ = numpy.count_nonzero(self.diagram_[:, 0] - self.diagram_[:, 1] > merge_threshold) + len( + self.max_weight_per_cc_ + ) + else: + self.__n_clusters = None + self.__merge_threshold = merge_threshold |