From 07c7e5841c961869b927875bbca91d10287f9fab Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Wed, 26 Feb 2020 22:03:18 +0100 Subject: Initial commit of tomato --- src/python/gudhi/clustering/__init__.py | 0 src/python/gudhi/clustering/_tomato.cc | 271 ++++++++++++++++++++++++++ src/python/gudhi/clustering/tomato.py | 327 ++++++++++++++++++++++++++++++++ 3 files changed, 598 insertions(+) create mode 100644 src/python/gudhi/clustering/__init__.py create mode 100644 src/python/gudhi/clustering/_tomato.cc create mode 100644 src/python/gudhi/clustering/tomato.py (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/__init__.py b/src/python/gudhi/clustering/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc new file mode 100644 index 00000000..1c896174 --- /dev/null +++ b/src/python/gudhi/clustering/_tomato.cc @@ -0,0 +1,271 @@ +// g++ -O3 -Wall -shared -fPIC `python3 -m pybind11 --includes` XXX.cpp -o tomato`python3-config --extension-suffix` +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py=pybind11; + +template::value>> +int getint(int i){return i;} +template().template cast())> +int getint(T i){return i.template cast();} + +// Raw clusters are clusters obtained through single-linkage, no merging. + +typedef int Point_index; +typedef int Cluster_index; +struct Merge { Cluster_index first, second; double persist; }; + +template +auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& density, Order const& order, ROrder const& rorder){ + // point index --> index of raw cluster it belongs to + std::vector raw_cluster; raw_cluster.reserve(num_points); + // cluster index --> index of top point in the cluster + Cluster_index n_raw_clusters = 0; // current number of raw clusters seen + // + std::vector merges; + struct Data { Cluster_index parent; int rank; Point_index max; }; // information on a cluster + std::vector ds_base; + // boost::vector_property_map does resize(size+1) for every new element, don't use it + auto ds_data=boost::make_function_property_map([&ds_base](std::size_t n)->Data&{return ds_base[n];}); + auto ds_parent=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_data); + auto ds_rank=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_data); + boost::disjoint_sets ds(ds_rank,ds_parent); // on the clusters, not directly the points + std::vector> persistence; // diagram (finite points) + boost::container::flat_map adj_clusters; // first: the merged cluster, second: the raw cluster + // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only insert in the vector if merged is absent from the set + + for(Point_index i = 0; i < num_points; ++i){ + // auto&& ngb = neighbors[order[i]]; + // TODO: have a specialization where we don't need python types and py::cast + // TODO: move py::cast and getint into Neighbors + py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work + adj_clusters.clear(); + Point_index j = i; // highest neighbor + //for(Point_index k : ngb) + for(auto k_py : ngb) + { + Point_index k = rorder[getint(k_py)]; + if(k >= i || k < 0) // ??? + continue; + if(k < j) + j = k; + Cluster_index rk=raw_cluster[k]; + adj_clusters.emplace(ds.find_set(rk),rk); + // does not insert if ck=ds.find_set(rk) already seen + // which rk we keep from those with the same ck is arbitrary + } + assert((Point_index)raw_cluster.size()==i); + if(i==j){ // local maximum -> new cluster + Cluster_index c = n_raw_clusters++; + ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access + ds.make_set(c); + ds_base[c].max=i; // max + raw_cluster.push_back(c); + continue; + }else{ // add i to the cluster of j + assert(j= d_i); + // Always merge (the non-hierarchical algorithm would only conditionally merge here + persistence.push_back({d_young, d_i}); + assert(ds.find_set(rj) != ds.find_set(rk)); + ds.link(cj,ck); + cj = ds.find_set(cj); + ds_base[cj].max=old; // just one parent, no need for find_set + // record the raw clusters, we don't know what will have already been merged. + merges.push_back({rj, rk, d_young-d_i}); + } + } + { + boost::counting_iterator b(0), e(ds_base.size()); + ds.compress_sets(b, e); + // Now we stop using find_sets and look at the parent directly + // rank is reused to rename clusters contiguously 0, 1, etc + } + // Maximum for each connected component + std::vector max_cc; + //for(Point_index i = 0; i < num_points; ++i){ + // if(ds_base[ds_base[raw_cluster[i]].parent].max == i) + // max_cc.push_back(density[order[i]]); + //} + for(Cluster_index i = 0; i < n_raw_clusters; ++i){ + if(ds_base[i].parent == i) + max_cc.push_back(density[order[ds_base[i].max]]); + } + assert((Cluster_index)(merges.size()+max_cc.size())==n_raw_clusters); + + // TODO: create a "noise" cluster, merging all those not prominent enough? + //int nb_clusters=0; + //for(int i=0;i<(int)ds_base.size();++i){ + // if(ds_parent[i]!=i) continue; + // ds_data[i].rank=nb_clusters++; + //} + + // Replay the merges, in increasing order of prominence, to build the hierarchy + std::sort(merges.begin(), merges.end(), [](Merge const&a, Merge const&b){return a.persist < b.persist;}); + std::vector> children; children.reserve(merges.size()); + { + struct Dat { Cluster_index parent; int rank; Cluster_index name; }; + std::vector ds_bas(2*n_raw_clusters-1); + Cluster_index i; + auto ds_dat=boost::make_function_property_map([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];}); + auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat); + auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat); + boost::disjoint_sets ds(ds_ran,ds_par); + for(i=0;i raw_cluster_ordered(num_points); + for(int i=0; i children, Cluster_index n_leaves, Cluster_index n_final){ + // Should this really be an error? + if(n_final > n_leaves) + throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters"); + py::buffer_info cbuf = children.request(); + if((cbuf.ndim!=2 || cbuf.shape[1]!=2) && (cbuf.ndim!=1 || cbuf.shape[0]!=0)) + throw std::runtime_error("internal error: children have to be (n,2) or empty"); + const int n_merges=cbuf.shape[0]; + Cluster_index*d=(Cluster_index*)cbuf.ptr; + // Should this really be an error? + //std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n'; + if(n_merges + n_final < n_leaves) + throw std::runtime_error(std::string("The number of clusters required ")+std::to_string(n_final)+" is smaller than the number of connected components "+std::to_string(n_leaves-n_merges)); + + struct Dat { Cluster_index parent; int rank; int name; }; + std::vector ds_bas(2*n_leaves-1); + auto ds_dat=boost::make_function_property_map([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];}); + auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat); + auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat); + boost::disjoint_sets ds(ds_ran,ds_par); + Cluster_index i; + for(i=0;i ret; ret.reserve(n_leaves); + for(Cluster_index j=0;j density){ + // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted + // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to [] + // (I _think_ handle is ok and we don't need object here) + py::buffer_info wbuf = density.request(); + if(wbuf.ndim!=1) + throw std::runtime_error("density must be 1D"); + const int n=wbuf.shape[0]; + double*d=(double*)wbuf.ptr; + // Vector { 0, 1, ..., n-1 } + std::vector order(boost::counting_iterator(0), boost::counting_iterator(n)); + // Permutation of the indices to get points in decreasing order of density + std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j){ return d[i] > d[j]; }); + // Inverse permutation + std::vector rorder(n); + for(Point_index i : boost::irange(0, n)) rorder[order[i]] = i; + // Used as: + // order[i] is the index of the point with i-th largest density + // rorder[i] is the rank of the i-th point in order of decreasing density + // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the output) + return tomato(n, ngb, d, order, rorder); +} +#if 0 +struct Vec { + int const* base; + int k; + int const* begin()const{return base;} + int const* end()const{return base+k;} + Vec operator*()const{return *this;} +}; +auto plaf(py::array_t ngb, py::array_t density, double threshold){ + py::buffer_info nbuf = ngb.request(); + if(nbuf.ndim!=2) + throw std::runtime_error("neighbors must be 2D"); + const int n=nbuf.shape[0]; + const int k=nbuf.shape[1]; + int*nptr=(int*)nbuf.ptr; + auto neighbors=boost::adaptors::transform(boost::irange(0,n),[=](int i){return Vec{nptr+i*k,k};}); + py::buffer_info wbuf = density.request(); + if(wbuf.ndim!=1) + throw std::runtime_error("density must be 1D"); + if(n!=wbuf.shape[0]) + throw std::runtime_error("incompatible sizes"); + double*d=(double*)wbuf.ptr; + return tomato(n,neighbors,d); +} +#endif +PYBIND11_MODULE(_tomato, m) { + m.doc() = "Internals of tomato clustering"; + m.def("doit", &plouf, "does the clustering"); + //m.def("doit2", &plaf, "does the clustering faster"); + m.def("merge", &merge, "merge clusters"); +} + +// https://github.com/pybind/pybind11/issues/1042 pour convertir vector en numpy array +// +// py::isinstance> (ou py::isinstance et tester dtype) et flags&c_style +// ou overload (en virant forcecast?) +// aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ? diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py new file mode 100644 index 00000000..e4430dbd --- /dev/null +++ b/src/python/gudhi/clustering/tomato.py @@ -0,0 +1,327 @@ +import numpy +from ._tomato import * + +# The fit/predict interface is not so well suited... +class Tomato: + """ + Clustering + + This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point. A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural to provide your own. In particular, we do not provide anything specific to cluster pixels on images yet. + + Attributes + ---------- + n_clusters_: int + The number of clusters. Writing to it automatically adjusts labels_. + merge_threshold_: float + minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts labels_. + n_leaves_: int + number of leaves in the hierarchical tree + leaf_labels_: ndarray of shape (n_samples) + cluster labels for each point, at the very bottom of the hierarchy + labels_: ndarray of shape (n_samples) + cluster labels for each point + diagram_: ndarray of shape (n_leaves_,2) + persistence diagram (only the finite points) + max_density_per_cc_: ndarray os shape (n_connected_components) + maximum of the density function on each connected component + children_: ndarray of shape (n_leaves_-1,2) + The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i + params_: dict + Parameters like input_type, metric, etc + """ + def __init__(self, input_type='points', metric=None, graph_type=None, density_type='manual', n_clusters=None, merge_threshold=None, eliminate_threshold=None, **params): + """ + Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later. + + Parameters + ---------- + input_type(str): 'points', 'distance_matrix' or 'neighbors'. + metric(str or callable): FIXME ??? + graph_type(str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'. + density_type(str): 'manual', 'DTM', 'logDTM' or 'kde'. + kde_params(dict): if density_type is 'kde', additional parameters passed directly to sklearn.neighbors.KernelDensity. + k(int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. + k_DTM(int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. + r(float): size of a neighborhood if graph_type is 'radius' + eps(float): approximation factor when computing nearest neighbors without a GPU + gpu(bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million). + n_clusters(int): number of clusters requested. Defaults to ??? + merge_threshold(float): minimum prominence of a cluster so it doesn't get merged. + eliminate_threshold(float): minimum height of a cluster so it doesn't get eliminated + symmetrize_graph(bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. + p(float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. + p_DTM(float): order used to compute the distance to measure. Defaults to 2. + n_jobs(int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. + """ + self.input_type_ = input_type + self.metric_ = metric + self.graph_type_ = graph_type + self.density_type_ = density_type + self.params_ = params + self.__n_clusters = n_clusters + self.__merge_threshold = merge_threshold + self.eliminate_threshold_ = eliminate_threshold + if n_clusters and merge_threshold: + raise ValueError("Cannot specify both a merge threshold and a number of clusters") + + def fit(self, X, y=None, weights=None): + """ + Parameters + ---------- + X(?): points or distance_matrix or list of neighbors + weights(ndarray of shape (n_samples)): if density_type == 'manual', a density estimate at each point + """ + # TODO: First detect if this is a new call with the same data (only threshold changed?) + # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters. + if weights: + density_type = 'manual' + else: + density_type = self.density_type_ + assert density_type != 'manual' + if density_type == 'manual': + raise ValueError("If density_type is 'manual', you must provide weights to fit()") + + if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'radius': + X = numpy.array(X) + r = self.params_['r'] + self.neighbors_ = [numpy.nonzero(l <= r) for l in X] + + if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'knn': + k = self.params_['k'] + self.neighbors_ = numpy.argpartition(X, k-1)[:,0:k] + + if self.input_type_ == 'neighbors': + self.neighbors_ = X + assert density_type == 'manual' + + if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ in {'DTM', 'logDTM'}: + self.points_ = X + q = self.params_.get('p_DTM', 2) + p = self.params_.get('p', 2) + k = self.params_.get('k', 10) + k_DTM = self.params_.get('k_DTM', k) + kmax = max(k, k_DTM) + if self.params_.get('gpu'): + import torch + from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? + XX = torch.tensor(self.points_, dtype=torch.float32) + if p == numpy.inf: + # Requires a version of pykeops strictly more recent than 1.3 + dd, nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin_argKmin(kmax, dim=1) + elif p == 2: # Any even integer? + dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin_argKmin(kmax, dim=1) + else: + dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin_argKmin(kmax, dim=1) + if k < kmax: + nn = nn[:, 0:k] + if k_DTM < kmax: + dd = dd[:, 0:k_DTM] + assert q != numpy.inf # for now + if p != numpy.inf: + qp = float(q) / p + else: + qp = q + if qp != 1: + dd = dd**qp + weights = dd.sum(-1) + # **1/q is a waste of time, whether we take another **-.25 or a log + + # Back to the CPU. Not sure this is necessary, or the right way to do it. + weights = numpy.array(weights) + self.neighbors_ = numpy.array(nn) + else: # CPU + from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) + qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + dd, self.neighbors_ = kdtree.query(self.points_, k=kmax, p=p, **qargs) + if k < kmax: + self.neighbors_ = self.neighbors_[:, 0:k] + if k_DTM < kmax: + dd = dd[:, 0:k_DTM] + #weights = numpy.linalg.norm(dd, axis=1, ord=q) + weights = (dd**q).sum(-1) + + # TODO: check the formula in Fred's paper + if self.density_type_ == 'DTM': + weights = weights ** (-.25 / q) + else: + weights = -numpy.log(weights) + + if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ not in {'DTM', 'logDTM'}: + self.points_ = X + p = self.params_.get('p', 2) + k = self.params_.get('k', 10) + if self.params_.get('gpu'): + import torch + from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? + XX = torch.tensor(self.points_, dtype=torch.float32) + if p == numpy.inf: + # Requires a version of pykeops strictly more recent than 1.3 + nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).argKmin(k, dim=1) + elif p == 2: # Any even integer? + nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).argKmin(k, dim=1) + else: + nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).argKmin(k, dim=1) + # Back to the CPU. Not sure this is necessary, or the right way to do it. + self.neighbors_ = numpy.array(nn) + else: # CPU + from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) + # FIXME 'p' + qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs) + + if self.input_type_ == 'points' and self.graph_type_ != 'knn' and self.density_type_ in {'DTM', 'logDTM'}: + self.points_ = X + q = self.params_.get('p_DTM', 2) + p = self.params_.get('p', 2) + k = self.params_.get('k', 10) + k_DTM = self.params_.get('k_DTM', k) + if self.params_.get('gpu'): + import torch + from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? + XX = torch.tensor(self.points_, dtype=torch.float32) + if p == numpy.inf: + assert False # Not supported??? + dd = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin(k_DTM, dim=1) + elif p == 2: # Any even integer? + dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin(k_DTM, dim=1) + else: + dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin(k_DTM, dim=1) + assert q != numpy.inf # for now + if p != numpy.inf: + qp = float(q) / p + else: + qp = q + if qp != 1: + dd = dd**qp + weights = dd.sum(-1) + # **1/q is a waste of time, whether we take another **-.25 or a log + + # Back to the CPU. Not sure this is necessary, or the right way to do it. + weights = numpy.array(weights) + else: # CPU + from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) + qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + dd, _ = kdtree.query(self.points_, k=k_DTM, p=p, **qargs) + #weights = numpy.linalg.norm(dd, axis=1, ord=q) + weights = (dd**q).sum(-1) + + # TODO: check the formula in Fred's paper + if self.density_type_ == 'DTM': + weights = weights ** (-.25 / q) + else: + weights = -numpy.log(weights) + + if self.input_type_ == 'distance_matrix' and self.density_type_ in {'DTM', 'logDTM'}: + q = self.params_.get('p_DTM', 2) + X = numpy.array(X) + k = self.params_.get('k_DTM') + if not k: + k = self.params_['k'] + q = self.params_.get('p_DTM', 2) + weights = (numpy.partition(X)**q, k-1).sum(-1) + # TODO: check the formula in Fred's paper + if self.density_type_ == 'DTM': + weights = weights ** (-.25 / q) + else: + weights = -numpy.log(weights) + + if self.density_type_ == 'kde': + # FIXME: replace most assert with raise ValueError("blabla") + assert self.input_type_ == 'points' + kde_params = self.params_.get('kde_params', dict()) + from sklearn.neighbors import KernelDensity + weights = KernelDensity(**kde_params).fit(X).score_samples(X) + + if self.params_.get('symmetrize_graph'): + self.neighbors_ = [set(line) for line in self.neighbors_] + for i,line in enumerate(self.neighbors_): + line.discard(i) + for j in line: + self.neighbors_[j].add(i) + + self.weights_=weights # TODO remove + self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit(list(self.neighbors_), weights) + self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_) + assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_) + if self.__n_clusters: + renaming = tomato.merge(self.children_, self.n_leaves_, self.__n_clusters) + self.labels_ = renaming[self.leaf_labels_] + else: + self.labels_ = self.leaf_labels_ + self.__n_clusters = self.n_leaves_ + + def fit_predict(self, X, y=None): + self.fit(X) + return labels_ + + # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k? + def plot_diagram(self): + import matplotlib.pyplot as plt + plt.plot(self.diagram_[:,0],self.diagram_[:,1],'ro') + l = self.diagram_[:,1].min() + r = max(self.diagram_[:,0].max(), self.max_density_per_cc_.max()) + plt.plot([l,r],[l,r]) + plt.plot(self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape,1.1*l-.1*r),'ro',color='green') + plt.show() + +# def predict(self, X): +# # X had better be the same as in fit() +# return labels_ + + # Use set_params instead? + @property + def n_clusters_(self): + return self.__n_clusters + @n_clusters_.setter + def n_clusters_(self, n_clusters): + if n_clusters == self.__n_clusters: + return + self.__n_clusters = n_clusters + self.__merge_threshold = None + if hasattr(self, 'leaf_labels_'): + renaming = tomato.merge(self.children_, self.n_leaves_, self.__n_clusters) + self.labels_ = renaming[self.leaf_labels_] + + @property + def merge_threshold_(self): + return self.__merge_threshold + @merge_threshold_.setter + def merge_threshold_(self, merge_threshold): + if merge_threshold == self.__merge_threshold: + return + if hasattr(self, 'leaf_labels_'): + self.n_clusters_ = numpy.count_nonzero(self.diagram_[1]-self.diagram_[0] > merge_threshold) + len(max_density_per_cc_) + else: + self.__n_clusters = None + self.__merge_threshold = merge_threshold + + +if __name__ == '__main__': + import sys + a=[(1,2),(1.1,1.9),(.9,1.8),(10,0),(10.1,.05),(10.2,-.1),(5.4,0)] + a=numpy.random.rand(500,2) + t=Tomato(input_type='points', metric='Euclidean', graph_type='knn', density_type='DTM', n_clusters=2, k=4, n_jobs=-1, eps=.05) + t.fit(a) + #print("neighbors\n",t.neighbors_) + #print() + #print("weights\n",t.weights_) + #print() + #print("diagram\n",t.diagram_) + #print() + print("max\n",t.max_density_per_cc_, file=sys.stderr) + #print() + print("leaf labels\n",t.leaf_labels_) + #print() + print("labels\n",t.labels_) + #print() + print("children\n",t.children_) + #print() + t.n_clusters_ = 2 + print("labels\n",t.labels_) + t.plot_diagram() -- cgit v1.2.3 From d38d7352d9dd360fa779f83442e84c5375cf62e2 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Wed, 26 Feb 2020 22:44:34 +0100 Subject: Doc syntax --- src/python/gudhi/clustering/tomato.py | 48 ++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index e4430dbd..467afe0e 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -33,25 +33,24 @@ class Tomato: """ Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later. - Parameters - ---------- - input_type(str): 'points', 'distance_matrix' or 'neighbors'. - metric(str or callable): FIXME ??? - graph_type(str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'. - density_type(str): 'manual', 'DTM', 'logDTM' or 'kde'. - kde_params(dict): if density_type is 'kde', additional parameters passed directly to sklearn.neighbors.KernelDensity. - k(int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. - k_DTM(int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. - r(float): size of a neighborhood if graph_type is 'radius' - eps(float): approximation factor when computing nearest neighbors without a GPU - gpu(bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million). - n_clusters(int): number of clusters requested. Defaults to ??? - merge_threshold(float): minimum prominence of a cluster so it doesn't get merged. - eliminate_threshold(float): minimum height of a cluster so it doesn't get eliminated - symmetrize_graph(bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. - p(float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. - p_DTM(float): order used to compute the distance to measure. Defaults to 2. - n_jobs(int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. + Args: + input_type (str): 'points', 'distance_matrix' or 'neighbors'. + metric (str or callable): FIXME ??? + graph_type (str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'. + density_type (str): 'manual', 'DTM', 'logDTM' or 'kde'. + kde_params (dict): if density_type is 'kde', additional parameters passed directly to sklearn.neighbors.KernelDensity. + k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. + k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. + r (float): size of a neighborhood if graph_type is 'radius' + eps (float): approximation factor when computing nearest neighbors without a GPU + gpu (bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million). + n_clusters (int): number of clusters requested. Defaults to ??? + merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. + eliminate_threshold (float): minimum height of a cluster so it doesn't get eliminated + symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. + p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. + p_DTM (float): order used to compute the distance to measure. Defaults to 2. + n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. """ self.input_type_ = input_type self.metric_ = metric @@ -66,10 +65,9 @@ class Tomato: def fit(self, X, y=None, weights=None): """ - Parameters - ---------- - X(?): points or distance_matrix or list of neighbors - weights(ndarray of shape (n_samples)): if density_type == 'manual', a density estimate at each point + Args: + X (?): points or distance_matrix or list of neighbors + weights (ndarray of shape (n_samples)): if density_type == 'manual', a density estimate at each point """ # TODO: First detect if this is a new call with the same data (only threshold changed?) # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters. @@ -257,11 +255,15 @@ class Tomato: self.__n_clusters = self.n_leaves_ def fit_predict(self, X, y=None): + """ + """ self.fit(X) return labels_ # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k? def plot_diagram(self): + """ + """ import matplotlib.pyplot as plt plt.plot(self.diagram_[:,0],self.diagram_[:,1],'ro') l = self.diagram_[:,1].min() -- cgit v1.2.3 From 53579deb2d551752b503b7b76ac04885ec354470 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Wed, 26 Feb 2020 23:19:34 +0100 Subject: Bug in gcc-8 --- src/python/gudhi/clustering/_tomato.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index 1c896174..746c4254 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -18,8 +18,11 @@ namespace py=pybind11; template::value>> int getint(int i){return i;} -template().template cast())> -int getint(T i){return i.template cast();} +// Gcc-8 has a bug that breaks this version, fixed in gcc-9 +// template().template cast())> +// int getint(T i){return i.template cast();} +template +auto getint(T i)->decltype(i.template cast()){return i.template cast();} // Raw clusters are clusters obtained through single-linkage, no merging. -- cgit v1.2.3 From e5e0f9a9e96389eadc9e9c4bc493b88abcb6f89a Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Wed, 26 Feb 2020 23:24:42 +0100 Subject: formatting --- src/python/gudhi/clustering/_tomato.cc | 247 +++++++++++++++++-------------- src/python/gudhi/clustering/tomato.py | 262 ++++++++++++++++++++------------- 2 files changed, 299 insertions(+), 210 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index 746c4254..87bd62e9 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -14,100 +14,117 @@ #include #include -namespace py=pybind11; +namespace py = pybind11; -template::value>> -int getint(int i){return i;} +template ::value>> +int getint(int i) { + return i; +} // Gcc-8 has a bug that breaks this version, fixed in gcc-9 // template().template cast())> // int getint(T i){return i.template cast();} -template -auto getint(T i)->decltype(i.template cast()){return i.template cast();} +template +auto getint(T i) -> decltype(i.template cast()) { + return i.template cast(); +} // Raw clusters are clusters obtained through single-linkage, no merging. typedef int Point_index; typedef int Cluster_index; -struct Merge { Cluster_index first, second; double persist; }; +struct Merge { + Cluster_index first, second; + double persist; +}; -template -auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& density, Order const& order, ROrder const& rorder){ +template +auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order, + ROrder const& rorder) { // point index --> index of raw cluster it belongs to - std::vector raw_cluster; raw_cluster.reserve(num_points); + std::vector raw_cluster; + raw_cluster.reserve(num_points); // cluster index --> index of top point in the cluster - Cluster_index n_raw_clusters = 0; // current number of raw clusters seen + Cluster_index n_raw_clusters = 0; // current number of raw clusters seen // std::vector merges; - struct Data { Cluster_index parent; int rank; Point_index max; }; // information on a cluster + struct Data { + Cluster_index parent; + int rank; + Point_index max; + }; // information on a cluster std::vector ds_base; // boost::vector_property_map does resize(size+1) for every new element, don't use it - auto ds_data=boost::make_function_property_map([&ds_base](std::size_t n)->Data&{return ds_base[n];}); - auto ds_parent=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_data); - auto ds_rank=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_data); - boost::disjoint_sets ds(ds_rank,ds_parent); // on the clusters, not directly the points - std::vector> persistence; // diagram (finite points) - boost::container::flat_map adj_clusters; // first: the merged cluster, second: the raw cluster - // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only insert in the vector if merged is absent from the set + auto ds_data = + boost::make_function_property_map([&ds_base](std::size_t n) -> Data& { return ds_base[n]; }); + auto ds_parent = + boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_data); + auto ds_rank = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_data); + boost::disjoint_sets ds( + ds_rank, ds_parent); // on the clusters, not directly the points + std::vector> persistence; // diagram (finite points) + boost::container::flat_map + adj_clusters; // first: the merged cluster, second: the raw cluster + // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only + // insert in the vector if merged is absent from the set - for(Point_index i = 0; i < num_points; ++i){ + for (Point_index i = 0; i < num_points; ++i) { // auto&& ngb = neighbors[order[i]]; // TODO: have a specialization where we don't need python types and py::cast // TODO: move py::cast and getint into Neighbors - py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work + py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work adj_clusters.clear(); - Point_index j = i; // highest neighbor - //for(Point_index k : ngb) - for(auto k_py : ngb) - { + Point_index j = i; // highest neighbor + // for(Point_index k : ngb) + for (auto k_py : ngb) { Point_index k = rorder[getint(k_py)]; - if(k >= i || k < 0) // ??? - continue; - if(k < j) - j = k; - Cluster_index rk=raw_cluster[k]; - adj_clusters.emplace(ds.find_set(rk),rk); + if (k >= i || k < 0) // ??? + continue; + if (k < j) j = k; + Cluster_index rk = raw_cluster[k]; + adj_clusters.emplace(ds.find_set(rk), rk); // does not insert if ck=ds.find_set(rk) already seen // which rk we keep from those with the same ck is arbitrary } - assert((Point_index)raw_cluster.size()==i); - if(i==j){ // local maximum -> new cluster + assert((Point_index)raw_cluster.size() == i); + if (i == j) { // local maximum -> new cluster Cluster_index c = n_raw_clusters++; - ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access + ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access ds.make_set(c); - ds_base[c].max=i; // max + ds_base[c].max = i; // max raw_cluster.push_back(c); continue; - }else{ // add i to the cluster of j - assert(j= d_i); // Always merge (the non-hierarchical algorithm would only conditionally merge here persistence.push_back({d_young, d_i}); assert(ds.find_set(rj) != ds.find_set(rk)); - ds.link(cj,ck); + ds.link(cj, ck); cj = ds.find_set(cj); - ds_base[cj].max=old; // just one parent, no need for find_set + ds_base[cj].max = old; // just one parent, no need for find_set // record the raw clusters, we don't know what will have already been merged. - merges.push_back({rj, rk, d_young-d_i}); + merges.push_back({rj, rk, d_young - d_i}); } } { @@ -118,121 +135,137 @@ auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& de } // Maximum for each connected component std::vector max_cc; - //for(Point_index i = 0; i < num_points; ++i){ + // for(Point_index i = 0; i < num_points; ++i){ // if(ds_base[ds_base[raw_cluster[i]].parent].max == i) // max_cc.push_back(density[order[i]]); //} - for(Cluster_index i = 0; i < n_raw_clusters; ++i){ - if(ds_base[i].parent == i) - max_cc.push_back(density[order[ds_base[i].max]]); + for (Cluster_index i = 0; i < n_raw_clusters; ++i) { + if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]); } - assert((Cluster_index)(merges.size()+max_cc.size())==n_raw_clusters); + assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters); // TODO: create a "noise" cluster, merging all those not prominent enough? - //int nb_clusters=0; - //for(int i=0;i<(int)ds_base.size();++i){ + // int nb_clusters=0; + // for(int i=0;i<(int)ds_base.size();++i){ // if(ds_parent[i]!=i) continue; // ds_data[i].rank=nb_clusters++; //} // Replay the merges, in increasing order of prominence, to build the hierarchy - std::sort(merges.begin(), merges.end(), [](Merge const&a, Merge const&b){return a.persist < b.persist;}); - std::vector> children; children.reserve(merges.size()); + std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; }); + std::vector> children; + children.reserve(merges.size()); { - struct Dat { Cluster_index parent; int rank; Cluster_index name; }; - std::vector ds_bas(2*n_raw_clusters-1); + struct Dat { + Cluster_index parent; + int rank; + Cluster_index name; + }; + std::vector ds_bas(2 * n_raw_clusters - 1); Cluster_index i; - auto ds_dat=boost::make_function_property_map([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];}); - auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat); - auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat); - boost::disjoint_sets ds(ds_ran,ds_par); - for(i=0;i([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; }); + auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat); + auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat); + boost::disjoint_sets ds(ds_ran, ds_par); + for (i = 0; i < n_raw_clusters; ++i) { + ds.make_set(i); + ds_bas[i].name = i; + } + for (Merge const& m : merges) { Cluster_index j = ds.find_set(m.first); Cluster_index k = ds.find_set(m.second); - assert(j!=k); - children.push_back({ds_bas[j].name,ds_bas[k].name}); + assert(j != k); + children.push_back({ds_bas[j].name, ds_bas[k].name}); ds.make_set(i); - ds.link(i,j); - ds.link(i,k); - ds_bas[ds.find_set(i)].name=i; + ds.link(i, j); + ds.link(i, k); + ds_bas[ds.find_set(i)].name = i; ++i; } } std::vector raw_cluster_ordered(num_points); - for(int i=0; i children, Cluster_index n_leaves, Cluster_index n_final){ +auto merge(py::array_t children, Cluster_index n_leaves, Cluster_index n_final) { // Should this really be an error? - if(n_final > n_leaves) + if (n_final > n_leaves) throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters"); py::buffer_info cbuf = children.request(); - if((cbuf.ndim!=2 || cbuf.shape[1]!=2) && (cbuf.ndim!=1 || cbuf.shape[0]!=0)) + if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0)) throw std::runtime_error("internal error: children have to be (n,2) or empty"); - const int n_merges=cbuf.shape[0]; - Cluster_index*d=(Cluster_index*)cbuf.ptr; + const int n_merges = cbuf.shape[0]; + Cluster_index* d = (Cluster_index*)cbuf.ptr; // Should this really be an error? - //std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n'; - if(n_merges + n_final < n_leaves) - throw std::runtime_error(std::string("The number of clusters required ")+std::to_string(n_final)+" is smaller than the number of connected components "+std::to_string(n_leaves-n_merges)); + // std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n'; + if (n_merges + n_final < n_leaves) + throw std::runtime_error(std::string("The number of clusters required ") + std::to_string(n_final) + + " is smaller than the number of connected components " + + std::to_string(n_leaves - n_merges)); - struct Dat { Cluster_index parent; int rank; int name; }; - std::vector ds_bas(2*n_leaves-1); - auto ds_dat=boost::make_function_property_map([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];}); - auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat); - auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat); - boost::disjoint_sets ds(ds_ran,ds_par); + struct Dat { + Cluster_index parent; + int rank; + int name; + }; + std::vector ds_bas(2 * n_leaves - 1); + auto ds_dat = boost::make_function_property_map([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; }); + auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat); + auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat); + boost::disjoint_sets ds(ds_ran, ds_par); Cluster_index i; - for(i=0;i ret; ret.reserve(n_leaves); - for(Cluster_index j=0;j ret; + ret.reserve(n_leaves); + for (Cluster_index j = 0; j < n_leaves; ++j) { Cluster_index k = ds.find_set(j); - if(ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++; + if (ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++; ret.push_back(ds_bas[k].name); } return py::array(ret.size(), ret.data()); } // Do a special version when ngb is a numpy array, where we can cast to int[k][n] ? -auto plouf(py::handle ngb, py::array_t density){ +auto plouf(py::handle ngb, py::array_t density) { // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to [] // (I _think_ handle is ok and we don't need object here) py::buffer_info wbuf = density.request(); - if(wbuf.ndim!=1) - throw std::runtime_error("density must be 1D"); - const int n=wbuf.shape[0]; - double*d=(double*)wbuf.ptr; + if (wbuf.ndim != 1) throw std::runtime_error("density must be 1D"); + const int n = wbuf.shape[0]; + double* d = (double*)wbuf.ptr; // Vector { 0, 1, ..., n-1 } std::vector order(boost::counting_iterator(0), boost::counting_iterator(n)); // Permutation of the indices to get points in decreasing order of density - std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j){ return d[i] > d[j]; }); + std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j) { return d[i] > d[j]; }); // Inverse permutation std::vector rorder(n); - for(Point_index i : boost::irange(0, n)) rorder[order[i]] = i; + for (Point_index i : boost::irange(0, n)) rorder[order[i]] = i; // Used as: // order[i] is the index of the point with i-th largest density // rorder[i] is the rank of the i-th point in order of decreasing density - // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the output) + // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the + // output) return tomato(n, ngb, d, order, rorder); } #if 0 @@ -263,7 +296,7 @@ auto plaf(py::array_t ngb, py::a PYBIND11_MODULE(_tomato, m) { m.doc() = "Internals of tomato clustering"; m.def("doit", &plouf, "does the clustering"); - //m.def("doit2", &plaf, "does the clustering faster"); + // m.def("doit2", &plaf, "does the clustering faster"); m.def("merge", &merge, "merge clusters"); } diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 467afe0e..19d6600f 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -29,7 +29,18 @@ class Tomato: params_: dict Parameters like input_type, metric, etc """ - def __init__(self, input_type='points', metric=None, graph_type=None, density_type='manual', n_clusters=None, merge_threshold=None, eliminate_threshold=None, **params): + + def __init__( + self, + input_type="points", + metric=None, + graph_type=None, + density_type="manual", + n_clusters=None, + merge_threshold=None, + eliminate_threshold=None, + **params + ): """ Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later. @@ -72,179 +83,207 @@ class Tomato: # TODO: First detect if this is a new call with the same data (only threshold changed?) # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters. if weights: - density_type = 'manual' + density_type = "manual" else: density_type = self.density_type_ - assert density_type != 'manual' - if density_type == 'manual': + assert density_type != "manual" + if density_type == "manual": raise ValueError("If density_type is 'manual', you must provide weights to fit()") - if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'radius': + if self.input_type_ == "distance_matrix" and self.graph_type_ == "radius": X = numpy.array(X) - r = self.params_['r'] + r = self.params_["r"] self.neighbors_ = [numpy.nonzero(l <= r) for l in X] - if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'knn': - k = self.params_['k'] - self.neighbors_ = numpy.argpartition(X, k-1)[:,0:k] + if self.input_type_ == "distance_matrix" and self.graph_type_ == "knn": + k = self.params_["k"] + self.neighbors_ = numpy.argpartition(X, k - 1)[:, 0:k] - if self.input_type_ == 'neighbors': + if self.input_type_ == "neighbors": self.neighbors_ = X - assert density_type == 'manual' + assert density_type == "manual" - if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ in {'DTM', 'logDTM'}: + if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X - q = self.params_.get('p_DTM', 2) - p = self.params_.get('p', 2) - k = self.params_.get('k', 10) - k_DTM = self.params_.get('k_DTM', k) + q = self.params_.get("p_DTM", 2) + p = self.params_.get("p", 2) + k = self.params_.get("k", 10) + k_DTM = self.params_.get("k_DTM", k) kmax = max(k, k_DTM) - if self.params_.get('gpu'): + if self.params_.get("gpu"): import torch from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? XX = torch.tensor(self.points_, dtype=torch.float32) if p == numpy.inf: # Requires a version of pykeops strictly more recent than 1.3 - dd, nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin_argKmin(kmax, dim=1) - elif p == 2: # Any even integer? - dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin_argKmin(kmax, dim=1) + dd, nn = ( + (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) + .abs() + .max(-1) + .Kmin_argKmin(kmax, dim=1) + ) + elif p == 2: # Any even integer? + dd, nn = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p) + .sum(-1) + .Kmin_argKmin(kmax, dim=1) + ) else: - dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin_argKmin(kmax, dim=1) + dd, nn = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p) + .sum(-1) + .Kmin_argKmin(kmax, dim=1) + ) if k < kmax: nn = nn[:, 0:k] if k_DTM < kmax: dd = dd[:, 0:k_DTM] - assert q != numpy.inf # for now + assert q != numpy.inf # for now if p != numpy.inf: qp = float(q) / p else: qp = q if qp != 1: - dd = dd**qp + dd = dd ** qp weights = dd.sum(-1) # **1/q is a waste of time, whether we take another **-.25 or a log # Back to the CPU. Not sure this is necessary, or the right way to do it. weights = numpy.array(weights) self.neighbors_ = numpy.array(nn) - else: # CPU + else: # CPU from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) - qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} dd, self.neighbors_ = kdtree.query(self.points_, k=kmax, p=p, **qargs) if k < kmax: self.neighbors_ = self.neighbors_[:, 0:k] if k_DTM < kmax: dd = dd[:, 0:k_DTM] - #weights = numpy.linalg.norm(dd, axis=1, ord=q) - weights = (dd**q).sum(-1) + # weights = numpy.linalg.norm(dd, axis=1, ord=q) + weights = (dd ** q).sum(-1) # TODO: check the formula in Fred's paper - if self.density_type_ == 'DTM': - weights = weights ** (-.25 / q) + if self.density_type_ == "DTM": + weights = weights ** (-0.25 / q) else: weights = -numpy.log(weights) - if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ not in {'DTM', 'logDTM'}: + if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}: self.points_ = X - p = self.params_.get('p', 2) - k = self.params_.get('k', 10) - if self.params_.get('gpu'): + p = self.params_.get("p", 2) + k = self.params_.get("k", 10) + if self.params_.get("gpu"): import torch from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? XX = torch.tensor(self.points_, dtype=torch.float32) if p == numpy.inf: # Requires a version of pykeops strictly more recent than 1.3 - nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).argKmin(k, dim=1) - elif p == 2: # Any even integer? - nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).argKmin(k, dim=1) + nn = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).argKmin(k, dim=1) + elif p == 2: # Any even integer? + nn = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).argKmin(k, dim=1) else: - nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).argKmin(k, dim=1) + nn = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p).sum(-1).argKmin(k, dim=1) + ) # Back to the CPU. Not sure this is necessary, or the right way to do it. self.neighbors_ = numpy.array(nn) - else: # CPU + else: # CPU from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) # FIXME 'p' - qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs) - if self.input_type_ == 'points' and self.graph_type_ != 'knn' and self.density_type_ in {'DTM', 'logDTM'}: + if self.input_type_ == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X - q = self.params_.get('p_DTM', 2) - p = self.params_.get('p', 2) - k = self.params_.get('k', 10) - k_DTM = self.params_.get('k_DTM', k) - if self.params_.get('gpu'): + q = self.params_.get("p_DTM", 2) + p = self.params_.get("p", 2) + k = self.params_.get("k", 10) + k_DTM = self.params_.get("k_DTM", k) + if self.params_.get("gpu"): import torch from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? XX = torch.tensor(self.points_, dtype=torch.float32) if p == numpy.inf: - assert False # Not supported??? - dd = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin(k_DTM, dim=1) - elif p == 2: # Any even integer? - dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin(k_DTM, dim=1) + assert False # Not supported??? + dd = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).Kmin(k_DTM, dim=1) + elif p == 2: # Any even integer? + dd = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).Kmin(k_DTM, dim=1) else: - dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin(k_DTM, dim=1) - assert q != numpy.inf # for now + dd = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p) + .sum(-1) + .Kmin(k_DTM, dim=1) + ) + assert q != numpy.inf # for now if p != numpy.inf: qp = float(q) / p else: qp = q if qp != 1: - dd = dd**qp + dd = dd ** qp weights = dd.sum(-1) # **1/q is a waste of time, whether we take another **-.25 or a log # Back to the CPU. Not sure this is necessary, or the right way to do it. weights = numpy.array(weights) - else: # CPU + else: # CPU from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) - qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} dd, _ = kdtree.query(self.points_, k=k_DTM, p=p, **qargs) - #weights = numpy.linalg.norm(dd, axis=1, ord=q) - weights = (dd**q).sum(-1) + # weights = numpy.linalg.norm(dd, axis=1, ord=q) + weights = (dd ** q).sum(-1) # TODO: check the formula in Fred's paper - if self.density_type_ == 'DTM': - weights = weights ** (-.25 / q) + if self.density_type_ == "DTM": + weights = weights ** (-0.25 / q) else: weights = -numpy.log(weights) - if self.input_type_ == 'distance_matrix' and self.density_type_ in {'DTM', 'logDTM'}: - q = self.params_.get('p_DTM', 2) + if self.input_type_ == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}: + q = self.params_.get("p_DTM", 2) X = numpy.array(X) - k = self.params_.get('k_DTM') + k = self.params_.get("k_DTM") if not k: - k = self.params_['k'] - q = self.params_.get('p_DTM', 2) - weights = (numpy.partition(X)**q, k-1).sum(-1) + k = self.params_["k"] + q = self.params_.get("p_DTM", 2) + weights = (numpy.partition(X) ** q, k - 1).sum(-1) # TODO: check the formula in Fred's paper - if self.density_type_ == 'DTM': - weights = weights ** (-.25 / q) + if self.density_type_ == "DTM": + weights = weights ** (-0.25 / q) else: weights = -numpy.log(weights) - if self.density_type_ == 'kde': + if self.density_type_ == "kde": # FIXME: replace most assert with raise ValueError("blabla") - assert self.input_type_ == 'points' - kde_params = self.params_.get('kde_params', dict()) + assert self.input_type_ == "points" + kde_params = self.params_.get("kde_params", dict()) from sklearn.neighbors import KernelDensity + weights = KernelDensity(**kde_params).fit(X).score_samples(X) - if self.params_.get('symmetrize_graph'): + if self.params_.get("symmetrize_graph"): self.neighbors_ = [set(line) for line in self.neighbors_] - for i,line in enumerate(self.neighbors_): + for i, line in enumerate(self.neighbors_): line.discard(i) for j in line: self.neighbors_[j].add(i) - self.weights_=weights # TODO remove - self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit(list(self.neighbors_), weights) + self.weights_ = weights # TODO remove + self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit( + list(self.neighbors_), weights + ) self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_) if self.__n_clusters: @@ -265,65 +304,82 @@ class Tomato: """ """ import matplotlib.pyplot as plt - plt.plot(self.diagram_[:,0],self.diagram_[:,1],'ro') - l = self.diagram_[:,1].min() - r = max(self.diagram_[:,0].max(), self.max_density_per_cc_.max()) - plt.plot([l,r],[l,r]) - plt.plot(self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape,1.1*l-.1*r),'ro',color='green') + + plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro") + l = self.diagram_[:, 1].min() + r = max(self.diagram_[:, 0].max(), self.max_density_per_cc_.max()) + plt.plot([l, r], [l, r]) + plt.plot( + self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green" + ) plt.show() -# def predict(self, X): -# # X had better be the same as in fit() -# return labels_ + # def predict(self, X): + # # X had better be the same as in fit() + # return labels_ # Use set_params instead? @property def n_clusters_(self): return self.__n_clusters + @n_clusters_.setter def n_clusters_(self, n_clusters): if n_clusters == self.__n_clusters: return self.__n_clusters = n_clusters self.__merge_threshold = None - if hasattr(self, 'leaf_labels_'): + if hasattr(self, "leaf_labels_"): renaming = tomato.merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] @property def merge_threshold_(self): return self.__merge_threshold + @merge_threshold_.setter def merge_threshold_(self, merge_threshold): if merge_threshold == self.__merge_threshold: return - if hasattr(self, 'leaf_labels_'): - self.n_clusters_ = numpy.count_nonzero(self.diagram_[1]-self.diagram_[0] > merge_threshold) + len(max_density_per_cc_) + if hasattr(self, "leaf_labels_"): + self.n_clusters_ = numpy.count_nonzero(self.diagram_[1] - self.diagram_[0] > merge_threshold) + len( + max_density_per_cc_ + ) else: self.__n_clusters = None self.__merge_threshold = merge_threshold -if __name__ == '__main__': +if __name__ == "__main__": import sys - a=[(1,2),(1.1,1.9),(.9,1.8),(10,0),(10.1,.05),(10.2,-.1),(5.4,0)] - a=numpy.random.rand(500,2) - t=Tomato(input_type='points', metric='Euclidean', graph_type='knn', density_type='DTM', n_clusters=2, k=4, n_jobs=-1, eps=.05) + + a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)] + a = numpy.random.rand(500, 2) + t = Tomato( + input_type="points", + metric="Euclidean", + graph_type="knn", + density_type="DTM", + n_clusters=2, + k=4, + n_jobs=-1, + eps=0.05, + ) t.fit(a) - #print("neighbors\n",t.neighbors_) - #print() - #print("weights\n",t.weights_) - #print() - #print("diagram\n",t.diagram_) - #print() - print("max\n",t.max_density_per_cc_, file=sys.stderr) - #print() - print("leaf labels\n",t.leaf_labels_) - #print() - print("labels\n",t.labels_) - #print() - print("children\n",t.children_) - #print() + # print("neighbors\n",t.neighbors_) + # print() + # print("weights\n",t.weights_) + # print() + # print("diagram\n",t.diagram_) + # print() + print("max\n", t.max_density_per_cc_, file=sys.stderr) + # print() + print("leaf labels\n", t.leaf_labels_) + # print() + print("labels\n", t.labels_) + # print() + print("children\n", t.children_) + # print() t.n_clusters_ = 2 - print("labels\n",t.labels_) + print("labels\n", t.labels_) t.plot_diagram() -- cgit v1.2.3 From f3b7d742580b3c93f8d1d70952a7809f6f52ca80 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Fri, 28 Feb 2020 23:23:01 +0100 Subject: Use official formula for DTM --- src/python/gudhi/clustering/tomato.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 19d6600f..5257e487 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -77,8 +77,8 @@ class Tomato: def fit(self, X, y=None, weights=None): """ Args: - X (?): points or distance_matrix or list of neighbors - weights (ndarray of shape (n_samples)): if density_type == 'manual', a density estimate at each point + X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance_matrix (full, not just a triangle), or list of neighbors for each point (points are represented by their index, starting from 0) + weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point """ # TODO: First detect if this is a new call with the same data (only threshold changed?) # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters. @@ -148,7 +148,6 @@ class Tomato: if qp != 1: dd = dd ** qp weights = dd.sum(-1) - # **1/q is a waste of time, whether we take another **-.25 or a log # Back to the CPU. Not sure this is necessary, or the right way to do it. weights = numpy.array(weights) @@ -166,10 +165,13 @@ class Tomato: # weights = numpy.linalg.norm(dd, axis=1, ord=q) weights = (dd ** q).sum(-1) - # TODO: check the formula in Fred's paper if self.density_type_ == "DTM": - weights = weights ** (-0.25 / q) + # We ignore constant factors, which don't matter for + # clustering, although they do change thresholds + dim = len(self.points_[0]) + weights = weights ** (-dim / q) else: + # We ignore exponents, which become constant factors with log weights = -numpy.log(weights) if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}: @@ -245,9 +247,9 @@ class Tomato: # weights = numpy.linalg.norm(dd, axis=1, ord=q) weights = (dd ** q).sum(-1) - # TODO: check the formula in Fred's paper if self.density_type_ == "DTM": - weights = weights ** (-0.25 / q) + dim = len(self.points_[0]) + weights = weights ** (-dim / q) else: weights = -numpy.log(weights) @@ -258,10 +260,10 @@ class Tomato: if not k: k = self.params_["k"] q = self.params_.get("p_DTM", 2) - weights = (numpy.partition(X) ** q, k - 1).sum(-1) - # TODO: check the formula in Fred's paper + weights = (numpy.partition(X, k - 1)[:,0:k] ** q).sum(-1) if self.density_type_ == "DTM": - weights = weights ** (-0.25 / q) + dim = len(self.points_[0]) + weights = weights ** (-dim / q) else: weights = -numpy.log(weights) -- cgit v1.2.3 From 23f0949dd204a4f4b0fec5527b64b5d5eabbebf8 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Fri, 28 Feb 2020 23:45:24 +0100 Subject: metric==Callable --- src/python/gudhi/clustering/tomato.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 5257e487..467dd17e 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -34,11 +34,12 @@ class Tomato: self, input_type="points", metric=None, - graph_type=None, - density_type="manual", + graph_type="knn", + density_type="DTM", n_clusters=None, merge_threshold=None, - eliminate_threshold=None, +# eliminate_threshold=None, +# eliminate_threshold (float): minimum max weight of a cluster so it doesn't get eliminated **params ): """ @@ -46,7 +47,7 @@ class Tomato: Args: input_type (str): 'points', 'distance_matrix' or 'neighbors'. - metric (str or callable): FIXME ??? + metric (None|Callable): If None, use Minkowski of parameter p. graph_type (str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'. density_type (str): 'manual', 'DTM', 'logDTM' or 'kde'. kde_params (dict): if density_type is 'kde', additional parameters passed directly to sklearn.neighbors.KernelDensity. @@ -57,7 +58,6 @@ class Tomato: gpu (bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million). n_clusters (int): number of clusters requested. Defaults to ??? merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. - eliminate_threshold (float): minimum height of a cluster so it doesn't get eliminated symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. p_DTM (float): order used to compute the distance to measure. Defaults to 2. @@ -90,20 +90,26 @@ class Tomato: if density_type == "manual": raise ValueError("If density_type is 'manual', you must provide weights to fit()") - if self.input_type_ == "distance_matrix" and self.graph_type_ == "radius": + input_type = self.input_type_ + if input_type == "points" and self.metric_: + from sklearn.metrics import pairwise_distances + X = pairwise_distances(X,metric=self.metric_,n_jobs=self.params_.get("n_jobs")) + input_type="distance_matrix" + + if input_type == "distance_matrix" and self.graph_type_ == "radius": X = numpy.array(X) r = self.params_["r"] self.neighbors_ = [numpy.nonzero(l <= r) for l in X] - if self.input_type_ == "distance_matrix" and self.graph_type_ == "knn": + if input_type == "distance_matrix" and self.graph_type_ == "knn": k = self.params_["k"] self.neighbors_ = numpy.argpartition(X, k - 1)[:, 0:k] - if self.input_type_ == "neighbors": + if input_type == "neighbors": self.neighbors_ = X assert density_type == "manual" - if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: + if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X q = self.params_.get("p_DTM", 2) p = self.params_.get("p", 2) @@ -174,7 +180,7 @@ class Tomato: # We ignore exponents, which become constant factors with log weights = -numpy.log(weights) - if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}: + if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}: self.points_ = X p = self.params_.get("p", 2) k = self.params_.get("k", 10) @@ -203,7 +209,7 @@ class Tomato: qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs) - if self.input_type_ == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: + if input_type == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X q = self.params_.get("p_DTM", 2) p = self.params_.get("p", 2) @@ -253,7 +259,7 @@ class Tomato: else: weights = -numpy.log(weights) - if self.input_type_ == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}: + if input_type == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}: q = self.params_.get("p_DTM", 2) X = numpy.array(X) k = self.params_.get("k_DTM") @@ -269,7 +275,7 @@ class Tomato: if self.density_type_ == "kde": # FIXME: replace most assert with raise ValueError("blabla") - assert self.input_type_ == "points" + assert input_type == "points" kde_params = self.params_.get("kde_params", dict()) from sklearn.neighbors import KernelDensity -- cgit v1.2.3 From 7063d82f2fc25ba2819adf7c2dbf430d4f012626 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 29 Feb 2020 10:27:01 +0100 Subject: Reformat --- src/python/gudhi/clustering/tomato.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 467dd17e..dc004bde 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -38,8 +38,8 @@ class Tomato: density_type="DTM", n_clusters=None, merge_threshold=None, -# eliminate_threshold=None, -# eliminate_threshold (float): minimum max weight of a cluster so it doesn't get eliminated + # eliminate_threshold=None, + # eliminate_threshold (float): minimum max weight of a cluster so it doesn't get eliminated **params ): """ @@ -93,8 +93,9 @@ class Tomato: input_type = self.input_type_ if input_type == "points" and self.metric_: from sklearn.metrics import pairwise_distances - X = pairwise_distances(X,metric=self.metric_,n_jobs=self.params_.get("n_jobs")) - input_type="distance_matrix" + + X = pairwise_distances(X, metric=self.metric_, n_jobs=self.params_.get("n_jobs")) + input_type = "distance_matrix" if input_type == "distance_matrix" and self.graph_type_ == "radius": X = numpy.array(X) @@ -266,7 +267,7 @@ class Tomato: if not k: k = self.params_["k"] q = self.params_.get("p_DTM", 2) - weights = (numpy.partition(X, k - 1)[:,0:k] ** q).sum(-1) + weights = (numpy.partition(X, k - 1)[:, 0:k] ** q).sum(-1) if self.density_type_ == "DTM": dim = len(self.points_[0]) weights = weights ** (-dim / q) -- cgit v1.2.3 From 1b1c621370ff887578d7aad8eb7e28ee9ec7812a Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 29 Feb 2020 11:15:59 +0100 Subject: default parameter for DTM = dim. doc. --- src/python/gudhi/clustering/tomato.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index dc004bde..9cf4b231 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -2,6 +2,8 @@ import numpy from ._tomato import * # The fit/predict interface is not so well suited... +# TODO: option for a faster, weaker (probabilistic) knn + class Tomato: """ Clustering @@ -15,21 +17,23 @@ class Tomato: merge_threshold_: float minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts labels_. n_leaves_: int - number of leaves in the hierarchical tree + number of leaves (unstable clusters) in the hierarchical tree leaf_labels_: ndarray of shape (n_samples) cluster labels for each point, at the very bottom of the hierarchy labels_: ndarray of shape (n_samples) - cluster labels for each point + cluster labels for each point, after merging diagram_: ndarray of shape (n_leaves_,2) persistence diagram (only the finite points) - max_density_per_cc_: ndarray os shape (n_connected_components) - maximum of the density function on each connected component children_: ndarray of shape (n_leaves_-1,2) The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i params_: dict Parameters like input_type, metric, etc """ + # Not documented for now, because I am not sure how useful it is. + # max_density_per_cc_: ndarray of shape (n_connected_components) + # maximum of the density function on each connected component + def __init__( self, input_type="points", @@ -54,15 +58,17 @@ class Tomato: k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. r (float): size of a neighborhood if graph_type is 'radius' - eps (float): approximation factor when computing nearest neighbors without a GPU + eps (float): approximation factor when computing nearest neighbors (currently ignored with a GPU) gpu (bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million). n_clusters (int): number of clusters requested. Defaults to ??? merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. - p_DTM (float): order used to compute the distance to measure. Defaults to 2. + p_DTM (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. """ + # Should metric='precomputed' mean input_type='distance_matrix'? + # Should we be able to pass metric='minkowski' (what None does currently)? self.input_type_ = input_type self.metric_ = metric self.graph_type_ = graph_type @@ -77,7 +83,7 @@ class Tomato: def fit(self, X, y=None, weights=None): """ Args: - X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance_matrix (full, not just a triangle), or list of neighbors for each point (points are represented by their index, starting from 0) + X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance_matrix (full, not just a triangle), or list of neighbors for each point (points are represented by their index, starting from 0), according to input_type. weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point """ # TODO: First detect if this is a new call with the same data (only threshold changed?) @@ -112,7 +118,7 @@ class Tomato: if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X - q = self.params_.get("p_DTM", 2) + q = self.params_.get("p_DTM", len(X[0])) p = self.params_.get("p", 2) k = self.params_.get("k", 10) k_DTM = self.params_.get("k_DTM", k) @@ -212,7 +218,7 @@ class Tomato: if input_type == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X - q = self.params_.get("p_DTM", 2) + q = self.params_.get("p_DTM", len(X[0])) p = self.params_.get("p", 2) k = self.params_.get("k", 10) k_DTM = self.params_.get("k_DTM", k) @@ -266,7 +272,6 @@ class Tomato: k = self.params_.get("k_DTM") if not k: k = self.params_["k"] - q = self.params_.get("p_DTM", 2) weights = (numpy.partition(X, k - 1)[:, 0:k] ** q).sum(-1) if self.density_type_ == "DTM": dim = len(self.points_[0]) -- cgit v1.2.3 From fa8c487b24b58797398ce3a93a95095b43de23f3 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 2 Mar 2020 18:01:49 +0100 Subject: bugs from incomplete renaming --- src/python/gudhi/clustering/tomato.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 9cf4b231..0d754a62 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -76,7 +76,7 @@ class Tomato: self.params_ = params self.__n_clusters = n_clusters self.__merge_threshold = merge_threshold - self.eliminate_threshold_ = eliminate_threshold + #self.eliminate_threshold_ = eliminate_threshold if n_clusters and merge_threshold: raise ValueError("Cannot specify both a merge threshold and a number of clusters") @@ -295,13 +295,13 @@ class Tomato: self.neighbors_[j].add(i) self.weights_ = weights # TODO remove - self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit( + self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = doit( list(self.neighbors_), weights ) self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_) if self.__n_clusters: - renaming = tomato.merge(self.children_, self.n_leaves_, self.__n_clusters) + renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] else: self.labels_ = self.leaf_labels_ @@ -311,7 +311,7 @@ class Tomato: """ """ self.fit(X) - return labels_ + return self.labels_ # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k? def plot_diagram(self): @@ -344,7 +344,7 @@ class Tomato: self.__n_clusters = n_clusters self.__merge_threshold = None if hasattr(self, "leaf_labels_"): - renaming = tomato.merge(self.children_, self.n_leaves_, self.__n_clusters) + renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] @property -- cgit v1.2.3 From 6c8e59dbece96aeacff53c36809c06c087835905 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 5 Mar 2020 07:00:56 +0100 Subject: Fix merge_threshold --- src/python/gudhi/clustering/tomato.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 0d754a62..ab317beb 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -4,6 +4,7 @@ from ._tomato import * # The fit/predict interface is not so well suited... # TODO: option for a faster, weaker (probabilistic) knn + class Tomato: """ Clustering @@ -76,7 +77,7 @@ class Tomato: self.params_ = params self.__n_clusters = n_clusters self.__merge_threshold = merge_threshold - #self.eliminate_threshold_ = eliminate_threshold + # self.eliminate_threshold_ = eliminate_threshold if n_clusters and merge_threshold: raise ValueError("Cannot specify both a merge threshold and a number of clusters") @@ -300,6 +301,11 @@ class Tomato: ) self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_) + if self.__merge_threshold: + assert not self.__n_clusters + self.__n_clusters = numpy.count_nonzero( + self.diagram_[:, 1] - self.diagram_[:, 0] > self.__merge_threshold + ) + len(self.max_density_per_cc_) if self.__n_clusters: renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] @@ -330,7 +336,7 @@ class Tomato: # def predict(self, X): # # X had better be the same as in fit() - # return labels_ + # return self.labels_ # Use set_params instead? @property @@ -356,8 +362,8 @@ class Tomato: if merge_threshold == self.__merge_threshold: return if hasattr(self, "leaf_labels_"): - self.n_clusters_ = numpy.count_nonzero(self.diagram_[1] - self.diagram_[0] > merge_threshold) + len( - max_density_per_cc_ + self.n_clusters_ = numpy.count_nonzero(self.diagram_[:, 1] - self.diagram_[:, 0] > merge_threshold) + len( + self.max_density_per_cc_ ) else: self.__n_clusters = None -- cgit v1.2.3 From e8851febb643821cb60023a6d4e8759236115f79 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 5 Mar 2020 07:01:42 +0100 Subject: Use logDTM by default (to be revisited) --- src/python/gudhi/clustering/tomato.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index ab317beb..19e6f6e9 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -40,7 +40,7 @@ class Tomato: input_type="points", metric=None, graph_type="knn", - density_type="DTM", + density_type="logDTM", n_clusters=None, merge_threshold=None, # eliminate_threshold=None, -- cgit v1.2.3 From cb265d9262da5beb233ffdec3694d2dd15a9f2fd Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 9 Mar 2020 13:39:27 +0100 Subject: Allow passing weights to fit_predict --- src/python/gudhi/clustering/tomato.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 19e6f6e9..fd8f0e98 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -313,10 +313,11 @@ class Tomato: self.labels_ = self.leaf_labels_ self.__n_clusters = self.n_leaves_ - def fit_predict(self, X, y=None): + def fit_predict(self, X, y=None, weights=None): """ + Equivalent to fit(), and returns the `labels_`. """ - self.fit(X) + self.fit(X, y, weights) return self.labels_ # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k? -- cgit v1.2.3 From 1bb20075bea223734dfbd0750e3d787f00388f29 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 9 Mar 2020 15:11:14 +0100 Subject: Test explicitly if weights is None, instead of relying on conversion to Bool. --- src/python/gudhi/clustering/tomato.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index fd8f0e98..e3d814d1 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -89,7 +89,7 @@ class Tomato: """ # TODO: First detect if this is a new call with the same data (only threshold changed?) # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters. - if weights: + if weights is not None: density_type = "manual" else: density_type = self.density_type_ -- cgit v1.2.3 From 6aa18ec3b382f045ba7b97c7cbd6462e1de892ef Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 9 Mar 2020 15:19:48 +0100 Subject: Make fit return self --- src/python/gudhi/clustering/tomato.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index e3d814d1..000fdf3d 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -312,13 +312,13 @@ class Tomato: else: self.labels_ = self.leaf_labels_ self.__n_clusters = self.n_leaves_ + return self def fit_predict(self, X, y=None, weights=None): """ Equivalent to fit(), and returns the `labels_`. """ - self.fit(X, y, weights) - return self.labels_ + return self.fit(X, y, weights).labels_ # TODO: add argument k or threshold? Have a version where you can click and it shows the line and the corresponding k? def plot_diagram(self): -- cgit v1.2.3 From 5a87b00145499bb5afeadaef7dec476ae5f826d0 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sun, 15 Mar 2020 10:56:44 +0100 Subject: sklearn's score_sample returns log(density) --- src/python/gudhi/clustering/tomato.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 000fdf3d..6906c5bb 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -54,8 +54,8 @@ class Tomato: input_type (str): 'points', 'distance_matrix' or 'neighbors'. metric (None|Callable): If None, use Minkowski of parameter p. graph_type (str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'. - density_type (str): 'manual', 'DTM', 'logDTM' or 'kde'. - kde_params (dict): if density_type is 'kde', additional parameters passed directly to sklearn.neighbors.KernelDensity. + density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. + kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity. k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. r (float): size of a neighborhood if graph_type is 'radius' @@ -280,13 +280,14 @@ class Tomato: else: weights = -numpy.log(weights) - if self.density_type_ == "kde": + if self.density_type_ == "KDE" or self.density_type_ == "logKDE": # FIXME: replace most assert with raise ValueError("blabla") assert input_type == "points" kde_params = self.params_.get("kde_params", dict()) from sklearn.neighbors import KernelDensity - weights = KernelDensity(**kde_params).fit(X).score_samples(X) + if self.density_type_ == "KDE": + weights = numpy.exp(weights) if self.params_.get("symmetrize_graph"): self.neighbors_ = [set(line) for line in self.neighbors_] -- cgit v1.2.3 From f0d24aa8d75b71e7b8771b4271b4ef6a9e296d4a Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 17 Mar 2020 10:56:12 +0100 Subject: doc default n_clusters --- src/python/gudhi/clustering/tomato.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 6906c5bb..7b319b5c 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -58,10 +58,10 @@ class Tomato: kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity. k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. - r (float): size of a neighborhood if graph_type is 'radius' - eps (float): approximation factor when computing nearest neighbors (currently ignored with a GPU) + r (float): size of a neighborhood if graph_type is 'radius'. + eps (float): approximation factor when computing nearest neighbors (currently ignored with a GPU). gpu (bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million). - n_clusters (int): number of clusters requested. Defaults to ??? + n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get the maximal number of clusters. merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. @@ -280,7 +280,7 @@ class Tomato: else: weights = -numpy.log(weights) - if self.density_type_ == "KDE" or self.density_type_ == "logKDE": + if self.density_type_ in {"KDE", "logKDE"}: # FIXME: replace most assert with raise ValueError("blabla") assert input_type == "points" kde_params = self.params_.get("kde_params", dict()) -- cgit v1.2.3 From 5f9a63e440a7ffdcd9fde689b6eb3b3a6b76b9bd Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 21 Apr 2020 15:24:41 +0200 Subject: Compute prominence, not its opposite... --- src/python/gudhi/clustering/tomato.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 7b319b5c..f25dacd5 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -305,7 +305,7 @@ class Tomato: if self.__merge_threshold: assert not self.__n_clusters self.__n_clusters = numpy.count_nonzero( - self.diagram_[:, 1] - self.diagram_[:, 0] > self.__merge_threshold + self.diagram_[:, 0] - self.diagram_[:, 1] > self.__merge_threshold ) + len(self.max_density_per_cc_) if self.__n_clusters: renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) @@ -364,7 +364,7 @@ class Tomato: if merge_threshold == self.__merge_threshold: return if hasattr(self, "leaf_labels_"): - self.n_clusters_ = numpy.count_nonzero(self.diagram_[:, 1] - self.diagram_[:, 0] > merge_threshold) + len( + self.n_clusters_ = numpy.count_nonzero(self.diagram_[:, 0] - self.diagram_[:, 1] > merge_threshold) + len( self.max_density_per_cc_ ) else: -- cgit v1.2.3 From 6b387c03293ff90bf117f06a794c7d440848a9c4 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 28 Apr 2020 20:33:20 +0200 Subject: plot_diagram when there are no finite points --- src/python/gudhi/clustering/tomato.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index f25dacd5..5f1f8e24 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -327,9 +327,20 @@ class Tomato: """ import matplotlib.pyplot as plt - plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro") - l = self.diagram_[:, 1].min() - r = max(self.diagram_[:, 0].max(), self.max_density_per_cc_.max()) + if self.diagram_.size > 0: + plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro") + l = self.diagram_[:, 1].min() + r = max(self.diagram_[:, 0].max(), self.max_density_per_cc_.max()) + else: + l = self.max_density_per_cc_.min() + r = self.max_density_per_cc_.max() + if l == r: + if l > 0: + l, r = .9 * l, 1.1 * r + elif l < 0: + l, r = 1.1 * l, .9 * r + else: + l, r = -1., 1. plt.plot([l, r], [l, r]) plt.plot( self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green" -- cgit v1.2.3 From b84b3f006805eb69e03983301a550ddcb8050769 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 28 Apr 2020 20:55:57 +0200 Subject: Always save points_ --- src/python/gudhi/clustering/tomato.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 5f1f8e24..07006d7c 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -98,6 +98,8 @@ class Tomato: raise ValueError("If density_type is 'manual', you must provide weights to fit()") input_type = self.input_type_ + if input_type == "points": + self.points_ = X if input_type == "points" and self.metric_: from sklearn.metrics import pairwise_distances @@ -118,7 +120,6 @@ class Tomato: assert density_type == "manual" if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: - self.points_ = X q = self.params_.get("p_DTM", len(X[0])) p = self.params_.get("p", 2) k = self.params_.get("k", 10) @@ -189,7 +190,6 @@ class Tomato: weights = -numpy.log(weights) if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}: - self.points_ = X p = self.params_.get("p", 2) k = self.params_.get("k", 10) if self.params_.get("gpu"): @@ -218,7 +218,6 @@ class Tomato: _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs) if input_type == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: - self.points_ = X q = self.params_.get("p_DTM", len(X[0])) p = self.params_.get("p", 2) k = self.params_.get("k", 10) @@ -275,7 +274,10 @@ class Tomato: k = self.params_["k"] weights = (numpy.partition(X, k - 1)[:, 0:k] ** q).sum(-1) if self.density_type_ == "DTM": - dim = len(self.points_[0]) + try: + dim = len(self.points_[0]) + except AttributeError: + dim = 2 weights = weights ** (-dim / q) else: weights = -numpy.log(weights) -- cgit v1.2.3 From 4c9c01ab794901210859e299c3528be8f26f5f27 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 28 Apr 2020 21:04:01 +0200 Subject: Unbreak KDE --- src/python/gudhi/clustering/tomato.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 07006d7c..88a1a34d 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -229,7 +229,6 @@ class Tomato: # 'float64' is slow except on super expensive GPUs. Allow it with some param? XX = torch.tensor(self.points_, dtype=torch.float32) if p == numpy.inf: - assert False # Not supported??? dd = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).Kmin(k_DTM, dim=1) elif p == 2: # Any even integer? dd = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).Kmin(k_DTM, dim=1) @@ -284,10 +283,10 @@ class Tomato: if self.density_type_ in {"KDE", "logKDE"}: # FIXME: replace most assert with raise ValueError("blabla") - assert input_type == "points" + # assert input_type == "points" kde_params = self.params_.get("kde_params", dict()) from sklearn.neighbors import KernelDensity - weights = KernelDensity(**kde_params).fit(X).score_samples(X) + weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": weights = numpy.exp(weights) -- cgit v1.2.3 From 8ba3ca48e03e379fca0a0b68a508d8357a367f52 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 5 May 2020 20:58:54 +0200 Subject: Fix type of neighbors for "radius" --- src/python/gudhi/clustering/tomato.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 88a1a34d..1a2887bc 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -109,7 +109,7 @@ class Tomato: if input_type == "distance_matrix" and self.graph_type_ == "radius": X = numpy.array(X) r = self.params_["r"] - self.neighbors_ = [numpy.nonzero(l <= r) for l in X] + self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] if input_type == "distance_matrix" and self.graph_type_ == "knn": k = self.params_["k"] -- cgit v1.2.3 From 05415ffc3e1e8b00623de2088093d84e3030b0f1 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 16 May 2020 11:44:46 +0200 Subject: Redundant assert --- src/python/gudhi/clustering/tomato.py | 1 - 1 file changed, 1 deletion(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 1a2887bc..0a2d562b 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -93,7 +93,6 @@ class Tomato: density_type = "manual" else: density_type = self.density_type_ - assert density_type != "manual" if density_type == "manual": raise ValueError("If density_type is 'manual', you must provide weights to fit()") -- cgit v1.2.3 From 2b896ce68eb5cf99d698313ca0e9eea3b35a19c6 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 18 May 2020 22:15:07 +0200 Subject: Start refactoring of Tomato --- src/python/gudhi/clustering/tomato.py | 48 ++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 0a2d562b..fa462f8f 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -1,4 +1,6 @@ import numpy +from ..point_cloud.knn import KNearestNeighbors +from ..point_cloud.dtm import DTMDensity from ._tomato import * # The fit/predict interface is not so well suited... @@ -37,6 +39,7 @@ class Tomato: def __init__( self, + # FIXME: fold input_type into metric input_type="points", metric=None, graph_type="knn", @@ -65,7 +68,8 @@ class Tomato: merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. - p_DTM (float): order used to compute the distance to measure. Defaults to the dimension, or 2 if input_type is 'distance_matrix'. + q (float): order used to compute the distance to measure. Defaults to dim. Beware that when the dimension is large, this can easily cause overflows. + dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the dimension, or 2 when the dimension cannot be read from the input (metric is "neighbors" or "precomputed"). n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. """ # Should metric='precomputed' mean input_type='distance_matrix'? @@ -99,12 +103,48 @@ class Tomato: input_type = self.input_type_ if input_type == "points": self.points_ = X + + # FIXME: restrict this strongly if input_type == "points" and self.metric_: from sklearn.metrics import pairwise_distances X = pairwise_distances(X, metric=self.metric_, n_jobs=self.params_.get("n_jobs")) input_type = "distance_matrix" + need_knn = 0 + need_knn_ngb = False + need_knn_dist = False + if self.graph_type_ == "knn": + k_graph = self.params_["k"] + need_knn = k_graph + need_knn_ngb = True + if elf.density_type_ in ["DTM", "logDTM"]: + k = self.params_.get("k", 10) # FIXME: What if X has fewer than 10 points? + k_DTM = self.params_.get("k_DTM", k) + need_knn = max(need_knn, k_DTM) + need_knn_dist = True + # if we ask for more neighbors for the graph than the DTM, getting the distances is a slight waste, + # but it looks negligible + if need_knn > 0: + knn = KNearestNeighbors(need_knn, return_index=need_knn_ngb, return_distance=need_knn_dist, **self.params_).fit_transform(X) + if need_knn_ngb: + if need_knn_dist: + knn_ngb = knn[0][:, 0:k_graph] + knn_dist = knn[1] + else: + knn_ngb = knn + if need_knn_dist: + knn_dist = knn + if self.density_type_ in ["DTM", "logDTM"]: + if metric in ["neighbors", "precomputed"]: + dim = self.params_.get("dim", 2) + else: + dim = len(X) + q = self.params_.get("q", dim) + weights = DTMDensity(k=k_DTM, metric="neighbors", dim=dim, q=q).fit_transform(knn_dist) + if self.density_type_ == "logDTM": + weights = numpy.log(weights) + if input_type == "distance_matrix" and self.graph_type_ == "radius": X = numpy.array(X) r = self.params_["r"] @@ -119,7 +159,7 @@ class Tomato: assert density_type == "manual" if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: - q = self.params_.get("p_DTM", len(X[0])) + q = self.params_.get("q", len(X[0])) p = self.params_.get("p", 2) k = self.params_.get("k", 10) k_DTM = self.params_.get("k_DTM", k) @@ -217,7 +257,7 @@ class Tomato: _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs) if input_type == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: - q = self.params_.get("p_DTM", len(X[0])) + q = self.params_.get("q", len(X[0])) p = self.params_.get("p", 2) k = self.params_.get("k", 10) k_DTM = self.params_.get("k_DTM", k) @@ -265,7 +305,7 @@ class Tomato: weights = -numpy.log(weights) if input_type == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}: - q = self.params_.get("p_DTM", 2) + q = self.params_.get("q", 2) X = numpy.array(X) k = self.params_.get("k_DTM") if not k: -- cgit v1.2.3 From fe8c35e2716eca9d88e33133c64f5c2121535539 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 19 May 2020 17:33:52 +0200 Subject: continue --- src/python/gudhi/clustering/tomato.py | 171 +--------------------------------- 1 file changed, 3 insertions(+), 168 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index fa462f8f..a5e8bcf5 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -129,10 +129,10 @@ class Tomato: knn = KNearestNeighbors(need_knn, return_index=need_knn_ngb, return_distance=need_knn_dist, **self.params_).fit_transform(X) if need_knn_ngb: if need_knn_dist: - knn_ngb = knn[0][:, 0:k_graph] + self.neighbors_ = knn[0][:, 0:k_graph] knn_dist = knn[1] else: - knn_ngb = knn + self.neighbors_ = knn if need_knn_dist: knn_dist = knn if self.density_type_ in ["DTM", "logDTM"]: @@ -146,180 +146,15 @@ class Tomato: weights = numpy.log(weights) if input_type == "distance_matrix" and self.graph_type_ == "radius": + # TODO: parallelize X = numpy.array(X) r = self.params_["r"] self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] - if input_type == "distance_matrix" and self.graph_type_ == "knn": - k = self.params_["k"] - self.neighbors_ = numpy.argpartition(X, k - 1)[:, 0:k] - if input_type == "neighbors": self.neighbors_ = X assert density_type == "manual" - if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: - q = self.params_.get("q", len(X[0])) - p = self.params_.get("p", 2) - k = self.params_.get("k", 10) - k_DTM = self.params_.get("k_DTM", k) - kmax = max(k, k_DTM) - if self.params_.get("gpu"): - import torch - from pykeops.torch import LazyTensor - - # 'float64' is slow except on super expensive GPUs. Allow it with some param? - XX = torch.tensor(self.points_, dtype=torch.float32) - if p == numpy.inf: - # Requires a version of pykeops strictly more recent than 1.3 - dd, nn = ( - (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) - .abs() - .max(-1) - .Kmin_argKmin(kmax, dim=1) - ) - elif p == 2: # Any even integer? - dd, nn = ( - ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p) - .sum(-1) - .Kmin_argKmin(kmax, dim=1) - ) - else: - dd, nn = ( - ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p) - .sum(-1) - .Kmin_argKmin(kmax, dim=1) - ) - if k < kmax: - nn = nn[:, 0:k] - if k_DTM < kmax: - dd = dd[:, 0:k_DTM] - assert q != numpy.inf # for now - if p != numpy.inf: - qp = float(q) / p - else: - qp = q - if qp != 1: - dd = dd ** qp - weights = dd.sum(-1) - - # Back to the CPU. Not sure this is necessary, or the right way to do it. - weights = numpy.array(weights) - self.neighbors_ = numpy.array(nn) - else: # CPU - from scipy.spatial import cKDTree - - kdtree = cKDTree(self.points_) - qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} - dd, self.neighbors_ = kdtree.query(self.points_, k=kmax, p=p, **qargs) - if k < kmax: - self.neighbors_ = self.neighbors_[:, 0:k] - if k_DTM < kmax: - dd = dd[:, 0:k_DTM] - # weights = numpy.linalg.norm(dd, axis=1, ord=q) - weights = (dd ** q).sum(-1) - - if self.density_type_ == "DTM": - # We ignore constant factors, which don't matter for - # clustering, although they do change thresholds - dim = len(self.points_[0]) - weights = weights ** (-dim / q) - else: - # We ignore exponents, which become constant factors with log - weights = -numpy.log(weights) - - if input_type == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}: - p = self.params_.get("p", 2) - k = self.params_.get("k", 10) - if self.params_.get("gpu"): - import torch - from pykeops.torch import LazyTensor - - # 'float64' is slow except on super expensive GPUs. Allow it with some param? - XX = torch.tensor(self.points_, dtype=torch.float32) - if p == numpy.inf: - # Requires a version of pykeops strictly more recent than 1.3 - nn = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).argKmin(k, dim=1) - elif p == 2: # Any even integer? - nn = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).argKmin(k, dim=1) - else: - nn = ( - ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p).sum(-1).argKmin(k, dim=1) - ) - # Back to the CPU. Not sure this is necessary, or the right way to do it. - self.neighbors_ = numpy.array(nn) - else: # CPU - from scipy.spatial import cKDTree - - kdtree = cKDTree(self.points_) - # FIXME 'p' - qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} - _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs) - - if input_type == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: - q = self.params_.get("q", len(X[0])) - p = self.params_.get("p", 2) - k = self.params_.get("k", 10) - k_DTM = self.params_.get("k_DTM", k) - if self.params_.get("gpu"): - import torch - from pykeops.torch import LazyTensor - - # 'float64' is slow except on super expensive GPUs. Allow it with some param? - XX = torch.tensor(self.points_, dtype=torch.float32) - if p == numpy.inf: - dd = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).Kmin(k_DTM, dim=1) - elif p == 2: # Any even integer? - dd = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).Kmin(k_DTM, dim=1) - else: - dd = ( - ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p) - .sum(-1) - .Kmin(k_DTM, dim=1) - ) - assert q != numpy.inf # for now - if p != numpy.inf: - qp = float(q) / p - else: - qp = q - if qp != 1: - dd = dd ** qp - weights = dd.sum(-1) - # **1/q is a waste of time, whether we take another **-.25 or a log - - # Back to the CPU. Not sure this is necessary, or the right way to do it. - weights = numpy.array(weights) - else: # CPU - from scipy.spatial import cKDTree - - kdtree = cKDTree(self.points_) - qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} - dd, _ = kdtree.query(self.points_, k=k_DTM, p=p, **qargs) - # weights = numpy.linalg.norm(dd, axis=1, ord=q) - weights = (dd ** q).sum(-1) - - if self.density_type_ == "DTM": - dim = len(self.points_[0]) - weights = weights ** (-dim / q) - else: - weights = -numpy.log(weights) - - if input_type == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}: - q = self.params_.get("q", 2) - X = numpy.array(X) - k = self.params_.get("k_DTM") - if not k: - k = self.params_["k"] - weights = (numpy.partition(X, k - 1)[:, 0:k] ** q).sum(-1) - if self.density_type_ == "DTM": - try: - dim = len(self.points_[0]) - except AttributeError: - dim = 2 - weights = weights ** (-dim / q) - else: - weights = -numpy.log(weights) - if self.density_type_ in {"KDE", "logKDE"}: # FIXME: replace most assert with raise ValueError("blabla") # assert input_type == "points" -- cgit v1.2.3 From bee990ed74dec6d3257b89e1fcceb24b089d5c00 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 19 May 2020 20:16:32 +0200 Subject: long line --- src/python/gudhi/point_cloud/dtm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index d836c28d..55ac58e6 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -95,8 +95,9 @@ class DTMDensity: Only available for the Euclidean metric, defaults to False. n_samples (int): number of sample points used for fitting. Only needed if `normalize` is True and metric is "neighbors". - kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that metric="neighbors" means that - :func:`transform` expects an array with the distances to the k nearest neighbors. + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that + metric="neighbors" means that :func:`transform` expects an array with the distances to + the k nearest neighbors. """ if weights is None: self.k = k -- cgit v1.2.3 From 8fff4339aab542c29e8672c720152198c6647615 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 21 May 2020 10:25:09 +0200 Subject: TODO --- src/python/gudhi/clustering/tomato.py | 1 + 1 file changed, 1 insertion(+) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index a5e8bcf5..5bfb9f68 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -164,6 +164,7 @@ class Tomato: if self.density_type_ == "KDE": weights = numpy.exp(weights) + # TODO: do it at the C++ level and/or in parallel if this is too slow if self.params_.get("symmetrize_graph"): self.neighbors_ = [set(line) for line in self.neighbors_] for i, line in enumerate(self.neighbors_): -- cgit v1.2.3 From 8cd34b4f0a6f5ffe8cfc16d2bb5856e5f6400216 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 21 May 2020 20:00:33 +0200 Subject: Random fixes --- src/python/gudhi/clustering/tomato.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 5bfb9f68..75296ab3 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -118,7 +118,7 @@ class Tomato: k_graph = self.params_["k"] need_knn = k_graph need_knn_ngb = True - if elf.density_type_ in ["DTM", "logDTM"]: + if self.density_type_ in ["DTM", "logDTM"]: k = self.params_.get("k", 10) # FIXME: What if X has fewer than 10 points? k_DTM = self.params_.get("k_DTM", k) need_knn = max(need_knn, k_DTM) @@ -126,20 +126,22 @@ class Tomato: # if we ask for more neighbors for the graph than the DTM, getting the distances is a slight waste, # but it looks negligible if need_knn > 0: - knn = KNearestNeighbors(need_knn, return_index=need_knn_ngb, return_distance=need_knn_dist, **self.params_).fit_transform(X) + knn_args = dict(self.params_) + knn_args["k"] = need_knn + knn = KNearestNeighbors(return_index=need_knn_ngb, return_distance=need_knn_dist, **knn_args).fit_transform(X) if need_knn_ngb: if need_knn_dist: self.neighbors_ = knn[0][:, 0:k_graph] knn_dist = knn[1] else: self.neighbors_ = knn - if need_knn_dist: + elif need_knn_dist: knn_dist = knn if self.density_type_ in ["DTM", "logDTM"]: - if metric in ["neighbors", "precomputed"]: + if self.metric_ in ["neighbors", "precomputed"]: dim = self.params_.get("dim", 2) else: - dim = len(X) + dim = len(X) # FIXME for distance matrix q = self.params_.get("q", dim) weights = DTMDensity(k=k_DTM, metric="neighbors", dim=dim, q=q).fit_transform(knn_dist) if self.density_type_ == "logDTM": -- cgit v1.2.3 From cdc289bb4b95943a01d9dc787255b5c3b8bd79ae Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Thu, 21 May 2020 23:50:12 +0200 Subject: no input_type --- src/python/gudhi/clustering/tomato.py | 74 +++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 30 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 75296ab3..a05b9d32 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -30,7 +30,7 @@ class Tomato: children_: ndarray of shape (n_leaves_-1,2) The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i params_: dict - Parameters like input_type, metric, etc + Parameters like metric, etc """ # Not documented for now, because I am not sure how useful it is. @@ -39,9 +39,7 @@ class Tomato: def __init__( self, - # FIXME: fold input_type into metric - input_type="points", - metric=None, + metric="minkowski", graph_type="knn", density_type="logDTM", n_clusters=None, @@ -54,27 +52,25 @@ class Tomato: Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later. Args: - input_type (str): 'points', 'distance_matrix' or 'neighbors'. - metric (None|Callable): If None, use Minkowski of parameter p. - graph_type (str): 'manual', 'knn' or 'radius'. Ignored if input_type is 'neighbors'. + metric (str|Callable): Describes how to interpret the argument of `fit()`. Defaults to Minkowski of parameter p. + graph_type (str): 'manual', 'knn' or 'radius'. Ignored if metric is 'neighbors'. density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity. k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. r (float): size of a neighborhood if graph_type is 'radius'. - eps (float): approximation factor when computing nearest neighbors (currently ignored with a GPU). - gpu (bool): enable use of CUDA (through pykeops) to compute k nearest neighbors. This is useful when the dimension becomes large (10+) but the number of points remains low (less than a million). + eps (float): approximation factor when computing neighbors with a kd-tree (ignored in many cases). n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get the maximal number of clusters. merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. - p (float): norm L^p on input points (numpy.inf is supported without gpu). Defaults to 2. + p (float): norm L^p on input points. Defaults to 2. q (float): order used to compute the distance to measure. Defaults to dim. Beware that when the dimension is large, this can easily cause overflows. dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the dimension, or 2 when the dimension cannot be read from the input (metric is "neighbors" or "precomputed"). - n_jobs (int): Number of jobs to schedule for parallel processing of nearest neighbors on the CPU. If -1 is given all processors are used. Default: 1. + n_jobs (int): Number of jobs to schedule for parallel processing on the CPU. If -1 is given all processors are used. Default: 1. + params: extra parameters are passed to :class:`~gudhi.point_cloud.knn.KNearestNeighbors` and :class:`~gudhi.point_cloud.dtm.DTMDensity`. """ # Should metric='precomputed' mean input_type='distance_matrix'? # Should we be able to pass metric='minkowski' (what None does currently)? - self.input_type_ = input_type self.metric_ = metric self.graph_type_ = graph_type self.density_type_ = density_type @@ -86,9 +82,10 @@ class Tomato: raise ValueError("Cannot specify both a merge threshold and a number of clusters") def fit(self, X, y=None, weights=None): + #FIXME: Iterable -> Sequence? """ Args: - X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance_matrix (full, not just a triangle), or list of neighbors for each point (points are represented by their index, starting from 0), according to input_type. + X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if metric is "neighbors". weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point """ # TODO: First detect if this is a new call with the same data (only threshold changed?) @@ -100,22 +97,22 @@ class Tomato: if density_type == "manual": raise ValueError("If density_type is 'manual', you must provide weights to fit()") - input_type = self.input_type_ - if input_type == "points": + metric = self.metric_ + if metric not in ["precomputed", "neighbors"]: self.points_ = X # FIXME: restrict this strongly - if input_type == "points" and self.metric_: + if metric not in ["precomputed", "neighbors", "minkowski", "euclidean"]: from sklearn.metrics import pairwise_distances - X = pairwise_distances(X, metric=self.metric_, n_jobs=self.params_.get("n_jobs")) - input_type = "distance_matrix" + X = pairwise_distances(X, metric=metric, n_jobs=self.params_.get("n_jobs")) + metric = "precomputed" need_knn = 0 need_knn_ngb = False need_knn_dist = False if self.graph_type_ == "knn": - k_graph = self.params_["k"] + k_graph = self.params_.get("k", 10) # FIXME: What if X has fewer than 10 points? need_knn = k_graph need_knn_ngb = True if self.density_type_ in ["DTM", "logDTM"]: @@ -138,35 +135,53 @@ class Tomato: elif need_knn_dist: knn_dist = knn if self.density_type_ in ["DTM", "logDTM"]: - if self.metric_ in ["neighbors", "precomputed"]: + if metric in ["neighbors", "precomputed"]: dim = self.params_.get("dim", 2) else: - dim = len(X) # FIXME for distance matrix + dim = len(X) q = self.params_.get("q", dim) weights = DTMDensity(k=k_DTM, metric="neighbors", dim=dim, q=q).fit_transform(knn_dist) if self.density_type_ == "logDTM": weights = numpy.log(weights) - if input_type == "distance_matrix" and self.graph_type_ == "radius": - # TODO: parallelize - X = numpy.array(X) + if metric == "precomputed" and self.graph_type_ == "radius": + # TODO: parallelize? + X = numpy.asarray(X) r = self.params_["r"] self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] - if input_type == "neighbors": + if self.graph_type_ == "radius" and metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]: + from scipy.spatial import cKDTree + tree t = cKDTree(X) + # TODO: handle "l1" and "l2" aliases? + p = self.params_.get("p") + if metric == "euclidean": + assert p is None or p == 2, "p=" + str(p) + " is not consistent with metric='euclidean'" + p = 2 + elif metric == "manhattan": + assert p is None or p == 1, "p=" + str(p) + " is not consistent with metric='manhattan'" + p = 1 + elif metric == "chebyshev": + assert p is None or p == numpy.inf, "p=" + str(p) + " is not consistent with metric='chebyshev'" + p = numpy.inf + elif p is None: + p = 2 # the default + eps = self.params_.get("eps", 0) + self.neighbors_ = t.query_ball_tree(t, r=self.params_["r"], p=p, eps=eps) + + if metric == "neighbors": self.neighbors_ = X assert density_type == "manual" if self.density_type_ in {"KDE", "logKDE"}: - # FIXME: replace most assert with raise ValueError("blabla") - # assert input_type == "points" + assert metric not in ["neighbors", "precomputed"], "Scikit-learn's KernelDensity requires point coordinates" kde_params = self.params_.get("kde_params", dict()) from sklearn.neighbors import KernelDensity weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": weights = numpy.exp(weights) - # TODO: do it at the C++ level and/or in parallel if this is too slow + # TODO: do it at the C++ level and/or in parallel if this is too slow? if self.params_.get("symmetrize_graph"): self.neighbors_ = [set(line) for line in self.neighbors_] for i, line in enumerate(self.neighbors_): @@ -267,8 +282,7 @@ if __name__ == "__main__": a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)] a = numpy.random.rand(500, 2) t = Tomato( - input_type="points", - metric="Euclidean", + metric="euclidean", graph_type="knn", density_type="DTM", n_clusters=2, -- cgit v1.2.3 From 753198e6d70e761df0c92617dfef7b338de4ba82 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sat, 23 May 2020 23:53:13 +0200 Subject: Remove metric="neighbors" --- src/python/gudhi/clustering/tomato.py | 87 ++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index a05b9d32..1c586f4f 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -39,9 +39,9 @@ class Tomato: def __init__( self, - metric="minkowski", graph_type="knn", density_type="logDTM", + metric="minkowski", n_clusters=None, merge_threshold=None, # eliminate_threshold=None, @@ -53,19 +53,19 @@ class Tomato: Args: metric (str|Callable): Describes how to interpret the argument of `fit()`. Defaults to Minkowski of parameter p. - graph_type (str): 'manual', 'knn' or 'radius'. Ignored if metric is 'neighbors'. + graph_type (str): 'manual', 'knn' or 'radius'. density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity. k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. r (float): size of a neighborhood if graph_type is 'radius'. - eps (float): approximation factor when computing neighbors with a kd-tree (ignored in many cases). + eps (float): (1+eps) approximation factor when computing distances (ignored in many cases). n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get the maximal number of clusters. merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. p (float): norm L^p on input points. Defaults to 2. q (float): order used to compute the distance to measure. Defaults to dim. Beware that when the dimension is large, this can easily cause overflows. - dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the dimension, or 2 when the dimension cannot be read from the input (metric is "neighbors" or "precomputed"). + dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the dimension, or 2 when the dimension cannot be read from the input (metric is "precomputed"). n_jobs (int): Number of jobs to schedule for parallel processing on the CPU. If -1 is given all processors are used. Default: 1. params: extra parameters are passed to :class:`~gudhi.point_cloud.knn.KNearestNeighbors` and :class:`~gudhi.point_cloud.dtm.DTMDensity`. """ @@ -85,7 +85,7 @@ class Tomato: #FIXME: Iterable -> Sequence? """ Args: - X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if metric is "neighbors". + X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if graph_type is "manual". weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point """ # TODO: First detect if this is a new call with the same data (only threshold changed?) @@ -97,17 +97,16 @@ class Tomato: if density_type == "manual": raise ValueError("If density_type is 'manual', you must provide weights to fit()") - metric = self.metric_ - if metric not in ["precomputed", "neighbors"]: - self.points_ = X - - # FIXME: restrict this strongly - if metric not in ["precomputed", "neighbors", "minkowski", "euclidean"]: - from sklearn.metrics import pairwise_distances - - X = pairwise_distances(X, metric=metric, n_jobs=self.params_.get("n_jobs")) - metric = "precomputed" + if self.graph_type_ == "manual": + self.neighbors_ = X + # FIXME: uniformize "message 'option'" vs 'message "option"' + assert density_type == "manual", 'If graph_type is "manual", density_type must be as well' + else: + metric = self.metric_ + if metric != "precomputed": + self.points_ = X + # Slight complication to avoid computing knn twice. need_knn = 0 need_knn_ngb = False need_knn_dist = False @@ -135,7 +134,7 @@ class Tomato: elif need_knn_dist: knn_dist = knn if self.density_type_ in ["DTM", "logDTM"]: - if metric in ["neighbors", "precomputed"]: + if metric == "precomputed": dim = self.params_.get("dim", 2) else: dim = len(X) @@ -144,34 +143,37 @@ class Tomato: if self.density_type_ == "logDTM": weights = numpy.log(weights) - if metric == "precomputed" and self.graph_type_ == "radius": - # TODO: parallelize? - X = numpy.asarray(X) - r = self.params_["r"] - self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] + if self.graph_type_ == "radius": + if metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]: + from scipy.spatial import cKDTree + tree t = cKDTree(X) + # TODO: handle "l1" and "l2" aliases? + p = self.params_.get("p") + if metric == "euclidean": + assert p is None or p == 2, "p=" + str(p) + " is not consistent with metric='euclidean'" + p = 2 + elif metric == "manhattan": + assert p is None or p == 1, "p=" + str(p) + " is not consistent with metric='manhattan'" + p = 1 + elif metric == "chebyshev": + assert p is None or p == numpy.inf, "p=" + str(p) + " is not consistent with metric='chebyshev'" + p = numpy.inf + elif p is None: + p = 2 # the default + eps = self.params_.get("eps", 0) + self.neighbors_ = t.query_ball_tree(t, r=self.params_["r"], p=p, eps=eps) - if self.graph_type_ == "radius" and metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]: - from scipy.spatial import cKDTree - tree t = cKDTree(X) - # TODO: handle "l1" and "l2" aliases? - p = self.params_.get("p") - if metric == "euclidean": - assert p is None or p == 2, "p=" + str(p) + " is not consistent with metric='euclidean'" - p = 2 - elif metric == "manhattan": - assert p is None or p == 1, "p=" + str(p) + " is not consistent with metric='manhattan'" - p = 1 - elif metric == "chebyshev": - assert p is None or p == numpy.inf, "p=" + str(p) + " is not consistent with metric='chebyshev'" - p = numpy.inf - elif p is None: - p = 2 # the default - eps = self.params_.get("eps", 0) - self.neighbors_ = t.query_ball_tree(t, r=self.params_["r"], p=p, eps=eps) + elif metric != "precomputed": + from sklearn.metrics import pairwise_distances - if metric == "neighbors": - self.neighbors_ = X - assert density_type == "manual" + X = pairwise_distances(X, metric=metric, n_jobs=self.params_.get("n_jobs")) + metric = "precomputed" + + if metric == "precomputed": + # TODO: parallelize? May not be worth it. + X = numpy.asarray(X) + r = self.params_["r"] + self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] if self.density_type_ in {"KDE", "logKDE"}: assert metric not in ["neighbors", "precomputed"], "Scikit-learn's KernelDensity requires point coordinates" @@ -201,6 +203,7 @@ class Tomato: self.diagram_[:, 0] - self.diagram_[:, 1] > self.__merge_threshold ) + len(self.max_density_per_cc_) if self.__n_clusters: + # TODO: set corresponding merge_threshold? renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] else: -- cgit v1.2.3 From 9ff9055a93b5bc5c402519bd0bc8c85bf97d6d84 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Sun, 24 May 2020 22:40:44 +0200 Subject: Move metric to params --- src/python/gudhi/clustering/tomato.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 1c586f4f..29f30481 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -41,7 +41,6 @@ class Tomato: self, graph_type="knn", density_type="logDTM", - metric="minkowski", n_clusters=None, merge_threshold=None, # eliminate_threshold=None, @@ -52,9 +51,9 @@ class Tomato: Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later. Args: - metric (str|Callable): Describes how to interpret the argument of `fit()`. Defaults to Minkowski of parameter p. graph_type (str): 'manual', 'knn' or 'radius'. density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. + metric (str|Callable): metric used when calculating the distance between instances in a feature array. Defaults to Minkowski of parameter p. kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity. k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. @@ -71,7 +70,6 @@ class Tomato: """ # Should metric='precomputed' mean input_type='distance_matrix'? # Should we be able to pass metric='minkowski' (what None does currently)? - self.metric_ = metric self.graph_type_ = graph_type self.density_type_ = density_type self.params_ = params @@ -102,7 +100,7 @@ class Tomato: # FIXME: uniformize "message 'option'" vs 'message "option"' assert density_type == "manual", 'If graph_type is "manual", density_type must be as well' else: - metric = self.metric_ + metric = self.params_.get("metric", "minkowski") if metric != "precomputed": self.points_ = X @@ -163,6 +161,7 @@ class Tomato: eps = self.params_.get("eps", 0) self.neighbors_ = t.query_ball_tree(t, r=self.params_["r"], p=p, eps=eps) + # TODO: sklearn's NearestNeighbors can handle more metrics efficiently via its BallTree elif metric != "precomputed": from sklearn.metrics import pairwise_distances @@ -176,8 +175,9 @@ class Tomato: self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] if self.density_type_ in {"KDE", "logKDE"}: - assert metric not in ["neighbors", "precomputed"], "Scikit-learn's KernelDensity requires point coordinates" + assert graph_type != "manual" and metric != "precomputed", "Scikit-learn's KernelDensity requires point coordinates" kde_params = self.params_.get("kde_params", dict()) + kde_params.setdefault("metric", metric) from sklearn.neighbors import KernelDensity weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": -- cgit v1.2.3 From 5be0f973261ce3999097923b573bbf63ec3a08f0 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 17:16:06 +0200 Subject: comments --- src/python/gudhi/clustering/tomato.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 29f30481..18425700 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -161,7 +161,7 @@ class Tomato: eps = self.params_.get("eps", 0) self.neighbors_ = t.query_ball_tree(t, r=self.params_["r"], p=p, eps=eps) - # TODO: sklearn's NearestNeighbors can handle more metrics efficiently via its BallTree + # TODO: sklearn's NearestNeighbors.radius_neighbors can handle more metrics efficiently via its BallTree (don't bother with the _graph variant, it just calls radius_neighbors). elif metric != "precomputed": from sklearn.metrics import pairwise_distances @@ -230,6 +230,7 @@ class Tomato: else: l = self.max_density_per_cc_.min() r = self.max_density_per_cc_.max() + #FIXME: move this out of the else, for diagrams with one point on the diagonal and an infinite point with the same coordinate? if l == r: if l > 0: l, r = .9 * l, 1.1 * r -- cgit v1.2.3 From 8c122a8c92285dd89844720c9cf04d001db491d0 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 17:32:17 +0200 Subject: bugs --- src/python/gudhi/clustering/tomato.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 18425700..2b4d9242 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -132,10 +132,9 @@ class Tomato: elif need_knn_dist: knn_dist = knn if self.density_type_ in ["DTM", "logDTM"]: - if metric == "precomputed": - dim = self.params_.get("dim", 2) - else: - dim = len(X) + dim = self.params_.get("dim") + if dim is None: + dim = len(X[0]) if metric != "precomputed" else 2 q = self.params_.get("q", dim) weights = DTMDensity(k=k_DTM, metric="neighbors", dim=dim, q=q).fit_transform(knn_dist) if self.density_type_ == "logDTM": @@ -144,7 +143,7 @@ class Tomato: if self.graph_type_ == "radius": if metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]: from scipy.spatial import cKDTree - tree t = cKDTree(X) + tree = cKDTree(X) # TODO: handle "l1" and "l2" aliases? p = self.params_.get("p") if metric == "euclidean": @@ -159,7 +158,7 @@ class Tomato: elif p is None: p = 2 # the default eps = self.params_.get("eps", 0) - self.neighbors_ = t.query_ball_tree(t, r=self.params_["r"], p=p, eps=eps) + self.neighbors_ = tree.query_ball_tree(tree, r=self.params_["r"], p=p, eps=eps) # TODO: sklearn's NearestNeighbors.radius_neighbors can handle more metrics efficiently via its BallTree (don't bother with the _graph variant, it just calls radius_neighbors). elif metric != "precomputed": -- cgit v1.2.3 From ee56ee7814367c8b7437c8a8d9a0be32877c3196 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 17:45:22 +0200 Subject: plotting almost empty diagrams --- src/python/gudhi/clustering/tomato.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 2b4d9242..824b5544 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -222,21 +222,19 @@ class Tomato: """ import matplotlib.pyplot as plt + l = self.max_density_per_cc_.min() + r = self.max_density_per_cc_.max() if self.diagram_.size > 0: plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro") - l = self.diagram_[:, 1].min() - r = max(self.diagram_[:, 0].max(), self.max_density_per_cc_.max()) - else: - l = self.max_density_per_cc_.min() - r = self.max_density_per_cc_.max() - #FIXME: move this out of the else, for diagrams with one point on the diagonal and an infinite point with the same coordinate? - if l == r: - if l > 0: - l, r = .9 * l, 1.1 * r - elif l < 0: - l, r = 1.1 * l, .9 * r - else: - l, r = -1., 1. + l = min(l, self.diagram_[:, 1].min()) + r = max(r, self.diagram_[:, 0].max()) + if l == r: + if l > 0: + l, r = .9 * l, 1.1 * r + elif l < 0: + l, r = 1.1 * l, .9 * r + else: + l, r = -1., 1. plt.plot([l, r], [l, r]) plt.plot( self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green" -- cgit v1.2.3 From c6bc508fe2f101f37fb7e1a940f3869122f7da71 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 18:19:05 +0200 Subject: Use radius as default KDE bandwidth --- src/python/gudhi/clustering/tomato.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 824b5544..7e67c7fd 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -57,7 +57,7 @@ class Tomato: kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity. k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. - r (float): size of a neighborhood if graph_type is 'radius'. + r (float): size of a neighborhood if graph_type is 'radius'. Also used as default bandwidth in kde_params. eps (float): (1+eps) approximation factor when computing distances (ignored in many cases). n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get the maximal number of clusters. merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. @@ -174,9 +174,12 @@ class Tomato: self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] if self.density_type_ in {"KDE", "logKDE"}: - assert graph_type != "manual" and metric != "precomputed", "Scikit-learn's KernelDensity requires point coordinates" - kde_params = self.params_.get("kde_params", dict()) + assert self.graph_type_ != "manual" and metric != "precomputed", "Scikit-learn's KernelDensity requires point coordinates" + kde_params = dict(self.params_.get("kde_params", dict())) kde_params.setdefault("metric", metric) + r = self.params_.get("r") + if r is not None: + kde_params.setdefault("bandwidth", r) from sklearn.neighbors import KernelDensity weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": -- cgit v1.2.3 From fc7da6849c40cc0caef0e86e452f6d1e2c8320d0 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 18:30:56 +0200 Subject: handle n len(X): + k_graph = len(X) need_knn = k_graph need_knn_ngb = True if self.density_type_ in ["DTM", "logDTM"]: - k = self.params_.get("k", 10) # FIXME: What if X has fewer than 10 points? + k = self.params_.get("k", 10) k_DTM = self.params_.get("k_DTM", k) + # If X has fewer than k points... + if k_DTM > len(X): + k_DTM = len(X) need_knn = max(need_knn, k_DTM) need_knn_dist = True # if we ask for more neighbors for the graph than the DTM, getting the distances is a slight waste, -- cgit v1.2.3 From 87a142db9e133fbd8f08d9bcc70a51e2a907aa35 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 18:53:08 +0200 Subject: Document attribute weights_ --- src/python/doc/clustering.rst | 2 +- src/python/gudhi/clustering/tomato.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/doc/clustering.rst b/src/python/doc/clustering.rst index dc9de968..1e933dc9 100644 --- a/src/python/doc/clustering.rst +++ b/src/python/doc/clustering.rst @@ -55,7 +55,7 @@ Makes the number of clusters clearer, and changes a bit the shape of the cluster .. image:: img/spiral-diag2.png -A quick look at the corresponding density estimate (`weights_` is not officially supported) +A quick look at the corresponding density estimate .. code-block:: diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index fcb4b234..c4da9deb 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -4,8 +4,7 @@ from ..point_cloud.dtm import DTMDensity from ._tomato import * # The fit/predict interface is not so well suited... -# TODO: option for a faster, weaker (probabilistic) knn - +# FIXME: choose if they are called weight, density, filtration, etc and be consistent. class Tomato: """ @@ -21,14 +20,16 @@ class Tomato: minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts labels_. n_leaves_: int number of leaves (unstable clusters) in the hierarchical tree - leaf_labels_: ndarray of shape (n_samples) + leaf_labels_: ndarray of shape (n_samples,) cluster labels for each point, at the very bottom of the hierarchy - labels_: ndarray of shape (n_samples) + labels_: ndarray of shape (n_samples,) cluster labels for each point, after merging diagram_: ndarray of shape (n_leaves_,2) persistence diagram (only the finite points) children_: ndarray of shape (n_leaves_-1,2) The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i + weights_: ndarray of shape (n_samples,) + weights of the points, as computed by the density estimator or provided by the user params_: dict Parameters like metric, etc """ @@ -180,12 +181,14 @@ class Tomato: self.neighbors_ = [numpy.flatnonzero(l <= r) for l in X] if self.density_type_ in {"KDE", "logKDE"}: + # Slow... assert self.graph_type_ != "manual" and metric != "precomputed", "Scikit-learn's KernelDensity requires point coordinates" kde_params = dict(self.params_.get("kde_params", dict())) kde_params.setdefault("metric", metric) r = self.params_.get("r") if r is not None: kde_params.setdefault("bandwidth", r) + # Should we default rtol to eps? from sklearn.neighbors import KernelDensity weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": @@ -199,7 +202,7 @@ class Tomato: for j in line: self.neighbors_[j].add(i) - self.weights_ = weights # TODO remove + self.weights_ = weights self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = doit( list(self.neighbors_), weights ) -- cgit v1.2.3 From 9a7fba8b3dcfbd838ce2ea571fd4e8f06cd8a7bd Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 19:36:41 +0200 Subject: Rename and document max_weight_per_cc_ --- src/python/gudhi/clustering/tomato.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index c4da9deb..7e97819b 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -26,6 +26,8 @@ class Tomato: cluster labels for each point, after merging diagram_: ndarray of shape (n_leaves_,2) persistence diagram (only the finite points) + max_weight_per_cc_: ndarray of shape (n_connected_components,) + maximum of the density function on each connected component. This corresponds to the abscissa of infinite points in the diagram children_: ndarray of shape (n_leaves_-1,2) The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i weights_: ndarray of shape (n_samples,) @@ -34,10 +36,6 @@ class Tomato: Parameters like metric, etc """ - # Not documented for now, because I am not sure how useful it is. - # max_density_per_cc_: ndarray of shape (n_connected_components) - # maximum of the density function on each connected component - def __init__( self, graph_type="knn", @@ -203,16 +201,16 @@ class Tomato: self.neighbors_[j].add(i) self.weights_ = weights - self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = doit( + self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = doit( list(self.neighbors_), weights ) - self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_) - assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_) + self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_) + assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_) if self.__merge_threshold: assert not self.__n_clusters self.__n_clusters = numpy.count_nonzero( self.diagram_[:, 0] - self.diagram_[:, 1] > self.__merge_threshold - ) + len(self.max_density_per_cc_) + ) + len(self.max_weight_per_cc_) if self.__n_clusters: # TODO: set corresponding merge_threshold? renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) @@ -234,8 +232,8 @@ class Tomato: """ import matplotlib.pyplot as plt - l = self.max_density_per_cc_.min() - r = self.max_density_per_cc_.max() + l = self.max_weight_per_cc_.min() + r = self.max_weight_per_cc_.max() if self.diagram_.size > 0: plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro") l = min(l, self.diagram_[:, 1].min()) @@ -249,7 +247,7 @@ class Tomato: l, r = -1., 1. plt.plot([l, r], [l, r]) plt.plot( - self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green" + self.max_weight_per_cc_, numpy.full(self.max_weight_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green" ) plt.show() @@ -282,7 +280,7 @@ class Tomato: return if hasattr(self, "leaf_labels_"): self.n_clusters_ = numpy.count_nonzero(self.diagram_[:, 0] - self.diagram_[:, 1] > merge_threshold) + len( - self.max_density_per_cc_ + self.max_weight_per_cc_ ) else: self.__n_clusters = None @@ -310,7 +308,7 @@ if __name__ == "__main__": # print() # print("diagram\n",t.diagram_) # print() - print("max\n", t.max_density_per_cc_, file=sys.stderr) + print("max\n", t.max_weight_per_cc_, file=sys.stderr) # print() print("leaf labels\n", t.leaf_labels_) # print() -- cgit v1.2.3 From c6f519abe3d2fe424d9982ad8139ff2a86119bca Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 20:55:06 +0200 Subject: Make specifying an impossible number of clusters a warning --- src/python/gudhi/clustering/_tomato.cc | 18 ++++++++---------- src/python/gudhi/clustering/tomato.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index 87bd62e9..638e1259 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -195,21 +195,19 @@ auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& d } auto merge(py::array_t children, Cluster_index n_leaves, Cluster_index n_final) { - // Should this really be an error? - if (n_final > n_leaves) - throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters"); + if (n_final > n_leaves) { + std::cerr << "The number of clusters required " << n_final << " is larger than the number of mini-clusters " << n_leaves << '\n'; + n_final = n_leaves; // or return something special and let Tomato use leaf_labels_? + } py::buffer_info cbuf = children.request(); if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0)) throw std::runtime_error("internal error: children have to be (n,2) or empty"); const int n_merges = cbuf.shape[0]; Cluster_index* d = (Cluster_index*)cbuf.ptr; - // Should this really be an error? - // std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n'; - if (n_merges + n_final < n_leaves) - throw std::runtime_error(std::string("The number of clusters required ") + std::to_string(n_final) + - " is smaller than the number of connected components " + - std::to_string(n_leaves - n_merges)); - + if (n_merges + n_final < n_leaves) { + std::cerr << "The number of clusters required " << n_final << " is smaller than the number of connected components " << n_leaves - n_merges << '\n'; + n_final = n_leaves - n_merges; + } struct Dat { Cluster_index parent; int rank; diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 7e97819b..867c46a1 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -24,11 +24,11 @@ class Tomato: cluster labels for each point, at the very bottom of the hierarchy labels_: ndarray of shape (n_samples,) cluster labels for each point, after merging - diagram_: ndarray of shape (n_leaves_,2) + diagram_: ndarray of shape (n_leaves_, 2) persistence diagram (only the finite points) max_weight_per_cc_: ndarray of shape (n_connected_components,) maximum of the density function on each connected component. This corresponds to the abscissa of infinite points in the diagram - children_: ndarray of shape (n_leaves_-1,2) + children_: ndarray of shape (n_leaves_-n_connected_components, 2) The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i weights_: ndarray of shape (n_samples,) weights of the points, as computed by the density estimator or provided by the user @@ -206,6 +206,7 @@ class Tomato: ) self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_) + # TODO: deduplicate this code with the setters below if self.__merge_threshold: assert not self.__n_clusters self.__n_clusters = numpy.count_nonzero( @@ -215,6 +216,9 @@ class Tomato: # TODO: set corresponding merge_threshold? renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] + # In case the user asked for something impossible. + # TODO: check for impossible situations before calling merge. + self.__n_clusters = self.labels_.max() + 1 else: self.labels_ = self.leaf_labels_ self.__n_clusters = self.n_leaves_ @@ -269,6 +273,8 @@ class Tomato: if hasattr(self, "leaf_labels_"): renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] + # In case the user asked for something impossible + self.__n_clusters = self.labels_.max() + 1 @property def merge_threshold_(self): -- cgit v1.2.3 From caa7c97d812acc3559aaecedc6e44e5f41d8a6af Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 21:01:03 +0200 Subject: indent --- src/python/gudhi/clustering/tomato.py | 76 ++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 867c46a1..55b64c1d 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -6,11 +6,14 @@ from ._tomato import * # The fit/predict interface is not so well suited... # FIXME: choose if they are called weight, density, filtration, etc and be consistent. + class Tomato: """ Clustering - This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point. A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural to provide your own. In particular, we do not provide anything specific to cluster pixels on images yet. + This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point. + A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural + to provide your own. In particular, we do not provide anything specific to cluster pixels on images yet. Attributes ---------- @@ -27,9 +30,12 @@ class Tomato: diagram_: ndarray of shape (n_leaves_, 2) persistence diagram (only the finite points) max_weight_per_cc_: ndarray of shape (n_connected_components,) - maximum of the density function on each connected component. This corresponds to the abscissa of infinite points in the diagram + maximum of the density function on each connected component. This corresponds to the abscissa of infinite + points in the diagram children_: ndarray of shape (n_leaves_-n_connected_components, 2) - The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i + The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. + A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. + Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i weights_: ndarray of shape (n_samples,) weights of the points, as computed by the density estimator or provided by the user params_: dict @@ -52,20 +58,29 @@ class Tomato: Args: graph_type (str): 'manual', 'knn' or 'radius'. density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. - metric (str|Callable): metric used when calculating the distance between instances in a feature array. Defaults to Minkowski of parameter p. - kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to sklearn.neighbors.KernelDensity. + metric (str|Callable): metric used when calculating the distance between instances in a feature array. + Defaults to Minkowski of parameter p. + kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to + sklearn.neighbors.KernelDensity. k (int): number of neighbors for a knn graph (including the vertex itself). Defaults to 10. - k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). Defaults to k. + k_DTM (int): number of neighbors for the DTM density estimation (including the vertex itself). + Defaults to k. r (float): size of a neighborhood if graph_type is 'radius'. Also used as default bandwidth in kde_params. eps (float): (1+eps) approximation factor when computing distances (ignored in many cases). - n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get the maximal number of clusters. + n_clusters (int): number of clusters requested. Defaults to None, i.e. no merging occurs and we get + the maximal number of clusters. merge_threshold (float): minimum prominence of a cluster so it doesn't get merged. - symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. This can be useful with k-NN for small k. Defaults to false. + symmetrize_graph (bool): whether we should add edges to make the neighborhood graph symmetric. + This can be useful with k-NN for small k. Defaults to false. p (float): norm L^p on input points. Defaults to 2. - q (float): order used to compute the distance to measure. Defaults to dim. Beware that when the dimension is large, this can easily cause overflows. - dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the dimension, or 2 when the dimension cannot be read from the input (metric is "precomputed"). - n_jobs (int): Number of jobs to schedule for parallel processing on the CPU. If -1 is given all processors are used. Default: 1. - params: extra parameters are passed to :class:`~gudhi.point_cloud.knn.KNearestNeighbors` and :class:`~gudhi.point_cloud.dtm.DTMDensity`. + q (float): order used to compute the distance to measure. Defaults to dim. + Beware that when the dimension is large, this can easily cause overflows. + dim (float): final exponent in DTM density estimation, representing the dimension. Defaults to the + dimension, or 2 when the dimension cannot be read from the input (metric is "precomputed"). + n_jobs (int): Number of jobs to schedule for parallel processing on the CPU. + If -1 is given all processors are used. Default: 1. + params: extra parameters are passed to :class:`~gudhi.point_cloud.knn.KNearestNeighbors` and + :class:`~gudhi.point_cloud.dtm.DTMDensity`. """ # Should metric='precomputed' mean input_type='distance_matrix'? # Should we be able to pass metric='minkowski' (what None does currently)? @@ -79,7 +94,7 @@ class Tomato: raise ValueError("Cannot specify both a merge threshold and a number of clusters") def fit(self, X, y=None, weights=None): - #FIXME: Iterable -> Sequence? + # FIXME: Iterable -> Sequence? """ Args: X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if graph_type is "manual". @@ -127,7 +142,9 @@ class Tomato: if need_knn > 0: knn_args = dict(self.params_) knn_args["k"] = need_knn - knn = KNearestNeighbors(return_index=need_knn_ngb, return_distance=need_knn_dist, **knn_args).fit_transform(X) + knn = KNearestNeighbors(return_index=need_knn_ngb, return_distance=need_knn_dist, **knn_args).fit_transform( + X + ) if need_knn_ngb: if need_knn_dist: self.neighbors_ = knn[0][:, 0:k_graph] @@ -148,6 +165,7 @@ class Tomato: if self.graph_type_ == "radius": if metric in ["minkowski", "euclidean", "manhattan", "chebyshev"]: from scipy.spatial import cKDTree + tree = cKDTree(X) # TODO: handle "l1" and "l2" aliases? p = self.params_.get("p") @@ -161,11 +179,12 @@ class Tomato: assert p is None or p == numpy.inf, "p=" + str(p) + " is not consistent with metric='chebyshev'" p = numpy.inf elif p is None: - p = 2 # the default + p = 2 # the default eps = self.params_.get("eps", 0) self.neighbors_ = tree.query_ball_tree(tree, r=self.params_["r"], p=p, eps=eps) - # TODO: sklearn's NearestNeighbors.radius_neighbors can handle more metrics efficiently via its BallTree (don't bother with the _graph variant, it just calls radius_neighbors). + # TODO: sklearn's NearestNeighbors.radius_neighbors can handle more metrics efficiently via its BallTree + # (don't bother with the _graph variant, it just calls radius_neighbors). elif metric != "precomputed": from sklearn.metrics import pairwise_distances @@ -180,7 +199,9 @@ class Tomato: if self.density_type_ in {"KDE", "logKDE"}: # Slow... - assert self.graph_type_ != "manual" and metric != "precomputed", "Scikit-learn's KernelDensity requires point coordinates" + assert ( + self.graph_type_ != "manual" and metric != "precomputed" + ), "Scikit-learn's KernelDensity requires point coordinates" kde_params = dict(self.params_.get("kde_params", dict())) kde_params.setdefault("metric", metric) r = self.params_.get("r") @@ -188,6 +209,7 @@ class Tomato: kde_params.setdefault("bandwidth", r) # Should we default rtol to eps? from sklearn.neighbors import KernelDensity + weights = KernelDensity(**kde_params).fit(self.points_).score_samples(self.points_) if self.density_type_ == "KDE": weights = numpy.exp(weights) @@ -201,9 +223,7 @@ class Tomato: self.neighbors_[j].add(i) self.weights_ = weights - self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = doit( - list(self.neighbors_), weights - ) + self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = doit(list(self.neighbors_), weights) self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_) # TODO: deduplicate this code with the setters below @@ -244,11 +264,11 @@ class Tomato: r = max(r, self.diagram_[:, 0].max()) if l == r: if l > 0: - l, r = .9 * l, 1.1 * r + l, r = 0.9 * l, 1.1 * r elif l < 0: - l, r = 1.1 * l, .9 * r + l, r = 1.1 * l, 0.9 * r else: - l, r = -1., 1. + l, r = -1.0, 1.0 plt.plot([l, r], [l, r]) plt.plot( self.max_weight_per_cc_, numpy.full(self.max_weight_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green" @@ -298,15 +318,7 @@ if __name__ == "__main__": a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)] a = numpy.random.rand(500, 2) - t = Tomato( - metric="euclidean", - graph_type="knn", - density_type="DTM", - n_clusters=2, - k=4, - n_jobs=-1, - eps=0.05, - ) + t = Tomato(metric="euclidean", graph_type="knn", density_type="DTM", n_clusters=2, k=4, n_jobs=-1, eps=0.05,) t.fit(a) # print("neighbors\n",t.neighbors_) # print() -- cgit v1.2.3 From 2fb0d594060958804239fcdad5336832ea5133d0 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 22:56:04 +0200 Subject: Add test --- src/python/gudhi/clustering/tomato.py | 18 ++++++++---------- src/python/test/test_tomato.py | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 10 deletions(-) create mode 100755 src/python/test/test_tomato.py (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 55b64c1d..e3eaa300 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -18,24 +18,24 @@ class Tomato: Attributes ---------- n_clusters_: int - The number of clusters. Writing to it automatically adjusts labels_. + The number of clusters. Writing to it automatically adjusts `labels_`. merge_threshold_: float - minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts labels_. + minimum prominence of a cluster so it doesn't get merged. Writing to it automatically adjusts `labels_`. n_leaves_: int number of leaves (unstable clusters) in the hierarchical tree leaf_labels_: ndarray of shape (n_samples,) cluster labels for each point, at the very bottom of the hierarchy labels_: ndarray of shape (n_samples,) cluster labels for each point, after merging - diagram_: ndarray of shape (n_leaves_, 2) + diagram_: ndarray of shape (`n_leaves_`, 2) persistence diagram (only the finite points) max_weight_per_cc_: ndarray of shape (n_connected_components,) maximum of the density function on each connected component. This corresponds to the abscissa of infinite points in the diagram - children_: ndarray of shape (n_leaves_-n_connected_components, 2) - The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. - A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. - Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i + children_: ndarray of shape (`n_leaves_`-n_connected_components, 2) + The children of each non-leaf node. Values less than `n_leaves_` correspond to leaves of the tree. + A node i greater than or equal to `n_leaves_` is a non-leaf node and has children children_[i - `n_leaves_`]. + Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_leaves_` + i weights_: ndarray of shape (n_samples,) weights of the points, as computed by the density estimator or provided by the user params_: dict @@ -53,8 +53,6 @@ class Tomato: **params ): """ - Each parameter has a corresponding attribute, like self.merge_threshold_, that can be changed later. - Args: graph_type (str): 'manual', 'knn' or 'radius'. density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. @@ -223,7 +221,7 @@ class Tomato: self.neighbors_[j].add(i) self.weights_ = weights - self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = doit(list(self.neighbors_), weights) + self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = doit(self.neighbors_, weights) self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_) # TODO: deduplicate this code with the setters below diff --git a/src/python/test/test_tomato.py b/src/python/test/test_tomato.py new file mode 100755 index 00000000..0a33b86e --- /dev/null +++ b/src/python/test/test_tomato.py @@ -0,0 +1,27 @@ +""" This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. + Author(s): Marc Glisse + + Copyright (C) 2020 Inria + + Modification(s): + - YYYY/MM Author: Description of the modification +""" + +from gudhi.clustering.tomato import Tomato +import numpy as np +import pytest + + +def test_tomato_something(): + a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)] + t = Tomato(metric="euclidean", n_clusters=2, k=4, n_jobs=-1, eps=0.05) + assert np.array_equal(t.fit_predict(a), [1,1,1,0,0,0,0]) # or with swapped 0 and 1 + + t = Tomato(density_type='KDE', r=1, k=4) + t.fit(a) + assert np.array_equal(t.leaf_labels_, [1,1,1,0,0,0,0]) # or with swapped 0 and 1 + + t = Tomato(graph_type='radius', r=4.7, k=4) + t.fit(a) + assert t.max_weight_per_cc_.size == 2 -- cgit v1.2.3 From cd623dcce7612723acca7f1a6f0b3ca4c87a4fb9 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 2020 23:20:17 +0200 Subject: doc --- src/python/gudhi/clustering/tomato.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index e3eaa300..a3e304dc 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -9,11 +9,9 @@ from ._tomato import * class Tomato: """ - Clustering - This clustering algorithm needs a neighborhood graph on the points, and an estimation of the density at each point. A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural - to provide your own. In particular, we do not provide anything specific to cluster pixels on images yet. + to provide your own. Attributes ---------- @@ -92,10 +90,9 @@ class Tomato: raise ValueError("Cannot specify both a merge threshold and a number of clusters") def fit(self, X, y=None, weights=None): - # FIXME: Iterable -> Sequence? """ Args: - X ((n,d)-array of float|(n,n)-array of float|Iterable[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if graph_type is "manual". + X ((n,d)-array of float|(n,n)-array of float|Sequence[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if graph_type is "manual". weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point """ # TODO: First detect if this is a new call with the same data (only threshold changed?) -- cgit v1.2.3 From c2cfdc79fb30bb467565cf16cee8bd8f241cb061 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 11:40:02 +0200 Subject: Remove main --- src/python/gudhi/clustering/tomato.py | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index a3e304dc..901a9399 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -306,29 +306,3 @@ class Tomato: else: self.__n_clusters = None self.__merge_threshold = merge_threshold - - -if __name__ == "__main__": - import sys - - a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)] - a = numpy.random.rand(500, 2) - t = Tomato(metric="euclidean", graph_type="knn", density_type="DTM", n_clusters=2, k=4, n_jobs=-1, eps=0.05,) - t.fit(a) - # print("neighbors\n",t.neighbors_) - # print() - # print("weights\n",t.weights_) - # print() - # print("diagram\n",t.diagram_) - # print() - print("max\n", t.max_weight_per_cc_, file=sys.stderr) - # print() - print("leaf labels\n", t.leaf_labels_) - # print() - print("labels\n", t.labels_) - # print() - print("children\n", t.children_) - # print() - t.n_clusters_ = 2 - print("labels\n", t.labels_) - t.plot_diagram() -- cgit v1.2.3 From 5a78dc70afe172e8f38bff09639be21fc92b1fb4 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 11:42:51 +0200 Subject: license --- src/python/gudhi/clustering/_tomato.cc | 11 ++++++++++- src/python/gudhi/clustering/tomato.py | 10 +++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index 638e1259..ce01834a 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -1,4 +1,13 @@ -// g++ -O3 -Wall -shared -fPIC `python3 -m pybind11 --includes` XXX.cpp -o tomato`python3-config --extension-suffix` +/* This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + * See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. + * Author(s): Marc Glisse + * + * Copyright (C) 2020 Inria + * + * Modification(s): + * - YYYY/MM Author: Description of the modification + */ + #include #include #include diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 901a9399..29642662 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -1,10 +1,18 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Marc Glisse +# +# Copyright (C) 2020 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + import numpy from ..point_cloud.knn import KNearestNeighbors from ..point_cloud.dtm import DTMDensity from ._tomato import * # The fit/predict interface is not so well suited... -# FIXME: choose if they are called weight, density, filtration, etc and be consistent. class Tomato: -- cgit v1.2.3 From 1e71054e2abca4181b85209f862227b3d7d948e1 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 13:29:15 +0200 Subject: requires --- src/python/gudhi/clustering/tomato.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 29642662..62fac8db 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -21,6 +21,9 @@ class Tomato: A few possible graph constructions and density estimators are provided for convenience, but it is perfectly natural to provide your own. + :Requires: `SciPy `_, `Scikit-learn `_ or others + (see :class:`~gudhi.point_cloud.knn.KNearestNeighbors`) in function of the options. + Attributes ---------- n_clusters_: int -- cgit v1.2.3 From cf617a8bf5b9701dad69b1d47b449654ab32453c Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 18:34:12 +0200 Subject: rename function to hierarchy --- src/python/gudhi/clustering/_tomato.cc | 6 +++--- src/python/gudhi/clustering/tomato.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index ce01834a..f2e73ed6 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -47,8 +47,8 @@ struct Merge { }; template -auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order, - ROrder const& rorder) { +auto hierarchy(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order, + ROrder const& rorder) { // point index --> index of raw cluster it belongs to std::vector raw_cluster; raw_cluster.reserve(num_points); @@ -302,7 +302,7 @@ auto plaf(py::array_t ngb, py::a #endif PYBIND11_MODULE(_tomato, m) { m.doc() = "Internals of tomato clustering"; - m.def("doit", &plouf, "does the clustering"); + m.def("hierarchy", &hierarchy, "does the clustering"); // m.def("doit2", &plaf, "does the clustering faster"); m.def("merge", &merge, "merge clusters"); } diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 62fac8db..d5c5daac 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -229,7 +229,7 @@ class Tomato: self.neighbors_[j].add(i) self.weights_ = weights - self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = doit(self.neighbors_, weights) + self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = hierarchy(self.neighbors_, weights) self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_) # TODO: deduplicate this code with the setters below -- cgit v1.2.3 From 125ad079fd4e07556b96e38b28c51807d349718c Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 18:37:19 +0200 Subject: Move comment --- src/python/gudhi/clustering/_tomato.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index f2e73ed6..cd382d57 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -252,7 +252,10 @@ auto merge(py::array_t children, Cluster_inde return py::array(ret.size(), ret.data()); } -// Do a special version when ngb is a numpy array, where we can cast to int[k][n] ? +// TODO: Do a special version when ngb is a numpy array, where we can cast to int[k][n] ? +// py::isinstance> (ou py::isinstance et tester dtype) et flags&c_style +// ou overload (en virant forcecast?) +// aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ? auto plouf(py::handle ngb, py::array_t density) { // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to [] @@ -306,9 +309,3 @@ PYBIND11_MODULE(_tomato, m) { // m.def("doit2", &plaf, "does the clustering faster"); m.def("merge", &merge, "merge clusters"); } - -// https://github.com/pybind/pybind11/issues/1042 pour convertir vector en numpy array -// -// py::isinstance> (ou py::isinstance et tester dtype) et flags&c_style -// ou overload (en virant forcecast?) -// aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ? -- cgit v1.2.3 From a3dddd2598d3b7f59e3d59049f71edf6e54e150c Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 18:45:08 +0200 Subject: Remove dead code Some of it could be useful later, but I guess there is a limit to how much extra code I can keep in comments... --- src/python/gudhi/clustering/_tomato.cc | 35 +--------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index cd382d57..0ec703ca 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -144,21 +144,12 @@ auto hierarchy(Point_index num_points, Neighbors const& neighbors, Density const } // Maximum for each connected component std::vector max_cc; - // for(Point_index i = 0; i < num_points; ++i){ - // if(ds_base[ds_base[raw_cluster[i]].parent].max == i) - // max_cc.push_back(density[order[i]]); - //} for (Cluster_index i = 0; i < n_raw_clusters; ++i) { if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]); } assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters); // TODO: create a "noise" cluster, merging all those not prominent enough? - // int nb_clusters=0; - // for(int i=0;i<(int)ds_base.size();++i){ - // if(ds_parent[i]!=i) continue; - // ds_data[i].rank=nb_clusters++; - //} // Replay the merges, in increasing order of prominence, to build the hierarchy std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; }); @@ -278,31 +269,7 @@ auto plouf(py::handle ngb, py::array_t ngb, py::array_t density, double threshold){ - py::buffer_info nbuf = ngb.request(); - if(nbuf.ndim!=2) - throw std::runtime_error("neighbors must be 2D"); - const int n=nbuf.shape[0]; - const int k=nbuf.shape[1]; - int*nptr=(int*)nbuf.ptr; - auto neighbors=boost::adaptors::transform(boost::irange(0,n),[=](int i){return Vec{nptr+i*k,k};}); - py::buffer_info wbuf = density.request(); - if(wbuf.ndim!=1) - throw std::runtime_error("density must be 1D"); - if(n!=wbuf.shape[0]) - throw std::runtime_error("incompatible sizes"); - double*d=(double*)wbuf.ptr; - return tomato(n,neighbors,d); -} -#endif + PYBIND11_MODULE(_tomato, m) { m.doc() = "Internals of tomato clustering"; m.def("hierarchy", &hierarchy, "does the clustering"); -- cgit v1.2.3 From 16e8f92f0635da668f9f4602f4b7bb4086045a9d Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 19:11:56 +0200 Subject: Fix bad renaming --- src/python/gudhi/clustering/_tomato.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index 0ec703ca..a76a2c3a 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -47,8 +47,8 @@ struct Merge { }; template -auto hierarchy(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order, - ROrder const& rorder) { +auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order, + ROrder const& rorder) { // point index --> index of raw cluster it belongs to std::vector raw_cluster; raw_cluster.reserve(num_points); @@ -247,7 +247,7 @@ auto merge(py::array_t children, Cluster_inde // py::isinstance> (ou py::isinstance et tester dtype) et flags&c_style // ou overload (en virant forcecast?) // aussi le faire au cas où on n'aurait pas un tableau, mais où chaque liste de voisins serait un tableau ? -auto plouf(py::handle ngb, py::array_t density) { +auto hierarchy(py::handle ngb, py::array_t density) { // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to [] // (I _think_ handle is ok and we don't need object here) @@ -273,6 +273,5 @@ auto plouf(py::handle ngb, py::array_t Date: Thu, 28 May 2020 18:40:11 +0200 Subject: Théo's comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/python/gudhi/clustering/tomato.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index d5c5daac..76b6a3c0 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -103,8 +103,11 @@ class Tomato: def fit(self, X, y=None, weights=None): """ Args: - X ((n,d)-array of float|(n,n)-array of float|Sequence[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if graph_type is "manual". + X ((n,d)-array of float|(n,n)-array of float|Sequence[Iterable[int]]): coordinates of the points, + or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors + for each point (points are represented by their index, starting from 0) if graph_type is "manual". weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point + y: Not used, present here for API consistency with scikit-learn by convention. """ # TODO: First detect if this is a new call with the same data (only threshold changed?) # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters. @@ -229,6 +232,7 @@ class Tomato: self.neighbors_[j].add(i) self.weights_ = weights + # This is where the main computation happens self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = hierarchy(self.neighbors_, weights) self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_) @@ -281,10 +285,6 @@ class Tomato: ) plt.show() - # def predict(self, X): - # # X had better be the same as in fit() - # return self.labels_ - # Use set_params instead? @property def n_clusters_(self): -- cgit v1.2.3 From 8bbb9716d29f7fadb53e1241ab280bbeb446381b Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 9 Jun 2020 21:49:28 +0200 Subject: doc tweaks --- src/python/gudhi/clustering/tomato.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 76b6a3c0..133754b4 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -63,8 +63,9 @@ class Tomato: ): """ Args: - graph_type (str): 'manual', 'knn' or 'radius'. - density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. + graph_type (str): 'manual', 'knn' or 'radius'. Default is 'knn'. + density_type (str): 'manual', 'DTM', 'logDTM', 'KDE' or 'logKDE'. When you have many points, + 'KDE' and 'logKDE' tend to be slower. Default is 'logDTM'. metric (str|Callable): metric used when calculating the distance between instances in a feature array. Defaults to Minkowski of parameter p. kde_params (dict): if density_type is 'KDE' or 'logKDE', additional parameters passed directly to -- cgit v1.2.3 From 8723a3a88b35ee4b08115b07fb0f5fe813567a87 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Fri, 19 Jun 2020 15:38:41 +0200 Subject: Mention the 2 billion limit in the doc --- src/python/gudhi/clustering/tomato.py | 1 + 1 file changed, 1 insertion(+) (limited to 'src/python/gudhi') diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 133754b4..fbba3cc8 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -107,6 +107,7 @@ class Tomato: X ((n,d)-array of float|(n,n)-array of float|Sequence[Iterable[int]]): coordinates of the points, or distance matrix (full, not just a triangle) if metric is "precomputed", or list of neighbors for each point (points are represented by their index, starting from 0) if graph_type is "manual". + The number of points is currently limited to about 2 billion. weights (ndarray of shape (n_samples)): if density_type is 'manual', a density estimate at each point y: Not used, present here for API consistency with scikit-learn by convention. """ -- cgit v1.2.3