From 07c7e5841c961869b927875bbca91d10287f9fab Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Wed, 26 Feb 2020 22:03:18 +0100
Subject: Initial commit of tomato

---
 src/python/gudhi/clustering/_tomato.cc | 271 +++++++++++++++++++++++++++++++++
 1 file changed, 271 insertions(+)
 create mode 100644 src/python/gudhi/clustering/_tomato.cc

diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
new file mode 100644
index 00000000..1c896174
--- /dev/null
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -0,0 +1,271 @@
+// g++ -O3 -Wall -shared -fPIC `python3 -m pybind11 --includes` XXX.cpp -o tomato`python3-config --extension-suffix`
+#include <boost/container/flat_map.hpp>
+#include <boost/pending/disjoint_sets.hpp>
+#include <boost/property_map/function_property_map.hpp>
+#include <boost/property_map/transform_value_property_map.hpp>
+#include <boost/property_map/vector_property_map.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <vector>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+namespace py=pybind11;
+
+template<class T, class=std::enable_if_t<std::is_integral<T>::value>>
+int getint(int i){return i;}
+template<class T, class=decltype(std::declval<T>().template cast<int>())>
+int getint(T i){return i.template cast<int>();}
+
+// Raw clusters are clusters obtained through single-linkage, no merging.
+
+typedef int Point_index;
+typedef int Cluster_index;
+struct Merge { Cluster_index first, second; double persist; };
+
+template<class Neighbors, class Density, class Order, class ROrder>
+auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& density, Order const& order, ROrder const& rorder){
+  // point index --> index of raw cluster it belongs to
+  std::vector<Cluster_index> raw_cluster; raw_cluster.reserve(num_points);
+  // cluster index --> index of top point in the cluster
+  Cluster_index n_raw_clusters = 0; // current number of raw clusters seen
+  //
+  std::vector<Merge> merges;
+  struct Data { Cluster_index parent; int rank; Point_index max; }; // information on a cluster
+  std::vector<Data> ds_base;
+  // boost::vector_property_map does resize(size+1) for every new element, don't use it
+  auto ds_data=boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n)->Data&{return ds_base[n];});
+  auto ds_parent=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_data);
+  auto ds_rank=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_data);
+  boost::disjoint_sets<decltype(ds_rank),decltype(ds_parent)> ds(ds_rank,ds_parent); // on the clusters, not directly the points
+  std::vector<std::array<double,2>> persistence; // diagram (finite points)
+  boost::container::flat_map<Cluster_index,Cluster_index> adj_clusters; // first: the merged cluster, second: the raw cluster
+  // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only insert in the vector if merged is absent from the set
+
+  for(Point_index i = 0; i < num_points; ++i){
+    // auto&& ngb = neighbors[order[i]];
+    // TODO: have a specialization where we don't need python types and py::cast
+    // TODO: move py::cast and getint into Neighbors
+    py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work
+    adj_clusters.clear();
+    Point_index j = i; // highest neighbor
+    //for(Point_index k : ngb)
+    for(auto k_py : ngb)
+    {
+      Point_index k = rorder[getint(k_py)];
+      if(k >= i || k < 0) // ???
+        continue;
+      if(k < j)
+        j = k;
+      Cluster_index rk=raw_cluster[k];
+      adj_clusters.emplace(ds.find_set(rk),rk);
+      // does not insert if ck=ds.find_set(rk) already seen
+      // which rk we keep from those with the same ck is arbitrary
+    }
+    assert((Point_index)raw_cluster.size()==i);
+    if(i==j){ // local maximum -> new cluster
+      Cluster_index c = n_raw_clusters++;
+      ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access
+      ds.make_set(c);
+      ds_base[c].max=i; // max
+      raw_cluster.push_back(c);
+      continue;
+    }else{ // add i to the cluster of j
+      assert(j<i);
+      raw_cluster.push_back(raw_cluster[j]);
+    }
+    // merge the clusters of the other neighbors into the cluster of the highest neighbor
+    Cluster_index rj=raw_cluster[j];
+    Cluster_index cj=ds.find_set(rj);
+    for(auto ckk : adj_clusters){
+      Cluster_index ck=ckk.first;
+      Cluster_index rk=ckk.second;
+      if(ck==cj) continue;
+      Point_index young = std::max(ds_base[cj].max, ds_base[ck].max);
+      Point_index old = std::min(ds_base[cj].max, ds_base[ck].max);
+      auto d_young = density[order[young]];
+      auto d_i = density[order[i]];
+      assert(d_young >= d_i);
+      // Always merge (the non-hierarchical algorithm would only conditionally merge here
+      persistence.push_back({d_young, d_i});
+      assert(ds.find_set(rj) != ds.find_set(rk));
+      ds.link(cj,ck);
+      cj = ds.find_set(cj);
+      ds_base[cj].max=old; // just one parent, no need for find_set
+      // record the raw clusters, we don't know what will have already been merged.
+      merges.push_back({rj, rk, d_young-d_i});
+    }
+  }
+  {
+    boost::counting_iterator<Cluster_index> b(0), e(ds_base.size());
+    ds.compress_sets(b, e);
+    // Now we stop using find_sets and look at the parent directly
+    // rank is reused to rename clusters contiguously 0, 1, etc
+  }
+  // Maximum for each connected component
+  std::vector<double> max_cc;
+  //for(Point_index i = 0; i < num_points; ++i){
+  //  if(ds_base[ds_base[raw_cluster[i]].parent].max == i)
+  //    max_cc.push_back(density[order[i]]);
+  //}
+  for(Cluster_index i = 0; i < n_raw_clusters; ++i){
+    if(ds_base[i].parent == i)
+      max_cc.push_back(density[order[ds_base[i].max]]);
+  }
+  assert((Cluster_index)(merges.size()+max_cc.size())==n_raw_clusters);
+
+  // TODO: create a "noise" cluster, merging all those not prominent enough?
+  //int nb_clusters=0;
+  //for(int i=0;i<(int)ds_base.size();++i){
+  //  if(ds_parent[i]!=i) continue;
+  //  ds_data[i].rank=nb_clusters++;
+  //}
+
+  // Replay the merges, in increasing order of prominence, to build the hierarchy
+  std::sort(merges.begin(), merges.end(), [](Merge const&a, Merge const&b){return a.persist < b.persist;});
+  std::vector<std::array<Cluster_index,2>> children; children.reserve(merges.size());
+  {
+    struct Dat { Cluster_index parent; int rank; Cluster_index name; };
+    std::vector<Dat> ds_bas(2*n_raw_clusters-1);
+    Cluster_index i;
+    auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
+    auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
+    auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
+    boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
+    for(i=0;i<n_raw_clusters;++i){ds.make_set(i);ds_bas[i].name=i;}
+    for(Merge const&m : merges){
+      Cluster_index j = ds.find_set(m.first);
+      Cluster_index k = ds.find_set(m.second);
+      assert(j!=k);
+      children.push_back({ds_bas[j].name,ds_bas[k].name});
+      ds.make_set(i);
+      ds.link(i,j);
+      ds.link(i,k);
+      ds_bas[ds.find_set(i)].name=i;
+      ++i;
+    }
+  }
+  std::vector<Cluster_index> raw_cluster_ordered(num_points);
+  for(int i=0; i<num_points; ++i) raw_cluster_ordered[i]=raw_cluster[rorder[i]];
+  return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()), py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()), py::array(max_cc.size(), max_cc.data()));
+}
+
+auto merge(py::array_t<Cluster_index, py::array::c_style | py::array::forcecast> children, Cluster_index n_leaves, Cluster_index n_final){
+  // Should this really be an error?
+  if(n_final > n_leaves)
+    throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters");
+  py::buffer_info cbuf = children.request();
+  if((cbuf.ndim!=2 || cbuf.shape[1]!=2) && (cbuf.ndim!=1 || cbuf.shape[0]!=0))
+    throw std::runtime_error("internal error: children have to be (n,2) or empty");
+  const int n_merges=cbuf.shape[0];
+  Cluster_index*d=(Cluster_index*)cbuf.ptr;
+  // Should this really be an error?
+  //std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n';
+  if(n_merges + n_final < n_leaves)
+    throw std::runtime_error(std::string("The number of clusters required ")+std::to_string(n_final)+" is smaller than the number of connected components "+std::to_string(n_leaves-n_merges));
+
+  struct Dat { Cluster_index parent; int rank; int name; };
+  std::vector<Dat> ds_bas(2*n_leaves-1);
+  auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
+  auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
+  auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
+  boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
+  Cluster_index i;
+  for(i=0;i<n_leaves;++i){ds.make_set(i);ds_bas[i].name=-1;}
+  for(Cluster_index m=0;m<n_leaves-n_final;++m){
+    Cluster_index j=ds.find_set(d[2*m]);
+    Cluster_index k=ds.find_set(d[2*m+1]);
+    ds.make_set(i);
+    ds.link(i,j);
+    ds.link(i,k);
+    ++i;
+  }
+  Cluster_index next_cluster_name=0;
+  std::vector<Cluster_index> ret; ret.reserve(n_leaves);
+  for(Cluster_index j=0;j<n_leaves;++j){
+    Cluster_index k = ds.find_set(j);
+    if(ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
+    ret.push_back(ds_bas[k].name);
+  }
+  return py::array(ret.size(), ret.data());
+}
+
+// Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
+auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density){
+  // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
+  // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to []
+  // (I _think_ handle is ok and we don't need object here)
+  py::buffer_info wbuf = density.request();
+  if(wbuf.ndim!=1)
+    throw std::runtime_error("density must be 1D");
+  const int n=wbuf.shape[0];
+  double*d=(double*)wbuf.ptr;
+  // Vector { 0, 1, ..., n-1 }
+  std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n));
+  // Permutation of the indices to get points in decreasing order of density
+  std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j){ return d[i] > d[j]; });
+  // Inverse permutation
+  std::vector<Point_index> rorder(n);
+  for(Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
+  // Used as:
+  // order[i] is the index of the point with i-th largest density
+  // rorder[i] is the rank of the i-th point in order of decreasing density
+  // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the output)
+  return tomato(n, ngb, d, order, rorder);
+}
+#if 0
+struct Vec {
+  int const* base;
+  int k;
+  int const* begin()const{return base;}
+  int const* end()const{return base+k;}
+  Vec operator*()const{return *this;}
+};
+auto plaf(py::array_t<int, py::array::c_style | py::array::forcecast> ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density, double threshold){
+  py::buffer_info nbuf = ngb.request();
+  if(nbuf.ndim!=2)
+    throw std::runtime_error("neighbors must be 2D");
+  const int n=nbuf.shape[0];
+  const int k=nbuf.shape[1];
+  int*nptr=(int*)nbuf.ptr;
+  auto neighbors=boost::adaptors::transform(boost::irange(0,n),[=](int i){return Vec{nptr+i*k,k};});
+  py::buffer_info wbuf = density.request();
+  if(wbuf.ndim!=1)
+    throw std::runtime_error("density must be 1D");
+  if(n!=wbuf.shape[0])
+    throw std::runtime_error("incompatible sizes");
+  double*d=(double*)wbuf.ptr;
+  return tomato(n,neighbors,d);
+}
+#endif
+PYBIND11_MODULE(_tomato, m) {
+  m.doc() = "Internals of tomato clustering";
+  m.def("doit", &plouf, "does the clustering");
+  //m.def("doit2", &plaf, "does the clustering faster");
+  m.def("merge", &merge, "merge clusters");
+}
+
+// https://github.com/pybind/pybind11/issues/1042 to convert a vector to a numpy array
+//
+// py::isinstance<py::array_t<int>> (or py::isinstance<py::array> and test the dtype) and flags&c_style
+// or overload (dropping forcecast?)
+// also do it for the case where we don't have one big array, but where each list of neighbors is an array?
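[Editorial note between patches: `doit` runs the ToMATo pass (a union-find over the raw single-linkage clusters, recording one finite persistence pair per merge) and `merge` replays the recorded merge tree at a chosen granularity. A minimal sketch of driving these bindings from Python follows; it assumes the module is importable as gudhi.clustering._tomato, the toy neighbor graph and densities are invented for illustration, and the 4-tuple layout follows the py::make_tuple at the end of tomato(). Later patches in this series rename doit to hierarchy.]

    # Hypothetical driver for the bindings above (illustrative sketch only).
    import numpy as np
    from gudhi.clustering import _tomato

    density = np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5])
    # neighbors[i] lists the neighbor indices of point i; two obvious components here
    neighbors = [[0, 1], [0, 1, 2], [1, 2, 3], [2, 3], [4, 5], [4, 5]]

    leaf_labels, children, diagram, max_cc = _tomato.doit(neighbors, density)
    # leaf_labels: raw (single-linkage) cluster index per point, in input order
    # children:   pairs of raw clusters, ordered by increasing prominence
    # diagram:    finite persistence pairs (birth density, death density)
    # max_cc:     peak density of each connected component

    n_leaves = len(max_cc) + len(children)
    labels = _tomato.merge(children, n_leaves, 2)[leaf_labels]  # ask for 2 clusters

This mirrors what tomato.py (further down in this series) does internally when it evaluates renaming[self.leaf_labels_].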
-- cgit v1.2.3


From 53579deb2d551752b503b7b76ac04885ec354470 Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Wed, 26 Feb 2020 23:19:34 +0100
Subject: Bug in gcc-8

---
 src/python/gudhi/clustering/_tomato.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
index 1c896174..746c4254 100644
--- a/src/python/gudhi/clustering/_tomato.cc
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -18,8 +18,11 @@ namespace py=pybind11;
 
 template<class T, class=std::enable_if_t<std::is_integral<T>::value>>
 int getint(int i){return i;}
-template<class T, class=decltype(std::declval<T>().template cast<int>())>
-int getint(T i){return i.template cast<int>();}
+// Gcc-8 has a bug that breaks this version, fixed in gcc-9
+// template<class T, class=decltype(std::declval<T>().template cast<int>())>
+// int getint(T i){return i.template cast<int>();}
+template<class T>
+auto getint(T i)->decltype(i.template cast<int>()){return i.template cast<int>();}
 
 // Raw clusters are clusters obtained through single-linkage, no merging.
-- cgit v1.2.3


From e5e0f9a9e96389eadc9e9c4bc493b88abcb6f89a Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Wed, 26 Feb 2020 23:24:42 +0100
Subject: formatting

---
 src/python/gudhi/clustering/_tomato.cc | 247 +++++++++++++++++--------------
 src/python/gudhi/clustering/tomato.py  | 262 ++++++++++++++++++-------------
 2 files changed, 299 insertions(+), 210 deletions(-)

diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
index 746c4254..87bd62e9 100644
--- a/src/python/gudhi/clustering/_tomato.cc
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -14,100 +14,117 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
 
-namespace py=pybind11;
+namespace py = pybind11;
 
-template<class T, class=std::enable_if_t<std::is_integral<T>::value>>
-int getint(int i){return i;}
+template <class T, class = std::enable_if_t<std::is_integral<T>::value>>
+int getint(int i) {
+  return i;
+}
 // Gcc-8 has a bug that breaks this version, fixed in gcc-9
-// template<class T, class=decltype(std::declval<T>().template cast<int>())>
-// int getint(T i){return i.template cast<int>();}
-template<class T>
-auto getint(T i)->decltype(i.template cast<int>()){return i.template cast<int>();}
+// template <class T, class = decltype(std::declval<T>().template cast<int>())>
+// int getint(T i){return i.template cast<int>();}
+template <class T>
+auto getint(T i) -> decltype(i.template cast<int>()) {
+  return i.template cast<int>();
+}
 
 // Raw clusters are clusters obtained through single-linkage, no merging.
 typedef int Point_index;
 typedef int Cluster_index;
-struct Merge { Cluster_index first, second; double persist; };
+struct Merge {
+  Cluster_index first, second;
+  double persist;
+};
 
-template<class Neighbors, class Density, class Order, class ROrder>
-auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& density, Order const& order, ROrder const& rorder){
+template <class Neighbors, class Density, class Order, class ROrder>
+auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
+            ROrder const& rorder) {
   // point index --> index of raw cluster it belongs to
-  std::vector<Cluster_index> raw_cluster; raw_cluster.reserve(num_points);
+  std::vector<Cluster_index> raw_cluster;
+  raw_cluster.reserve(num_points);
   // cluster index --> index of top point in the cluster
-  Cluster_index n_raw_clusters = 0; // current number of raw clusters seen
+  Cluster_index n_raw_clusters = 0;  // current number of raw clusters seen
   //
   std::vector<Merge> merges;
-  struct Data { Cluster_index parent; int rank; Point_index max; }; // information on a cluster
+  struct Data {
+    Cluster_index parent;
+    int rank;
+    Point_index max;
+  };  // information on a cluster
   std::vector<Data> ds_base;
   // boost::vector_property_map does resize(size+1) for every new element, don't use it
-  auto ds_data=boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n)->Data&{return ds_base[n];});
-  auto ds_parent=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_data);
-  auto ds_rank=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_data);
-  boost::disjoint_sets<decltype(ds_rank),decltype(ds_parent)> ds(ds_rank,ds_parent); // on the clusters, not directly the points
-  std::vector<std::array<double,2>> persistence; // diagram (finite points)
-  boost::container::flat_map<Cluster_index,Cluster_index> adj_clusters; // first: the merged cluster, second: the raw cluster
-  // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only insert in the vector if merged is absent from the set
+  auto ds_data =
+      boost::make_function_property_map<std::size_t>([&ds_base](std::size_t n) -> Data& { return ds_base[n]; });
+  auto ds_parent =
+      boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_data);
+  auto ds_rank = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_data);
+  boost::disjoint_sets<decltype(ds_rank), decltype(ds_parent)> ds(
+      ds_rank, ds_parent);                          // on the clusters, not directly the points
+  std::vector<std::array<double, 2>> persistence;   // diagram (finite points)
+  boost::container::flat_map<Cluster_index, Cluster_index>
+      adj_clusters;  // first: the merged cluster, second: the raw cluster
+  // we only care about the raw cluster, we could use a vector to store the second, store first into a set, and only
+  // insert in the vector if merged is absent from the set
 
-  for(Point_index i = 0; i < num_points; ++i){
+  for (Point_index i = 0; i < num_points; ++i) {
     // auto&& ngb = neighbors[order[i]];
     // TODO: have a specialization where we don't need python types and py::cast
     // TODO: move py::cast and getint into Neighbors
-    py::object ngb = neighbors[py::cast(order[i])]; // auto&& also seems to work
+    py::object ngb = neighbors[py::cast(order[i])];  // auto&& also seems to work
     adj_clusters.clear();
-    Point_index j = i; // highest neighbor
-    //for(Point_index k : ngb)
-    for(auto k_py : ngb)
-    {
+    Point_index j = i;  // highest neighbor
+    // for(Point_index k : ngb)
+    for (auto k_py : ngb) {
       Point_index k = rorder[getint(k_py)];
-      if(k >= i || k < 0) // ???
-        continue;
+      if (k >= i || k < 0)  // ???
+        continue;
-      if(k < j)
-        j = k;
-      Cluster_index rk=raw_cluster[k];
-      adj_clusters.emplace(ds.find_set(rk),rk);
+      if (k < j) j = k;
+      Cluster_index rk = raw_cluster[k];
+      adj_clusters.emplace(ds.find_set(rk), rk);
       // does not insert if ck=ds.find_set(rk) already seen
       // which rk we keep from those with the same ck is arbitrary
     }
-    assert((Point_index)raw_cluster.size()==i);
-    if(i==j){ // local maximum -> new cluster
+    assert((Point_index)raw_cluster.size() == i);
+    if (i == j) {  // local maximum -> new cluster
       Cluster_index c = n_raw_clusters++;
-      ds_base.emplace_back(); // could be integrated in ds_data, but then we would check the size for every access
+      ds_base.emplace_back();  // could be integrated in ds_data, but then we would check the size for every access
       ds.make_set(c);
-      ds_base[c].max=i; // max
+      ds_base[c].max = i;  // max
       raw_cluster.push_back(c);
       continue;
-    }else{ // add i to the cluster of j
-      assert(j<i);
-      raw_cluster.push_back(raw_cluster[j]);
-    }
-    // merge the clusters of the other neighbors into the cluster of the highest neighbor
-    Cluster_index rj=raw_cluster[j];
-    Cluster_index cj=ds.find_set(rj);
-    for(auto ckk : adj_clusters){
-      Cluster_index ck=ckk.first;
-      Cluster_index rk=ckk.second;
-      if(ck==cj) continue;
-      Point_index young = std::max(ds_base[cj].max, ds_base[ck].max);
-      Point_index old = std::min(ds_base[cj].max, ds_base[ck].max);
-      auto d_young = density[order[young]];
-      auto d_i = density[order[i]];
-      assert(d_young >= d_i);
+    } else {  // add i to the cluster of j
+      assert(j < i);
+      raw_cluster.push_back(raw_cluster[j]);
+    }
+    // merge the clusters of the other neighbors into the cluster of the highest neighbor
+    Cluster_index rj = raw_cluster[j];
+    Cluster_index cj = ds.find_set(rj);
+    for (auto ckk : adj_clusters) {
+      Cluster_index ck = ckk.first;
+      Cluster_index rk = ckk.second;
+      if (ck == cj) continue;
+      Point_index young = std::max(ds_base[cj].max, ds_base[ck].max);
+      Point_index old = std::min(ds_base[cj].max, ds_base[ck].max);
+      auto d_young = density[order[young]];
+      auto d_i = density[order[i]];
+      assert(d_young >= d_i);
       // Always merge (the non-hierarchical algorithm would only conditionally merge here
       persistence.push_back({d_young, d_i});
       assert(ds.find_set(rj) != ds.find_set(rk));
-      ds.link(cj,ck);
+      ds.link(cj, ck);
       cj = ds.find_set(cj);
-      ds_base[cj].max=old; // just one parent, no need for find_set
+      ds_base[cj].max = old;  // just one parent, no need for find_set
       // record the raw clusters, we don't know what will have already been merged.
-      merges.push_back({rj, rk, d_young-d_i});
+      merges.push_back({rj, rk, d_young - d_i});
     }
   }
   {
@@ -118,121 +135,137 @@ auto tomato(Point_index num_points, Neighbors const&neighbors, Density const& de
   }
   // Maximum for each connected component
   std::vector<double> max_cc;
-  //for(Point_index i = 0; i < num_points; ++i){
+  // for(Point_index i = 0; i < num_points; ++i){
   //  if(ds_base[ds_base[raw_cluster[i]].parent].max == i)
   //    max_cc.push_back(density[order[i]]);
   //}
-  for(Cluster_index i = 0; i < n_raw_clusters; ++i){
-    if(ds_base[i].parent == i)
-      max_cc.push_back(density[order[ds_base[i].max]]);
+  for (Cluster_index i = 0; i < n_raw_clusters; ++i) {
+    if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]);
   }
-  assert((Cluster_index)(merges.size()+max_cc.size())==n_raw_clusters);
+  assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters);
 
   // TODO: create a "noise" cluster, merging all those not prominent enough?
-  //int nb_clusters=0;
-  //for(int i=0;i<(int)ds_base.size();++i){
+  // int nb_clusters=0;
+  // for(int i=0;i<(int)ds_base.size();++i){
   //  if(ds_parent[i]!=i) continue;
   //  ds_data[i].rank=nb_clusters++;
   //}
 
   // Replay the merges, in increasing order of prominence, to build the hierarchy
-  std::sort(merges.begin(), merges.end(), [](Merge const&a, Merge const&b){return a.persist < b.persist;});
-  std::vector<std::array<Cluster_index,2>> children; children.reserve(merges.size());
+  std::sort(merges.begin(), merges.end(), [](Merge const& a, Merge const& b) { return a.persist < b.persist; });
+  std::vector<std::array<Cluster_index, 2>> children;
+  children.reserve(merges.size());
   {
-    struct Dat { Cluster_index parent; int rank; Cluster_index name; };
-    std::vector<Dat> ds_bas(2*n_raw_clusters-1);
+    struct Dat {
+      Cluster_index parent;
+      int rank;
+      Cluster_index name;
+    };
+    std::vector<Dat> ds_bas(2 * n_raw_clusters - 1);
     Cluster_index i;
-    auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
-    auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
-    auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
-    boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
-    for(i=0;i<n_raw_clusters;++i){ds.make_set(i);ds_bas[i].name=i;}
-    for(Merge const&m : merges){
+    auto ds_dat = boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+    auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+    auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+    boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
+    for (i = 0; i < n_raw_clusters; ++i) {
+      ds.make_set(i);
+      ds_bas[i].name = i;
+    }
+    for (Merge const& m : merges) {
       Cluster_index j = ds.find_set(m.first);
       Cluster_index k = ds.find_set(m.second);
-      assert(j!=k);
-      children.push_back({ds_bas[j].name,ds_bas[k].name});
+      assert(j != k);
+      children.push_back({ds_bas[j].name, ds_bas[k].name});
       ds.make_set(i);
-      ds.link(i,j);
-      ds.link(i,k);
-      ds_bas[ds.find_set(i)].name=i;
+      ds.link(i, j);
+      ds.link(i, k);
+      ds_bas[ds.find_set(i)].name = i;
       ++i;
     }
   }
   std::vector<Cluster_index> raw_cluster_ordered(num_points);
-  for(int i=0; i<num_points; ++i) raw_cluster_ordered[i]=raw_cluster[rorder[i]];
-  return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()), py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()), py::array(max_cc.size(), max_cc.data()));
+  for (int i = 0; i < num_points; ++i) raw_cluster_ordered[i] = raw_cluster[rorder[i]];
+  return py::make_tuple(py::array(raw_cluster_ordered.size(), raw_cluster_ordered.data()),
+                        py::array(children.size(), children.data()), py::array(persistence.size(), persistence.data()),
+                        py::array(max_cc.size(), max_cc.data()));
 }
 
-auto merge(py::array_t<Cluster_index, py::array::c_style | py::array::forcecast> children, Cluster_index n_leaves, Cluster_index n_final){
+auto merge(py::array_t<Cluster_index, py::array::c_style | py::array::forcecast> children, Cluster_index n_leaves, Cluster_index n_final) {
   // Should this really be an error?
-  if(n_final > n_leaves)
+  if (n_final > n_leaves)
     throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters");
   py::buffer_info cbuf = children.request();
-  if((cbuf.ndim!=2 || cbuf.shape[1]!=2) && (cbuf.ndim!=1 || cbuf.shape[0]!=0))
+  if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0))
    throw std::runtime_error("internal error: children have to be (n,2) or empty");
-  const int n_merges=cbuf.shape[0];
-  Cluster_index*d=(Cluster_index*)cbuf.ptr;
+  const int n_merges = cbuf.shape[0];
+  Cluster_index* d = (Cluster_index*)cbuf.ptr;
   // Should this really be an error?
-  //std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n';
-  if(n_merges + n_final < n_leaves)
-    throw std::runtime_error(std::string("The number of clusters required ")+std::to_string(n_final)+" is smaller than the number of connected components "+std::to_string(n_leaves-n_merges));
+  // std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n';
+  if (n_merges + n_final < n_leaves)
+    throw std::runtime_error(std::string("The number of clusters required ") + std::to_string(n_final) +
+                             " is smaller than the number of connected components " +
+                             std::to_string(n_leaves - n_merges));
 
-  struct Dat { Cluster_index parent; int rank; int name; };
-  std::vector<Dat> ds_bas(2*n_leaves-1);
-  auto ds_dat=boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n)->Dat&{return ds_bas[n];});
-  auto ds_par=boost::make_transform_value_property_map([](auto&p)->Cluster_index&{return p.parent;},ds_dat);
-  auto ds_ran=boost::make_transform_value_property_map([](auto&p)->int&{return p.rank;},ds_dat);
-  boost::disjoint_sets<decltype(ds_ran),decltype(ds_par)> ds(ds_ran,ds_par);
+  struct Dat {
+    Cluster_index parent;
+    int rank;
+    int name;
+  };
+  std::vector<Dat> ds_bas(2 * n_leaves - 1);
+  auto ds_dat = boost::make_function_property_map<std::size_t>([&ds_bas](std::size_t n) -> Dat& { return ds_bas[n]; });
+  auto ds_par = boost::make_transform_value_property_map([](auto& p) -> Cluster_index& { return p.parent; }, ds_dat);
+  auto ds_ran = boost::make_transform_value_property_map([](auto& p) -> int& { return p.rank; }, ds_dat);
+  boost::disjoint_sets<decltype(ds_ran), decltype(ds_par)> ds(ds_ran, ds_par);
   Cluster_index i;
-  for(i=0;i<n_leaves;++i){ds.make_set(i);ds_bas[i].name=-1;}
-  for(Cluster_index m=0;m<n_leaves-n_final;++m){
-    Cluster_index j=ds.find_set(d[2*m]);
-    Cluster_index k=ds.find_set(d[2*m+1]);
-    ds.make_set(i);
-    ds.link(i,j);
-    ds.link(i,k);
-    ++i;
-  }
-  Cluster_index next_cluster_name=0;
-  std::vector<Cluster_index> ret; ret.reserve(n_leaves);
-  for(Cluster_index j=0;j<n_leaves;++j){
+  for (i = 0; i < n_leaves; ++i) {
+    ds.make_set(i);
+    ds_bas[i].name = -1;
+  }
+  for (Cluster_index m = 0; m < n_leaves - n_final; ++m) {
+    Cluster_index j = ds.find_set(d[2 * m]);
+    Cluster_index k = ds.find_set(d[2 * m + 1]);
+    ds.make_set(i);
+    ds.link(i, j);
+    ds.link(i, k);
+    ++i;
+  }
+  Cluster_index next_cluster_name = 0;
+  std::vector<Cluster_index> ret;
+  ret.reserve(n_leaves);
+  for (Cluster_index j = 0; j < n_leaves; ++j) {
    Cluster_index k = ds.find_set(j);
-    if(ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
+    if (ds_bas[k].name == -1) ds_bas[k].name = next_cluster_name++;
    ret.push_back(ds_bas[k].name);
  }
  return py::array(ret.size(), ret.data());
 }
 
 // Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
-auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density){
+auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
   // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
   // use py::handle and check if [] (aka __getitem__) works?
   // But then we need to build an object to pass it to []
   // (I _think_ handle is ok and we don't need object here)
   py::buffer_info wbuf = density.request();
-  if(wbuf.ndim!=1)
-    throw std::runtime_error("density must be 1D");
-  const int n=wbuf.shape[0];
-  double*d=(double*)wbuf.ptr;
+  if (wbuf.ndim != 1) throw std::runtime_error("density must be 1D");
+  const int n = wbuf.shape[0];
+  double* d = (double*)wbuf.ptr;
   // Vector { 0, 1, ..., n-1 }
   std::vector<Point_index> order(boost::counting_iterator<Point_index>(0), boost::counting_iterator<Point_index>(n));
   // Permutation of the indices to get points in decreasing order of density
-  std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j){ return d[i] > d[j]; });
+  std::sort(std::begin(order), std::end(order), [=](Point_index i, Point_index j) { return d[i] > d[j]; });
   // Inverse permutation
   std::vector<Point_index> rorder(n);
-  for(Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
+  for (Point_index i : boost::irange(0, n)) rorder[order[i]] = i;
   // Used as:
   // order[i] is the index of the point with i-th largest density
   // rorder[i] is the rank of the i-th point in order of decreasing density
-  // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the output)
+  // TODO: put a wrapper on ngb and d so we don't need to pass (r)order (there is still the issue of reordering the
+  // output)
   return tomato(n, ngb, d, order, rorder);
 }
 #if 0
@@ -263,7 +296,7 @@ auto plaf(py::array_t<int, py::array::c_style | py::array::forcecast> ngb, py::a
 PYBIND11_MODULE(_tomato, m) {
   m.doc() = "Internals of tomato clustering";
   m.def("doit", &plouf, "does the clustering");
-  //m.def("doit2", &plaf, "does the clustering faster");
+  // m.def("doit2", &plaf, "does the clustering faster");
   m.def("merge", &merge, "merge clusters");
 }
diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
index 467afe0e..19d6600f 100644
--- a/src/python/gudhi/clustering/tomato.py
+++ b/src/python/gudhi/clustering/tomato.py
@@ -29,7 +29,18 @@ class Tomato:
         params_: dict
             Parameters like input_type, metric, etc
     """
-    def __init__(self, input_type='points', metric=None, graph_type=None, density_type='manual', n_clusters=None, merge_threshold=None, eliminate_threshold=None, **params):
+
+    def __init__(
+        self,
+        input_type="points",
+        metric=None,
+        graph_type=None,
+        density_type="manual",
+        n_clusters=None,
+        merge_threshold=None,
+        eliminate_threshold=None,
+        **params
+    ):
         """
         Each parameter has a corresponding attribute, like self.merge_threshold_,
         that can be changed later.
@@ -72,179 +83,207 @@ class Tomato:
         # TODO: First detect if this is a new call with the same data (only threshold changed?)
         # TODO: less code duplication (subroutines?), less spaghetti, but don't compute neighbors twice if not needed. Clear error message for missing or contradictory parameters.
if weights: - density_type = 'manual' + density_type = "manual" else: density_type = self.density_type_ - assert density_type != 'manual' - if density_type == 'manual': + assert density_type != "manual" + if density_type == "manual": raise ValueError("If density_type is 'manual', you must provide weights to fit()") - if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'radius': + if self.input_type_ == "distance_matrix" and self.graph_type_ == "radius": X = numpy.array(X) - r = self.params_['r'] + r = self.params_["r"] self.neighbors_ = [numpy.nonzero(l <= r) for l in X] - if self.input_type_ == 'distance_matrix' and self.graph_type_ == 'knn': - k = self.params_['k'] - self.neighbors_ = numpy.argpartition(X, k-1)[:,0:k] + if self.input_type_ == "distance_matrix" and self.graph_type_ == "knn": + k = self.params_["k"] + self.neighbors_ = numpy.argpartition(X, k - 1)[:, 0:k] - if self.input_type_ == 'neighbors': + if self.input_type_ == "neighbors": self.neighbors_ = X - assert density_type == 'manual' + assert density_type == "manual" - if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ in {'DTM', 'logDTM'}: + if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X - q = self.params_.get('p_DTM', 2) - p = self.params_.get('p', 2) - k = self.params_.get('k', 10) - k_DTM = self.params_.get('k_DTM', k) + q = self.params_.get("p_DTM", 2) + p = self.params_.get("p", 2) + k = self.params_.get("k", 10) + k_DTM = self.params_.get("k_DTM", k) kmax = max(k, k_DTM) - if self.params_.get('gpu'): + if self.params_.get("gpu"): import torch from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? XX = torch.tensor(self.points_, dtype=torch.float32) if p == numpy.inf: # Requires a version of pykeops strictly more recent than 1.3 - dd, nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin_argKmin(kmax, dim=1) - elif p == 2: # Any even integer? - dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin_argKmin(kmax, dim=1) + dd, nn = ( + (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) + .abs() + .max(-1) + .Kmin_argKmin(kmax, dim=1) + ) + elif p == 2: # Any even integer? + dd, nn = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p) + .sum(-1) + .Kmin_argKmin(kmax, dim=1) + ) else: - dd, nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin_argKmin(kmax, dim=1) + dd, nn = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p) + .sum(-1) + .Kmin_argKmin(kmax, dim=1) + ) if k < kmax: nn = nn[:, 0:k] if k_DTM < kmax: dd = dd[:, 0:k_DTM] - assert q != numpy.inf # for now + assert q != numpy.inf # for now if p != numpy.inf: qp = float(q) / p else: qp = q if qp != 1: - dd = dd**qp + dd = dd ** qp weights = dd.sum(-1) # **1/q is a waste of time, whether we take another **-.25 or a log # Back to the CPU. Not sure this is necessary, or the right way to do it. 
weights = numpy.array(weights) self.neighbors_ = numpy.array(nn) - else: # CPU + else: # CPU from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) - qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} dd, self.neighbors_ = kdtree.query(self.points_, k=kmax, p=p, **qargs) if k < kmax: self.neighbors_ = self.neighbors_[:, 0:k] if k_DTM < kmax: dd = dd[:, 0:k_DTM] - #weights = numpy.linalg.norm(dd, axis=1, ord=q) - weights = (dd**q).sum(-1) + # weights = numpy.linalg.norm(dd, axis=1, ord=q) + weights = (dd ** q).sum(-1) # TODO: check the formula in Fred's paper - if self.density_type_ == 'DTM': - weights = weights ** (-.25 / q) + if self.density_type_ == "DTM": + weights = weights ** (-0.25 / q) else: weights = -numpy.log(weights) - if self.input_type_ == 'points' and self.graph_type_ == 'knn' and self.density_type_ not in {'DTM', 'logDTM'}: + if self.input_type_ == "points" and self.graph_type_ == "knn" and self.density_type_ not in {"DTM", "logDTM"}: self.points_ = X - p = self.params_.get('p', 2) - k = self.params_.get('k', 10) - if self.params_.get('gpu'): + p = self.params_.get("p", 2) + k = self.params_.get("k", 10) + if self.params_.get("gpu"): import torch from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? XX = torch.tensor(self.points_, dtype=torch.float32) if p == numpy.inf: # Requires a version of pykeops strictly more recent than 1.3 - nn = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).argKmin(k, dim=1) - elif p == 2: # Any even integer? - nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).argKmin(k, dim=1) + nn = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).argKmin(k, dim=1) + elif p == 2: # Any even integer? + nn = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).argKmin(k, dim=1) else: - nn = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).argKmin(k, dim=1) + nn = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p).sum(-1).argKmin(k, dim=1) + ) # Back to the CPU. Not sure this is necessary, or the right way to do it. self.neighbors_ = numpy.array(nn) - else: # CPU + else: # CPU from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) # FIXME 'p' - qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} _, self.neighbors_ = kdtree.query(self.points_, k=k, p=p, **qargs) - if self.input_type_ == 'points' and self.graph_type_ != 'knn' and self.density_type_ in {'DTM', 'logDTM'}: + if self.input_type_ == "points" and self.graph_type_ != "knn" and self.density_type_ in {"DTM", "logDTM"}: self.points_ = X - q = self.params_.get('p_DTM', 2) - p = self.params_.get('p', 2) - k = self.params_.get('k', 10) - k_DTM = self.params_.get('k_DTM', k) - if self.params_.get('gpu'): + q = self.params_.get("p_DTM", 2) + p = self.params_.get("p", 2) + k = self.params_.get("k", 10) + k_DTM = self.params_.get("k_DTM", k) + if self.params_.get("gpu"): import torch from pykeops.torch import LazyTensor + # 'float64' is slow except on super expensive GPUs. Allow it with some param? XX = torch.tensor(self.points_, dtype=torch.float32) if p == numpy.inf: - assert False # Not supported??? 
- dd = (LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs().max(-1).Kmin(k_DTM, dim=1) - elif p == 2: # Any even integer? - dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:]))**p).sum(-1).Kmin(k_DTM, dim=1) + assert False # Not supported??? + dd = (LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs().max(-1).Kmin(k_DTM, dim=1) + elif p == 2: # Any even integer? + dd = ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])) ** p).sum(-1).Kmin(k_DTM, dim=1) else: - dd = ((LazyTensor(XX[:,None,:])-LazyTensor(XX[None,:,:])).abs()**p).sum(-1).Kmin(k_DTM, dim=1) - assert q != numpy.inf # for now + dd = ( + ((LazyTensor(XX[:, None, :]) - LazyTensor(XX[None, :, :])).abs() ** p) + .sum(-1) + .Kmin(k_DTM, dim=1) + ) + assert q != numpy.inf # for now if p != numpy.inf: qp = float(q) / p else: qp = q if qp != 1: - dd = dd**qp + dd = dd ** qp weights = dd.sum(-1) # **1/q is a waste of time, whether we take another **-.25 or a log # Back to the CPU. Not sure this is necessary, or the right way to do it. weights = numpy.array(weights) - else: # CPU + else: # CPU from scipy.spatial import cKDTree + kdtree = cKDTree(self.points_) - qargs = { k:v for k,v in self.params_.items() if k in {'eps', 'n_jobs'}} + qargs = {k: v for k, v in self.params_.items() if k in {"eps", "n_jobs"}} dd, _ = kdtree.query(self.points_, k=k_DTM, p=p, **qargs) - #weights = numpy.linalg.norm(dd, axis=1, ord=q) - weights = (dd**q).sum(-1) + # weights = numpy.linalg.norm(dd, axis=1, ord=q) + weights = (dd ** q).sum(-1) # TODO: check the formula in Fred's paper - if self.density_type_ == 'DTM': - weights = weights ** (-.25 / q) + if self.density_type_ == "DTM": + weights = weights ** (-0.25 / q) else: weights = -numpy.log(weights) - if self.input_type_ == 'distance_matrix' and self.density_type_ in {'DTM', 'logDTM'}: - q = self.params_.get('p_DTM', 2) + if self.input_type_ == "distance_matrix" and self.density_type_ in {"DTM", "logDTM"}: + q = self.params_.get("p_DTM", 2) X = numpy.array(X) - k = self.params_.get('k_DTM') + k = self.params_.get("k_DTM") if not k: - k = self.params_['k'] - q = self.params_.get('p_DTM', 2) - weights = (numpy.partition(X)**q, k-1).sum(-1) + k = self.params_["k"] + q = self.params_.get("p_DTM", 2) + weights = (numpy.partition(X) ** q, k - 1).sum(-1) # TODO: check the formula in Fred's paper - if self.density_type_ == 'DTM': - weights = weights ** (-.25 / q) + if self.density_type_ == "DTM": + weights = weights ** (-0.25 / q) else: weights = -numpy.log(weights) - if self.density_type_ == 'kde': + if self.density_type_ == "kde": # FIXME: replace most assert with raise ValueError("blabla") - assert self.input_type_ == 'points' - kde_params = self.params_.get('kde_params', dict()) + assert self.input_type_ == "points" + kde_params = self.params_.get("kde_params", dict()) from sklearn.neighbors import KernelDensity + weights = KernelDensity(**kde_params).fit(X).score_samples(X) - if self.params_.get('symmetrize_graph'): + if self.params_.get("symmetrize_graph"): self.neighbors_ = [set(line) for line in self.neighbors_] - for i,line in enumerate(self.neighbors_): + for i, line in enumerate(self.neighbors_): line.discard(i) for j in line: self.neighbors_[j].add(i) - self.weights_=weights # TODO remove - self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit(list(self.neighbors_), weights) + self.weights_ = weights # TODO remove + self.leaf_labels_, self.children_, self.diagram_, self.max_density_per_cc_ = tomato.doit( + list(self.neighbors_), weights + ) 
self.n_leaves_ = len(self.max_density_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_density_per_cc_) + len(self.children_) if self.__n_clusters: @@ -265,65 +304,82 @@ class Tomato: """ """ import matplotlib.pyplot as plt - plt.plot(self.diagram_[:,0],self.diagram_[:,1],'ro') - l = self.diagram_[:,1].min() - r = max(self.diagram_[:,0].max(), self.max_density_per_cc_.max()) - plt.plot([l,r],[l,r]) - plt.plot(self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape,1.1*l-.1*r),'ro',color='green') + + plt.plot(self.diagram_[:, 0], self.diagram_[:, 1], "ro") + l = self.diagram_[:, 1].min() + r = max(self.diagram_[:, 0].max(), self.max_density_per_cc_.max()) + plt.plot([l, r], [l, r]) + plt.plot( + self.max_density_per_cc_, numpy.full(self.max_density_per_cc_.shape, 1.1 * l - 0.1 * r), "ro", color="green" + ) plt.show() -# def predict(self, X): -# # X had better be the same as in fit() -# return labels_ + # def predict(self, X): + # # X had better be the same as in fit() + # return labels_ # Use set_params instead? @property def n_clusters_(self): return self.__n_clusters + @n_clusters_.setter def n_clusters_(self, n_clusters): if n_clusters == self.__n_clusters: return self.__n_clusters = n_clusters self.__merge_threshold = None - if hasattr(self, 'leaf_labels_'): + if hasattr(self, "leaf_labels_"): renaming = tomato.merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] @property def merge_threshold_(self): return self.__merge_threshold + @merge_threshold_.setter def merge_threshold_(self, merge_threshold): if merge_threshold == self.__merge_threshold: return - if hasattr(self, 'leaf_labels_'): - self.n_clusters_ = numpy.count_nonzero(self.diagram_[1]-self.diagram_[0] > merge_threshold) + len(max_density_per_cc_) + if hasattr(self, "leaf_labels_"): + self.n_clusters_ = numpy.count_nonzero(self.diagram_[1] - self.diagram_[0] > merge_threshold) + len( + max_density_per_cc_ + ) else: self.__n_clusters = None self.__merge_threshold = merge_threshold -if __name__ == '__main__': +if __name__ == "__main__": import sys - a=[(1,2),(1.1,1.9),(.9,1.8),(10,0),(10.1,.05),(10.2,-.1),(5.4,0)] - a=numpy.random.rand(500,2) - t=Tomato(input_type='points', metric='Euclidean', graph_type='knn', density_type='DTM', n_clusters=2, k=4, n_jobs=-1, eps=.05) + + a = [(1, 2), (1.1, 1.9), (0.9, 1.8), (10, 0), (10.1, 0.05), (10.2, -0.1), (5.4, 0)] + a = numpy.random.rand(500, 2) + t = Tomato( + input_type="points", + metric="Euclidean", + graph_type="knn", + density_type="DTM", + n_clusters=2, + k=4, + n_jobs=-1, + eps=0.05, + ) t.fit(a) - #print("neighbors\n",t.neighbors_) - #print() - #print("weights\n",t.weights_) - #print() - #print("diagram\n",t.diagram_) - #print() - print("max\n",t.max_density_per_cc_, file=sys.stderr) - #print() - print("leaf labels\n",t.leaf_labels_) - #print() - print("labels\n",t.labels_) - #print() - print("children\n",t.children_) - #print() + # print("neighbors\n",t.neighbors_) + # print() + # print("weights\n",t.weights_) + # print() + # print("diagram\n",t.diagram_) + # print() + print("max\n", t.max_density_per_cc_, file=sys.stderr) + # print() + print("leaf labels\n", t.leaf_labels_) + # print() + print("labels\n", t.labels_) + # print() + print("children\n", t.children_) + # print() t.n_clusters_ = 2 - print("labels\n",t.labels_) + print("labels\n", t.labels_) t.plot_diagram() -- cgit v1.2.3 From c6f519abe3d2fe424d9982ad8139ff2a86119bca Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 25 May 
2020 20:55:06 +0200 Subject: Make specifying an impossible number of clusters a warning --- src/python/gudhi/clustering/_tomato.cc | 18 ++++++++---------- src/python/gudhi/clustering/tomato.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'src/python/gudhi/clustering/_tomato.cc') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index 87bd62e9..638e1259 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -195,21 +195,19 @@ auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& d } auto merge(py::array_t children, Cluster_index n_leaves, Cluster_index n_final) { - // Should this really be an error? - if (n_final > n_leaves) - throw std::runtime_error("The number of clusters required is larger than the number of mini-clusters"); + if (n_final > n_leaves) { + std::cerr << "The number of clusters required " << n_final << " is larger than the number of mini-clusters " << n_leaves << '\n'; + n_final = n_leaves; // or return something special and let Tomato use leaf_labels_? + } py::buffer_info cbuf = children.request(); if ((cbuf.ndim != 2 || cbuf.shape[1] != 2) && (cbuf.ndim != 1 || cbuf.shape[0] != 0)) throw std::runtime_error("internal error: children have to be (n,2) or empty"); const int n_merges = cbuf.shape[0]; Cluster_index* d = (Cluster_index*)cbuf.ptr; - // Should this really be an error? - // std::cerr << "n_merges: " << n_merges << ", n_final: " << n_final << ", n_leaves: " << n_leaves << '\n'; - if (n_merges + n_final < n_leaves) - throw std::runtime_error(std::string("The number of clusters required ") + std::to_string(n_final) + - " is smaller than the number of connected components " + - std::to_string(n_leaves - n_merges)); - + if (n_merges + n_final < n_leaves) { + std::cerr << "The number of clusters required " << n_final << " is smaller than the number of connected components " << n_leaves - n_merges << '\n'; + n_final = n_leaves - n_merges; + } struct Dat { Cluster_index parent; int rank; diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 7e97819b..867c46a1 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -24,11 +24,11 @@ class Tomato: cluster labels for each point, at the very bottom of the hierarchy labels_: ndarray of shape (n_samples,) cluster labels for each point, after merging - diagram_: ndarray of shape (n_leaves_,2) + diagram_: ndarray of shape (n_leaves_, 2) persistence diagram (only the finite points) max_weight_per_cc_: ndarray of shape (n_connected_components,) maximum of the density function on each connected component. This corresponds to the abscissa of infinite points in the diagram - children_: ndarray of shape (n_leaves_-1,2) + children_: ndarray of shape (n_leaves_-n_connected_components, 2) The children of each non-leaf node. Values less than n_leaves_ correspond to leaves of the tree. A node i greater than or equal to n_leaves_ is a non-leaf node and has children children_[i - n_leaves_]. 
Alternatively at the i-th iteration, children[i][0] and children[i][1] are merged to form node n_leaves_ + i weights_: ndarray of shape (n_samples,) weights of the points, as computed by the density estimator or provided by the user @@ -206,6 +206,7 @@ class Tomato: ) self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_) assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_) + # TODO: deduplicate this code with the setters below if self.__merge_threshold: assert not self.__n_clusters self.__n_clusters = numpy.count_nonzero( @@ -215,6 +216,9 @@ class Tomato: # TODO: set corresponding merge_threshold? renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] + # In case the user asked for something impossible. + # TODO: check for impossible situations before calling merge. + self.__n_clusters = self.labels_.max() + 1 else: self.labels_ = self.leaf_labels_ self.__n_clusters = self.n_leaves_ @@ -269,6 +273,8 @@ class Tomato: if hasattr(self, "leaf_labels_"): renaming = merge(self.children_, self.n_leaves_, self.__n_clusters) self.labels_ = renaming[self.leaf_labels_] + # In case the user asked for something impossible + self.__n_clusters = self.labels_.max() + 1 @property def merge_threshold_(self): -- cgit v1.2.3 From 5a78dc70afe172e8f38bff09639be21fc92b1fb4 Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Tue, 26 May 2020 11:42:51 +0200 Subject: license --- src/python/gudhi/clustering/_tomato.cc | 11 ++++++++++- src/python/gudhi/clustering/tomato.py | 10 +++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'src/python/gudhi/clustering/_tomato.cc') diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc index 638e1259..ce01834a 100644 --- a/src/python/gudhi/clustering/_tomato.cc +++ b/src/python/gudhi/clustering/_tomato.cc @@ -1,4 +1,13 @@ -// g++ -O3 -Wall -shared -fPIC `python3 -m pybind11 --includes` XXX.cpp -o tomato`python3-config --extension-suffix` +/* This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. + * See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. + * Author(s): Marc Glisse + * + * Copyright (C) 2020 Inria + * + * Modification(s): + * - YYYY/MM Author: Description of the modification + */ + #include #include #include diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py index 901a9399..29642662 100644 --- a/src/python/gudhi/clustering/tomato.py +++ b/src/python/gudhi/clustering/tomato.py @@ -1,10 +1,18 @@ +# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. +# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. +# Author(s): Marc Glisse +# +# Copyright (C) 2020 Inria +# +# Modification(s): +# - YYYY/MM Author: Description of the modification + import numpy from ..point_cloud.knn import KNearestNeighbors from ..point_cloud.dtm import DTMDensity from ._tomato import * # The fit/predict interface is not so well suited... -# FIXME: choose if they are called weight, density, filtration, etc and be consistent. 
 class Tomato:
-- cgit v1.2.3


From cf617a8bf5b9701dad69b1d47b449654ab32453c Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Tue, 26 May 2020 18:34:12 +0200
Subject: rename function to hierarchy

---
 src/python/gudhi/clustering/_tomato.cc | 6 +++---
 src/python/gudhi/clustering/tomato.py  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
index ce01834a..f2e73ed6 100644
--- a/src/python/gudhi/clustering/_tomato.cc
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -47,8 +47,8 @@ struct Merge {
 };
 
 template <class Neighbors, class Density, class Order, class ROrder>
-auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
-            ROrder const& rorder) {
+auto hierarchy(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
+               ROrder const& rorder) {
   // point index --> index of raw cluster it belongs to
   std::vector<Cluster_index> raw_cluster;
   raw_cluster.reserve(num_points);
@@ -302,7 +302,7 @@ auto plaf(py::array_t<int, py::array::c_style | py::array::forcecast> ngb, py::a
 #endif
 PYBIND11_MODULE(_tomato, m) {
   m.doc() = "Internals of tomato clustering";
-  m.def("doit", &plouf, "does the clustering");
+  m.def("hierarchy", &hierarchy, "does the clustering");
   // m.def("doit2", &plaf, "does the clustering faster");
   m.def("merge", &merge, "merge clusters");
 }
diff --git a/src/python/gudhi/clustering/tomato.py b/src/python/gudhi/clustering/tomato.py
index 62fac8db..d5c5daac 100644
--- a/src/python/gudhi/clustering/tomato.py
+++ b/src/python/gudhi/clustering/tomato.py
@@ -229,7 +229,7 @@ class Tomato:
             self.neighbors_[j].add(i)
 
         self.weights_ = weights
-        self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = doit(self.neighbors_, weights)
+        self.leaf_labels_, self.children_, self.diagram_, self.max_weight_per_cc_ = hierarchy(self.neighbors_, weights)
         self.n_leaves_ = len(self.max_weight_per_cc_) + len(self.children_)
         assert self.leaf_labels_.max() + 1 == len(self.max_weight_per_cc_) + len(self.children_)
         # TODO: deduplicate this code with the setters below
-- cgit v1.2.3


From 125ad079fd4e07556b96e38b28c51807d349718c Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Tue, 26 May 2020 18:37:19 +0200
Subject: Move comment

---
 src/python/gudhi/clustering/_tomato.cc | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
index f2e73ed6..cd382d57 100644
--- a/src/python/gudhi/clustering/_tomato.cc
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -252,7 +252,10 @@ auto merge(py::array_t<Cluster_index, py::array::c_style | py::array::forcecast> children, Cluster_inde
   return py::array(ret.size(), ret.data());
 }
 
-// Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
+// TODO: Do a special version when ngb is a numpy array, where we can cast to int[k][n] ?
+// py::isinstance<py::array_t<int>> (or py::isinstance<py::array> and test the dtype) and flags&c_style
+// or overload (dropping forcecast?)
+// also do it for the case where we don't have one big array, but where each list of neighbors is an array?
 auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
   // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
   // use py::handle and check if [] (aka __getitem__) works?
   // But then we need to build an object to pass it to []
@@ -306,9 +309,3 @@ PYBIND11_MODULE(_tomato, m) {
   // m.def("doit2", &plaf, "does the clustering faster");
   m.def("merge", &merge, "merge clusters");
 }
-
-// https://github.com/pybind/pybind11/issues/1042 to convert a vector to a numpy array
-//
-// py::isinstance<py::array_t<int>> (or py::isinstance<py::array> and test the dtype) and flags&c_style
-// or overload (dropping forcecast?)
-// also do it for the case where we don't have one big array, but where each list of neighbors is an array?
-- cgit v1.2.3


From a3dddd2598d3b7f59e3d59049f71edf6e54e150c Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Tue, 26 May 2020 18:45:08 +0200
Subject: Remove dead code

Some of it could be useful later, but I guess there is a limit to how much
extra code I can keep in comments...
---
 src/python/gudhi/clustering/_tomato.cc | 35 +---------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
index cd382d57..0ec703ca 100644
--- a/src/python/gudhi/clustering/_tomato.cc
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -144,21 +144,12 @@ auto hierarchy(Point_index num_points, Neighbors const& neighbors, Density const
   }
   // Maximum for each connected component
   std::vector<double> max_cc;
-  // for(Point_index i = 0; i < num_points; ++i){
-  //  if(ds_base[ds_base[raw_cluster[i]].parent].max == i)
-  //    max_cc.push_back(density[order[i]]);
-  //}
   for (Cluster_index i = 0; i < n_raw_clusters; ++i) {
     if (ds_base[i].parent == i) max_cc.push_back(density[order[ds_base[i].max]]);
   }
   assert((Cluster_index)(merges.size() + max_cc.size()) == n_raw_clusters);
 
   // TODO: create a "noise" cluster, merging all those not prominent enough?
-  // int nb_clusters=0;
-  // for(int i=0;i<(int)ds_base.size();++i){
-  //  if(ds_parent[i]!=i) continue;
-  //  ds_data[i].rank=nb_clusters++;
-  //}
 
   // Replay the merges, in increasing order of prominence, to build the hierarchy
@@ -278,31 +269,7 @@ auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
   return tomato(n, ngb, d, order, rorder);
 }
-#if 0
-struct Vec {
-  int const* base;
-  int k;
-  int const* begin()const{return base;}
-  int const* end()const{return base+k;}
-  Vec operator*()const{return *this;}
-};
-auto plaf(py::array_t<int, py::array::c_style | py::array::forcecast> ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density, double threshold){
-  py::buffer_info nbuf = ngb.request();
-  if(nbuf.ndim!=2)
-    throw std::runtime_error("neighbors must be 2D");
-  const int n=nbuf.shape[0];
-  const int k=nbuf.shape[1];
-  int*nptr=(int*)nbuf.ptr;
-  auto neighbors=boost::adaptors::transform(boost::irange(0,n),[=](int i){return Vec{nptr+i*k,k};});
-  py::buffer_info wbuf = density.request();
-  if(wbuf.ndim!=1)
-    throw std::runtime_error("density must be 1D");
-  if(n!=wbuf.shape[0])
-    throw std::runtime_error("incompatible sizes");
-  double*d=(double*)wbuf.ptr;
-  return tomato(n,neighbors,d);
-}
-#endif
+
 PYBIND11_MODULE(_tomato, m) {
   m.doc() = "Internals of tomato clustering";
   m.def("hierarchy", &hierarchy, "does the clustering");
-- cgit v1.2.3


From 16e8f92f0635da668f9f4602f4b7bb4086045a9d Mon Sep 17 00:00:00 2001
From: Marc Glisse
Date: Tue, 26 May 2020 19:11:56 +0200
Subject: Fix bad renaming

---
 src/python/gudhi/clustering/_tomato.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/python/gudhi/clustering/_tomato.cc b/src/python/gudhi/clustering/_tomato.cc
index 0ec703ca..a76a2c3a 100644
--- a/src/python/gudhi/clustering/_tomato.cc
+++ b/src/python/gudhi/clustering/_tomato.cc
@@ -47,8 +47,8 @@ struct Merge {
 };
 
 template <class Neighbors, class Density, class Order, class ROrder>
-auto hierarchy(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
-               ROrder const& rorder) {
+auto tomato(Point_index num_points, Neighbors const& neighbors, Density const& density, Order const& order,
+            ROrder const& rorder) {
@@ -247,7 +247,7 @@ auto merge(py::array_t<Cluster_index, py::array::c_style | py::array::forcecast> children, Cluster_inde
 // py::isinstance<py::array_t<int>> (or py::isinstance<py::array> and test the dtype) and flags&c_style
 // or overload (dropping forcecast?)
 // also do it for the case where we don't have one big array, but where each list of neighbors is an array?
-auto plouf(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
+auto hierarchy(py::handle ngb, py::array_t<double, py::array::c_style | py::array::forcecast> density) {
   // used to be py::iterable ngb, but that's inconvenient if it doesn't come pre-sorted
   // use py::handle and check if [] (aka __getitem__) works? But then we need to build an object to pass it to []
   // (I _think_ handle is ok and we don't need object here)
@@ -273,6 +273,5 @@ auto plouf(py::handle ngb, py::array_t
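[Editorial note: the cgit excerpt stops mid-hunk above. As a closing illustration of the contract that merge() implements, and that tomato.py relies on when it evaluates renaming[self.leaf_labels_]: replay the first n_leaves - n_final rows of children, then name the surviving roots 0, 1, ... in order of first appearance over the leaves. Below is a pure-Python sanity check of those semantics; merge_py is an invented name and this sketch is not GUDHI API.]

    # Pure-Python equivalent of merge()'s semantics (sanity-check sketch only).
    import numpy as np

    def merge_py(children, n_leaves, n_final):
        # Union-find over the leaves plus the nodes created by replayed merges;
        # the i-th merge creates node n_leaves + i (cf. the children_ docstring).
        parent = list(range(2 * n_leaves - 1))

        def find(x):
            while parent[x] != x:
                parent[x] = parent[parent[x]]  # path halving
                x = parent[x]
            return x

        node = n_leaves
        for a, b in np.asarray(children).reshape(-1, 2)[: n_leaves - n_final]:
            parent[find(int(a))] = parent[find(int(b))] = node
            node += 1
        # Rename surviving roots contiguously, in order of first appearance,
        # like next_cluster_name / ds_bas[k].name in the C++ version.
        names = {}
        return np.array([names.setdefault(find(leaf), len(names)) for leaf in range(n_leaves)])

    # 4 mini-clusters, merges (0,1) then (2,3); asking for 2 clusters:
    print(merge_py(np.array([[0, 1], [2, 3]]), 4, 2))  # -> [0 0 1 1]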