/* This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT. * See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details. * Author(s): Pawel Dlotko * * Copyright (C) 2016 Inria * * Modification(s): * - YYYY/MM Author: Description of the modification * - 2019/12 Vincent Rouvreau: Fix #118 - Make histogram_of_lengths and cumulative_histogram_of_lengths * return the exact number_of_bins (was failing on x86) */ #ifndef PERSISTENCE_INTERVALS_H_ #define PERSISTENCE_INTERVALS_H_ // gudhi include #include // standard include #include #include #include #include #include #include #include #include #include namespace Gudhi { namespace Persistence_representations { /** * This class implements the following concepts: Vectorized_topological_data, Topological_data_with_distances, *Real_valued_topological_data **/ class Persistence_intervals { public: /** * This is a constructor of a class Persistence_intervals from a text file. Each line of the input file is supposed to *contain two numbers of a type double (or convertible to double) * representing the birth and the death of the persistence interval. If the pairs are not sorted so that birth <= *death, then the constructor will sort then that way. * * The second parameter of a constructor is a dimension of intervals to be read from a file. If your file contains *only birth-death pairs, use the default value. **/ Persistence_intervals(const char* filename, unsigned dimension = std::numeric_limits::max()); /** * This is a constructor of a class Persistence_intervals from a vector of pairs. Each pair is assumed to represent a *persistence interval. We assume that the first elements of pairs * are smaller or equal the second elements of pairs. **/ Persistence_intervals(const std::vector >& intervals); /** * This procedure returns x-range of a given persistence diagram. **/ std::pair get_x_range() const { double min_ = std::numeric_limits::max(); double max_ = -std::numeric_limits::max(); for (size_t i = 0; i != this->intervals.size(); ++i) { if (this->intervals[i].first < min_) min_ = this->intervals[i].first; if (this->intervals[i].second > max_) max_ = this->intervals[i].second; } return std::make_pair(min_, max_); } /** * This procedure returns y-range of a given persistence diagram. **/ std::pair get_y_range() const { double min_ = std::numeric_limits::max(); double max_ = -std::numeric_limits::max(); for (size_t i = 0; i != this->intervals.size(); ++i) { if (this->intervals[i].second < min_) min_ = this->intervals[i].second; if (this->intervals[i].second > max_) max_ = this->intervals[i].second; } return std::make_pair(min_, max_); } /** * Procedure that compute the vector of lengths of the dominant (i.e. the longest) persistence intervals. The list is *truncated at the parameter of the call where_to_cut (set by default to 100). **/ std::vector length_of_dominant_intervals(size_t where_to_cut = 100) const; /** * Procedure that compute the vector of the dominant (i.e. the longest) persistence intervals. The parameter of *the procedure (set by default to 100) is the number of dominant intervals returned by the procedure. **/ std::vector > dominant_intervals(size_t where_to_cut = 100) const; /** * Procedure to compute a histogram of interval's length. A histogram is a block plot. The number of blocks is *determined by the first parameter of the function (set by default to 10). * For the sake of argument let us assume that the length of the longest interval is 1 and the number of bins is *10. In this case the i-th block correspond to a range between i-1/10 and i10. * The vale of a block supported at the interval is the number of persistence intervals of a length between x_0 *and x_1. **/ std::vector histogram_of_lengths(size_t number_of_bins = 10) const; /** * Based on a histogram of intervals lengths computed by the function histogram_of_lengths H the procedure below *computes the cumulative histogram. The i-th position of the resulting histogram * is the sum of values of H for the positions from 0 to i. **/ std::vector cumulative_histogram_of_lengths(size_t number_of_bins = 10) const; /** * In this procedure we assume that each barcode is a characteristic function of a hight equal to its length. The *persistence diagram is a sum of such a functions. The procedure below construct a function being a * sum of the characteristic functions of persistence intervals. The first two parameters are the range in which the *function is to be computed and the last parameter is the number of bins in * the discretization of the interval [_min,_max]. **/ std::vector characteristic_function_of_diagram(double x_min, double x_max, size_t number_of_bins = 10) const; /** * Cumulative version of the function characteristic_function_of_diagram **/ std::vector cumulative_characteristic_function_of_diagram(double x_min, double x_max, size_t number_of_bins = 10) const; /** * Compute the function of persistence Betti numbers. The returned value is a vector of pair. First element of each *pair is a place where persistence Betti numbers change. * Second element of each pair is the value of Persistence Betti numbers at that point. **/ std::vector > compute_persistent_betti_numbers() const; /** *This is a non optimal procedure that compute vector of distances from each point of diagram to its k-th nearest *neighbor (k is a parameter of the program). The resulting vector is by default truncated to 10 *elements (this value can be changed by using the second parameter of the program). The points are returned in order *from the ones which are farthest away from their k-th nearest neighbors. **/ std::vector k_n_n(size_t k, size_t where_to_cut = 10) const; /** * Operator that send the diagram to a stream. **/ friend std::ostream& operator<<(std::ostream& out, const Persistence_intervals& intervals) { for (size_t i = 0; i != intervals.intervals.size(); ++i) { out << intervals.intervals[i].first << " " << intervals.intervals[i].second << std::endl; } return out; } /** * Generating gnuplot script to plot the interval. **/ void plot(const char* filename, double min_x = std::numeric_limits::max(), double max_x = std::numeric_limits::max(), double min_y = std::numeric_limits::max(), double max_y = std::numeric_limits::max()) const { // this program create a gnuplot script file that allows to plot persistence diagram. std::ofstream out; std::stringstream gnuplot_script; gnuplot_script << filename << "_GnuplotScript"; out.open(gnuplot_script.str().c_str()); std::pair min_max_values = this->get_x_range(); if (min_x == max_x) { out << "set xrange [" << min_max_values.first - 0.1 * (min_max_values.second - min_max_values.first) << " : " << min_max_values.second + 0.1 * (min_max_values.second - min_max_values.first) << " ]" << std::endl; out << "set yrange [" << min_max_values.first - 0.1 * (min_max_values.second - min_max_values.first) << " : " << min_max_values.second + 0.1 * (min_max_values.second - min_max_values.first) << " ]" << std::endl; } else { out << "set xrange [" << min_x << " : " << max_x << " ]" << std::endl; out << "set yrange [" << min_y << " : " << max_y << " ]" << std::endl; } out << "plot '-' using 1:2 notitle \"" << filename << "\", \\" << std::endl; out << " '-' using 1:2 notitle with lp" << std::endl; for (size_t i = 0; i != this->intervals.size(); ++i) { out << this->intervals[i].first << " " << this->intervals[i].second << std::endl; } out << "EOF" << std::endl; out << min_max_values.first - 0.1 * (min_max_values.second - min_max_values.first) << " " << min_max_values.first - 0.1 * (min_max_values.second - min_max_values.first) << std::endl; out << min_max_values.second + 0.1 * (min_max_values.second - min_max_values.first) << " " << min_max_values.second + 0.1 * (min_max_values.second - min_max_values.first) << std::endl; out.close(); std::cout << "To visualize, install gnuplot and type the command: gnuplot -persist -e \"load \'" << gnuplot_script.str().c_str() << "\'\"" << std::endl; } /** * Return number of points in the diagram. **/ size_t size() const { return this->intervals.size(); } /** * Return the persistence interval at the given position. Note that intervals are not sorted with respect to their *lengths. **/ inline std::pair operator[](size_t i) const { if (i >= this->intervals.size()) throw("Index out of range! Operator [], one_d_gaussians class\n"); return this->intervals[i]; } // Implementations of functions for various concepts. /** * This is a simple function projecting the persistence intervals to a real number. The function we use here is a sum *of squared lengths of intervals. It can be naturally interpreted as * sum of step function, where the step hight it equal to the length of the interval. * At the moment this function is not tested, since it is quite likely to be changed in the future. Given this, when *using it, keep in mind that it * will be most likely changed in the next versions. **/ double project_to_R(int number_of_function) const; /** * The function gives the number of possible projections to R. This function is required by the *Real_valued_topological_data concept. **/ size_t number_of_projections_to_R() const { return this->number_of_functions_for_projections_to_reals; } /** * Return a family of vectors obtained from the persistence diagram. The i-th vector consist of the length of i *dominant persistence intervals. **/ std::vector vectorize(int number_of_function) const { return this->length_of_dominant_intervals(number_of_function); } /** * This function return the number of functions that allows vectorization of a persistence diagram. It is required *in a concept Vectorized_topological_data. **/ size_t number_of_vectorize_functions() const { return this->number_of_functions_for_vectorization; } // end of implementation of functions needed for concepts. // For visualization use output from vectorize and build histograms. std::vector > output_for_visualization() { return this->intervals; } protected: void set_up_numbers_of_functions_for_vectorization_and_projections_to_reals() { // warning, this function can be only called after filling in the intervals vector. this->number_of_functions_for_vectorization = this->intervals.size(); this->number_of_functions_for_projections_to_reals = 1; } std::vector > intervals; size_t number_of_functions_for_vectorization; size_t number_of_functions_for_projections_to_reals; }; Persistence_intervals::Persistence_intervals(const char* filename, unsigned dimension) { if (dimension == std::numeric_limits::max()) { this->intervals = read_persistence_intervals_in_one_dimension_from_file(filename); } else { this->intervals = read_persistence_intervals_in_one_dimension_from_file(filename, dimension); } this->set_up_numbers_of_functions_for_vectorization_and_projections_to_reals(); } // Persistence_intervals Persistence_intervals::Persistence_intervals(const std::vector >& intervals_) : intervals(intervals_) { this->set_up_numbers_of_functions_for_vectorization_and_projections_to_reals(); } std::vector Persistence_intervals::length_of_dominant_intervals(size_t where_to_cut) const { std::vector result(this->intervals.size()); for (size_t i = 0; i != this->intervals.size(); ++i) { result[i] = this->intervals[i].second - this->intervals[i].first; } std::sort(result.begin(), result.end(), std::greater()); result.resize(std::min(where_to_cut, result.size())); return result; } // length_of_dominant_intervals bool compare(const std::pair& first, const std::pair& second) { return first.second > second.second; } std::vector > Persistence_intervals::dominant_intervals(size_t where_to_cut) const { bool dbg = false; std::vector > position_length_vector(this->intervals.size()); for (size_t i = 0; i != this->intervals.size(); ++i) { position_length_vector[i] = std::make_pair(i, this->intervals[i].second - this->intervals[i].first); } std::sort(position_length_vector.begin(), position_length_vector.end(), compare); std::vector > result; result.reserve(std::min(where_to_cut, position_length_vector.size())); for (size_t i = 0; i != std::min(where_to_cut, position_length_vector.size()); ++i) { result.push_back(this->intervals[position_length_vector[i].first]); if (dbg) std::cerr << "Position : " << position_length_vector[i].first << " length : " << position_length_vector[i].second << std::endl; } return result; } // dominant_intervals std::vector Persistence_intervals::histogram_of_lengths(size_t number_of_bins) const { bool dbg = false; if (dbg) std::cerr << "this->intervals.size() : " << this->intervals.size() << std::endl; // first find the length of the longest interval: double lengthOfLongest = 0; for (size_t i = 0; i != this->intervals.size(); ++i) { if ((this->intervals[i].second - this->intervals[i].first) > lengthOfLongest) { lengthOfLongest = this->intervals[i].second - this->intervals[i].first; } } if (dbg) { std::cerr << "lengthOfLongest : " << lengthOfLongest << std::endl; } // this is a container we will use to store the resulting histogram std::vector result(number_of_bins + 1, 0); // for every persistence interval in our collection. for (size_t i = 0; i != this->intervals.size(); ++i) { // compute its length relative to the length of the dominant interval: double relative_length_of_this_interval = (this->intervals[i].second - this->intervals[i].first) / lengthOfLongest; // given the relative length (between 0 and 1) compute to which bin should it contribute. size_t position = (size_t)(relative_length_of_this_interval * number_of_bins); ++result[position]; if (dbg) { std::cerr << "i : " << i << std::endl; std::cerr << "Interval : [" << this->intervals[i].first << " , " << this->intervals[i].second << " ] \n"; std::cerr << "relative_length_of_this_interval : " << relative_length_of_this_interval << std::endl; std::cerr << "position : " << position << std::endl; getchar(); } } // we want number of bins equals to number_of_bins (some unexpected results on x86) result[number_of_bins-1]+=result[number_of_bins]; result.resize(number_of_bins); if (dbg) { for (size_t i = 0; i != result.size(); ++i) std::cerr << result[i] << std::endl; } return result; } std::vector Persistence_intervals::cumulative_histogram_of_lengths(size_t number_of_bins) const { std::vector histogram = this->histogram_of_lengths(number_of_bins); std::vector result(histogram.size()); size_t sum = 0; for (size_t i = 0; i != histogram.size(); ++i) { sum += histogram[i]; result[i] = sum; } return result; } std::vector Persistence_intervals::characteristic_function_of_diagram(double x_min, double x_max, size_t number_of_bins) const { bool dbg = false; std::vector result(number_of_bins); std::fill(result.begin(), result.end(), 0); for (size_t i = 0; i != this->intervals.size(); ++i) { if (dbg) { std::cerr << "Interval : " << this->intervals[i].first << " , " << this->intervals[i].second << std::endl; } size_t beginIt = 0; if (this->intervals[i].first < x_min) beginIt = 0; if (this->intervals[i].first >= x_max) beginIt = result.size(); if ((this->intervals[i].first > x_min) && (this->intervals[i].first < x_max)) { beginIt = number_of_bins * (this->intervals[i].first - x_min) / (x_max - x_min); } size_t endIt = 0; if (this->intervals[i].second < x_min) endIt = 0; if (this->intervals[i].second >= x_max) endIt = result.size(); if ((this->intervals[i].second > x_min) && (this->intervals[i].second < x_max)) { endIt = number_of_bins * (this->intervals[i].second - x_min) / (x_max - x_min); } if (beginIt > endIt) { beginIt = endIt; } if (dbg) { std::cerr << "beginIt : " << beginIt << std::endl; std::cerr << "endIt : " << endIt << std::endl; } for (size_t pos = beginIt; pos != endIt; ++pos) { result[pos] += ((x_max - x_min) / static_cast(number_of_bins)) * (this->intervals[i].second - this->intervals[i].first); } if (dbg) { std::cerr << "Result at this stage \n"; for (size_t aa = 0; aa != result.size(); ++aa) { std::cerr << result[aa] << " "; } std::cerr << std::endl; } } return result; } // characteristic_function_of_diagram std::vector Persistence_intervals::cumulative_characteristic_function_of_diagram(double x_min, double x_max, size_t number_of_bins) const { std::vector intsOfBars = this->characteristic_function_of_diagram(x_min, x_max, number_of_bins); std::vector result(intsOfBars.size()); double sum = 0; for (size_t i = 0; i != intsOfBars.size(); ++i) { sum += intsOfBars[i]; result[i] = sum; } return result; } // cumulative_characteristic_function_of_diagram template bool compare_first_element_of_pair(const std::pair& f, const std::pair& s) { return (f.first < s.first); } std::vector > Persistence_intervals::compute_persistent_betti_numbers() const { std::vector > places_where_pbs_change(2 * this->intervals.size()); for (size_t i = 0; i != this->intervals.size(); ++i) { places_where_pbs_change[2 * i] = std::make_pair(this->intervals[i].first, true); places_where_pbs_change[2 * i + 1] = std::make_pair(this->intervals[i].second, false); } std::sort(places_where_pbs_change.begin(), places_where_pbs_change.end(), compare_first_element_of_pair); size_t pbn = 0; std::vector > pbns(places_where_pbs_change.size()); for (size_t i = 0; i != places_where_pbs_change.size(); ++i) { if (places_where_pbs_change[i].second == true) { ++pbn; } else { --pbn; } pbns[i] = std::make_pair(places_where_pbs_change[i].first, pbn); } return pbns; } inline double compute_euclidean_distance(const std::pair& f, const std::pair& s) { return sqrt((f.first - s.first) * (f.first - s.first) + (f.second - s.second) * (f.second - s.second)); } std::vector Persistence_intervals::k_n_n(size_t k, size_t where_to_cut) const { bool dbg = false; if (dbg) { std::cerr << "Here are the intervals : \n"; for (size_t i = 0; i != this->intervals.size(); ++i) { std::cerr << "[ " << this->intervals[i].first << " , " << this->intervals[i].second << "] \n"; } getchar(); } std::vector result; // compute all to all distance between point in the diagram. Also, consider points in the diagonal with the infinite // multiplicity. std::vector > distances(this->intervals.size()); for (size_t i = 0; i != this->intervals.size(); ++i) { std::vector aa(this->intervals.size()); std::fill(aa.begin(), aa.end(), 0); distances[i] = aa; } std::vector distances_from_diagonal(this->intervals.size()); std::fill(distances_from_diagonal.begin(), distances_from_diagonal.end(), 0); for (size_t i = 0; i != this->intervals.size(); ++i) { std::vector distancesFromI; for (size_t j = i + 1; j != this->intervals.size(); ++j) { distancesFromI.push_back(compute_euclidean_distance(this->intervals[i], this->intervals[j])); } // also add a distance from this guy to diagonal: double distanceToDiagonal = compute_euclidean_distance( this->intervals[i], std::make_pair(0.5 * (this->intervals[i].first + this->intervals[i].second), 0.5 * (this->intervals[i].first + this->intervals[i].second))); distances_from_diagonal[i] = distanceToDiagonal; if (dbg) { std::cerr << "Here are the distances form the point : [" << this->intervals[i].first << " , " << this->intervals[i].second << "] in the diagram \n"; for (size_t aa = 0; aa != distancesFromI.size(); ++aa) { std::cerr << "To : " << i + aa << " : " << distancesFromI[aa] << " "; } std::cerr << std::endl; getchar(); } // filling in the distances matrix: for (size_t j = i + 1; j != this->intervals.size(); ++j) { distances[i][j] = distancesFromI[j - i - 1]; distances[j][i] = distancesFromI[j - i - 1]; } } if (dbg) { std::cerr << "Here is the distance matrix : \n"; for (size_t i = 0; i != distances.size(); ++i) { for (size_t j = 0; j != distances.size(); ++j) { std::cerr << distances[i][j] << " "; } std::cerr << std::endl; } std::cerr << std::endl << std::endl << "And here are the distances to the diagonal : " << std::endl; for (size_t i = 0; i != distances_from_diagonal.size(); ++i) { std::cerr << distances_from_diagonal[i] << " "; } std::cerr << std::endl << std::endl; getchar(); } for (size_t i = 0; i != this->intervals.size(); ++i) { std::vector distancesFromI = distances[i]; distancesFromI.push_back(distances_from_diagonal[i]); // sort it: std::sort(distancesFromI.begin(), distancesFromI.end(), std::greater()); if (k > distancesFromI.size()) { if (dbg) { std::cerr << "There are not enough neighbors in your set. We set the result to plus infty \n"; } result.push_back(std::numeric_limits::max()); } else { if (distances_from_diagonal[i] > distancesFromI[k]) { if (dbg) { std::cerr << "The k-th n.n. is on a diagonal. Therefore we set up a distance to diagonal \n"; } result.push_back(distances_from_diagonal[i]); } else { result.push_back(distancesFromI[k]); } } } std::sort(result.begin(), result.end(), std::greater()); result.resize(std::min(result.size(), where_to_cut)); return result; } double Persistence_intervals::project_to_R(int number_of_function) const { double result = 0; for (size_t i = 0; i != this->intervals.size(); ++i) { result += (this->intervals[i].second - this->intervals[i].first) * (this->intervals[i].second - this->intervals[i].first); } return result; } } // namespace Persistence_representations } // namespace Gudhi #endif // PERSISTENCE_INTERVALS_H_