From e91f1b9d6987dda58498b8386eba191436b6f6d2 Mon Sep 17 00:00:00 2001 From: Gard Spreemann Date: Thu, 22 Sep 2016 11:01:59 +0200 Subject: * Flexible output, allow writing DIPHA persistence diagrams (when not writing to stdout). * Endian-safe reading and writing (for moving towards a Debian package). For convenience and portability, endian-conversions use Boost. If you are building on a little-endian machine without Boost, you can set the FORCE_NATIVE_ENDIANNESS preprocessor flag to avoid the Boost dependency. --- ripser.cpp | 215 ++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 162 insertions(+), 53 deletions(-) diff --git a/ripser.cpp b/ripser.cpp index 3d4eb29..b0257c8 100644 --- a/ripser.cpp +++ b/ripser.cpp @@ -37,6 +37,10 @@ with this program. If not, see . #include #include +#ifndef FORCE_NATIVE_ENDIANNESS +#include +#endif + #ifdef USE_GOOGLE_HASHMAP #include template class hash_map : public google::sparse_hash_map { @@ -47,12 +51,19 @@ public: template class hash_map : public std::unordered_map {}; #endif +static const int64_t DIPHA_MAGIC = 8067171840; +static const int64_t DIPHA_TYPE_DISTANCE_MATRIX = 7; +static const int64_t DIPHA_TYPE_PERSISTENCE_DIAGRAM = 2; + typedef float value_t; // typedef uint16_t value_t; typedef int64_t index_t; typedef int16_t coefficient_t; +enum input_file_format { LOWER_DISTANCE_MATRIX, UPPER_DISTANCE_MATRIX, DISTANCE_MATRIX, POINT_CLOUD, DIPHA_DISTANCE_MATRIX }; +enum output_file_format { TEXT_PD, DIPHA_PD }; + class binomial_coeff_table { std::vector> B; index_t n_max, k_max; @@ -98,6 +109,22 @@ std::vector multiplicative_inverse_vector(const coefficient_t m) return inverse; } +template T read(std::istream& s) { + T result; + s.read(reinterpret_cast(&result), sizeof(T)); + #ifndef FORCE_NATIVE_ENDIANNESS + boost::endian::little_to_native_inplace(result); + #endif + return result; +} + +template void write(std::ostream& s, T value) { + #ifndef FORCE_NATIVE_ENDIANNESS + boost::endian::native_to_little_inplace(value); + #endif + s.write(reinterpret_cast(&value), sizeof(T)); +} + template OutputIterator get_simplex_vertices(index_t idx, const index_t dim, index_t n, const binomial_coeff_table& binomial_coeff, OutputIterator out) { @@ -504,11 +531,12 @@ void compute_pairs(std::vector& columns_to_reduce, hash_map& multiplicative_inverse, - const binomial_coeff_table& binomial_coeff) { + const binomial_coeff_table& binomial_coeff, + std::ostream& output_stream, output_file_format oformat, int64_t & pair_count) { -#ifdef PRINT_PERSISTENCE_PAIRS - std::cout << "persistence intervals in dim " << dim << ":" << std::endl; -#endif + if (oformat == TEXT_PD) { + output_stream << "persistence intervals in dim " << dim << ":" << std::endl; + } #ifdef ASSEMBLE_REDUCTION_MATRIX compressed_sparse_matrix reduction_matrix; @@ -629,25 +657,35 @@ void compute_pairs(std::vector& columns_to_reduce, hash_map(output_stream, -dim - 1); // Essential class in dimension dim + write(output_stream, diameter); + write(output_stream, NAN); // Undefined/unused for essential classes. + pair_count++; + } break; } found_persistence_pair: -#ifdef PRINT_PERSISTENCE_PAIRS value_t death = get_diameter(pivot); if (diameter != death) { #ifdef INDICATE_PROGRESS std::cout << "\033[K"; #endif - std::cout << " [" << diameter << "," << death << ")" << std::endl << std::flush; + if (oformat == TEXT_PD) { + output_stream << " [" << diameter << "," << death << ")" << std::endl << std::flush; + } else if (oformat == DIPHA_PD) { + write(output_stream, dim); + write(output_stream, diameter); + write(output_stream, death); + pair_count++; + } } -#endif pivot_column_index.insert(std::make_pair(get_index(pivot), i)); @@ -686,14 +724,6 @@ void compute_pairs(std::vector& columns_to_reduce, hash_map T read(std::istream& s) { - T result; - s.read(reinterpret_cast(&result), sizeof(T)); - return result; // on little endian: boost::endian::little_to_native(result); -} - compressed_lower_distance_matrix read_point_cloud(std::istream& input_stream) { std::vector> points; @@ -763,13 +793,13 @@ compressed_lower_distance_matrix read_distance_matrix(std::istream& input_stream } compressed_lower_distance_matrix read_dipha(std::istream& input_stream) { - if (read(input_stream) != 8067171840) { - std::cerr << "input is not a Dipha file (magic number: 8067171840)" << std::endl; + if (read(input_stream) != DIPHA_MAGIC) { + std::cerr << "input is not a Dipha file (magic number: " << DIPHA_MAGIC << ")" << std::endl; exit(-1); } - if (read(input_stream) != 7) { - std::cerr << "input is not a Dipha distance matrix (file type: 7)" << std::endl; + if (read(input_stream) != DIPHA_TYPE_DISTANCE_MATRIX) { + std::cerr << "input is not a Dipha distance matrix (file type: " << DIPHA_TYPE_DISTANCE_MATRIX << ")" << std::endl; exit(-1); } @@ -787,7 +817,7 @@ compressed_lower_distance_matrix read_dipha(std::istream& input_stream) { return compressed_lower_distance_matrix(std::move(distances)); } -compressed_lower_distance_matrix read_file(std::istream& input_stream, file_format format) { +compressed_lower_distance_matrix read_file(std::istream& input_stream, input_file_format format) { switch (format) { case LOWER_DISTANCE_MATRIX: return read_lower_distance_matrix(input_stream); @@ -797,7 +827,7 @@ compressed_lower_distance_matrix read_file(std::istream& input_stream, file_form return read_distance_matrix(input_stream); case POINT_CLOUD: return read_point_cloud(input_stream); - case DIPHA: + case DIPHA_DISTANCE_MATRIX: return read_dipha(input_stream); } } @@ -805,17 +835,20 @@ compressed_lower_distance_matrix read_file(std::istream& input_stream, file_form void print_usage_and_exit(int exit_code) { std::cerr << "Usage: " << "ripser " - << "[options] [filename]" << std::endl + << "[options] [input filename] [output filename]" << std::endl << std::endl << "Options:" << std::endl << std::endl << " --help print this screen" << std::endl - << " --format use the specified file format for the input. Options are:" << std::endl + << " --iformat use the specified file format for the input. Options are:" << std::endl << " lower-distance (lower triangular distance matrix; default)" << std::endl << " upper-distance (upper triangular distance matrix)" << std::endl << " distance (full distance matrix)" << std::endl << " point-cloud (point cloud in Euclidean space)" << std::endl << " dipha (distance matrix in DIPHA file format)" << std::endl + << " --oformat use the specified file format for the output. Options are:" << std::endl + << " text (plain text output; default)" << std::endl + << " dipha (barcode in DIPHA file format)" << std::endl << " --dim compute persistent homology up to dimension " << std::endl << " --threshold compute Rips complexes up to diameter " << std::endl #ifdef USE_COEFFICIENTS @@ -828,9 +861,11 @@ void print_usage_and_exit(int exit_code) { int main(int argc, char** argv) { - const char* filename = nullptr; + const char* ifilename = nullptr; + const char* ofilename = nullptr; - file_format format = DISTANCE_MATRIX; + input_file_format iformat = DISTANCE_MATRIX; + output_file_format oformat = TEXT_PD; index_t dim_max = 1; value_t threshold = std::numeric_limits::max(); @@ -855,18 +890,26 @@ int main(int argc, char** argv) { size_t next_pos; threshold = std::stof(parameter, &next_pos); if (next_pos != parameter.size()) print_usage_and_exit(-1); - } else if (arg == "--format") { + } else if (arg == "--iformat") { std::string parameter = std::string(argv[++i]); if (parameter == "lower-distance") - format = LOWER_DISTANCE_MATRIX; + iformat = LOWER_DISTANCE_MATRIX; else if (parameter == "upper-distance") - format = UPPER_DISTANCE_MATRIX; + iformat = UPPER_DISTANCE_MATRIX; else if (parameter == "distance") - format = DISTANCE_MATRIX; + iformat = DISTANCE_MATRIX; else if (parameter == "point-cloud") - format = POINT_CLOUD; + iformat = POINT_CLOUD; else if (parameter == "dipha") - format = DIPHA; + iformat = DIPHA_DISTANCE_MATRIX; + else + print_usage_and_exit(-1); + } else if (arg == "--oformat") { + std::string parameter = std::string(argv[++i]); + if (parameter == "text") + oformat = TEXT_PD; + else if (parameter == "dipha") + oformat = DIPHA_PD; else print_usage_and_exit(-1); #ifdef USE_COEFFICIENTS @@ -876,19 +919,62 @@ int main(int argc, char** argv) { modulus = std::stol(parameter, &next_pos); if (next_pos != parameter.size() || !is_prime(modulus)) print_usage_and_exit(-1); #endif - } else { - if (filename) { print_usage_and_exit(-1); } - filename = argv[i]; + } else if (i == argc - 2) { + ifilename = argv[i]; + } + else if (i == argc - 1) { + ofilename = argv[i]; } + else { + print_usage_and_exit(-1); + } + } + + if (!ifilename || !ofilename) { + std::cerr << "You need to specify input and output files." << std::endl; + print_usage_and_exit(-1); + } + + if (std::string(ifilename) == std::string("-")) + ifilename = nullptr; + if (std::string(ofilename) == std::string("-")) + ofilename = nullptr; + + std::ifstream input_file_stream(ifilename); + if (ifilename && input_file_stream.fail()) { + std::cerr << "couldn't open file " << ifilename << std::endl; + exit(-1); } - std::ifstream file_stream(filename); - if (filename && file_stream.fail()) { - std::cerr << "couldn't open file " << filename << std::endl; + if (!ofilename && oformat == DIPHA_PD) { + std::cerr << "Warning: Will not write a DIPHA persistence diagram to stdout. Changing output format to text." << std::endl; + oformat = TEXT_PD; + } + + std::ofstream output_file_stream; + if (oformat == TEXT_PD) { + output_file_stream.open(ofilename, std::ios::out); + } + else if (oformat == DIPHA_PD) { + // The in/out opening mode is so that we can seek back to the third third entry in the file and write the pair count at the very end. + output_file_stream.open(ofilename, std::ios::out | std::ios::in | std::ios::trunc | std::ios::binary); + } + + if (ofilename && output_file_stream.fail()) { + std::cerr << "couldn't open file " << ofilename << std::endl; exit(-1); } - compressed_lower_distance_matrix dist = read_file(filename ? file_stream : std::cin, format); + std::ostream & output_stream = ofilename ? output_file_stream : std::cout; + + if (oformat == DIPHA_PD) { + write(output_stream, DIPHA_MAGIC); + write(output_stream, DIPHA_TYPE_PERSISTENCE_DIAGRAM); + write(output_stream, DIPHA_MAGIC); // This is to later be replaced by the number of pairs in the persistence diagram. This is the reason we don't allow binary stdout output, since we'd have to buffer up the entire PD to know the count beforehand. In a file we can seek back and overwrite. + } + int64_t pair_count = 0; + + compressed_lower_distance_matrix dist = read_file(ifilename ? input_file_stream : std::cin, iformat); index_t n = dist.size(); @@ -914,9 +1000,9 @@ int main(int argc, char** argv) { } std::sort(edges.rbegin(), edges.rend(), greater_diameter_or_smaller_index()); -#ifdef PRINT_PERSISTENCE_PAIRS - std::cout << "persistence intervals in dim 0:" << std::endl; -#endif + if (oformat == TEXT_PD) { + output_stream << "persistence intervals in dim 0:" << std::endl; + } std::vector vertices_of_edge(2); for (auto e : edges) { @@ -925,19 +1011,33 @@ int main(int argc, char** argv) { index_t u = dset.find(vertices_of_edge[0]), v = dset.find(vertices_of_edge[1]); if (u != v) { -#ifdef PRINT_PERSISTENCE_PAIRS - std::cout << " [0," << get_diameter(e) << ")" << std::endl; -#endif + if (oformat == TEXT_PD) { + output_stream << " [0," << get_diameter(e) << ")" << std::endl; + } else if (oformat == DIPHA_PD) { + write(output_stream, 0); + write(output_stream, 0); + write(output_stream, get_diameter(e)); + pair_count++; + } dset.link(u, v); } else columns_to_reduce.push_back(e); } std::reverse(columns_to_reduce.begin(), columns_to_reduce.end()); -#ifdef PRINT_PERSISTENCE_PAIRS - for (index_t i = 0; i < n; ++i) - if (dset.find(i) == i) std::cout << " [0, )" << std::endl << std::flush; -#endif + for (index_t i = 0; i < n; ++i) + { + if (dset.find(i) == i) { + if (oformat == TEXT_PD) { + output_stream << " [0, )" << std::endl << std::flush; + } else if (oformat == DIPHA_PD) { + write(output_stream, -0 - 1); // Indicate essential class. + write(output_stream, 0); + write(output_stream, NAN); // Undefined/unused for essential classes. + pair_count++; + } + } + } } for (index_t dim = 1; dim <= dim_max; ++dim) { @@ -948,10 +1048,19 @@ int main(int argc, char** argv) { pivot_column_index.reserve(columns_to_reduce.size()); compute_pairs(columns_to_reduce, pivot_column_index, dist, comp, comp_prev, dim, n, threshold, modulus, - multiplicative_inverse, binomial_coeff); + multiplicative_inverse, binomial_coeff, output_stream, oformat, pair_count); if (dim < dim_max) { assemble_columns_to_reduce(columns_to_reduce, pivot_column_index, comp, dim, n, threshold, binomial_coeff); } } -} \ No newline at end of file + + + if (oformat == DIPHA_PD) { + output_file_stream.seekp(2*sizeof(int64_t), std::ios_base::beg); // Rewind to the spot in the DIPHA file where we write the number of intervals. + write(output_file_stream, pair_count); + } + + input_file_stream.close(); + output_file_stream.close(); +} -- cgit v1.2.3