diff options
author | pdlotko <pdlotko@636b058d-ea47-450e-bf9e-a15bfbe3eedb> | 2016-12-02 09:56:24 +0000 |
---|---|---|
committer | pdlotko <pdlotko@636b058d-ea47-450e-bf9e-a15bfbe3eedb> | 2016-12-02 09:56:24 +0000 |
commit | d76207c6fab90c7a55a5851a9da03e617ad66883 (patch) | |
tree | aea2e0ff70c26cedb99f5d29c2d21c8642b8958a /src/Gudhi_stat/include/gudhi/fill_in_missing_data.h | |
parent | 05c4c86f272bfa4f50a512f3db112557e81eb30d (diff) |
Adding a (first version of a) procedure that allows filling in missing data. The process is based on linear approximation for agiven representation of persistence.
git-svn-id: svn+ssh://scm.gforge.inria.fr/svnroot/gudhi/branches/gudhi_stat@1813 636b058d-ea47-450e-bf9e-a15bfbe3eedb
Former-commit-id: d86cb3e540e67ea254aa067c7a4490bea4ad855b
Diffstat (limited to 'src/Gudhi_stat/include/gudhi/fill_in_missing_data.h')
-rw-r--r-- | src/Gudhi_stat/include/gudhi/fill_in_missing_data.h | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/src/Gudhi_stat/include/gudhi/fill_in_missing_data.h b/src/Gudhi_stat/include/gudhi/fill_in_missing_data.h new file mode 100644 index 00000000..eb948438 --- /dev/null +++ b/src/Gudhi_stat/include/gudhi/fill_in_missing_data.h @@ -0,0 +1,136 @@ +/* This file is part of the Gudhi Library. The Gudhi library + * (Geometric Understanding in Higher Dimensions) is a generic C++ + * library for computational topology. + * + * Author(s): Pawel Dlotko + * + * Copyright (C) 2015 INRIA (France) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef FILL_IN_MISSING_DATA_H +#define FILL_IN_MISSING_DATA_H + +/** + * Quite often in biological sciences we are facing a problem of missing data. We may have for instance a number of sequences of observations made in between times A and B in a discrete + * collection of times A = t1, t2,...,tn = B. But quite typically some of the observations may be missing. Then quite often it is hard to estimate the values in the missing times. + * The procedure below assumes that we compute some topological descriptor of the observations we are given. Most typically this will be some type of persistence homology representation. + * The the values in the missing points are filled in by linear interpolation and extrapolation. The procedure below have minimal requirements: it assumes that we have two elements that + * filled in. The rest will be filled in by using them. + * The fill-in process is done based on the idea of linear approximation. Let us assume that we have two positions A and B which are filled in with proper objects of a type Representation_of_topology. + * Any intermediate time step can be interpolated by taking a linear combination t*A + (1-t)*B, where t \in [0,1]. + * If the missing data are located at the beginning of at the end of vector of Representation_of_topology, then we use the following extrapolation scheme: + * First we make sure that there are no missing data except at the very beginning and at the very end of a vector of data. Then, we pick two constitutive closest filled-in data A and B and we extrapolate + * by using a formula: t*A-(t-1)*B, where t is a natural number > 1. + * Note that both vector of data and the vector is_the_position_filled_in are modified by this procedure. Upon successful termination of the procedure, the vector is_the_position_filled_in has only 'true' entries, + * and the vector data do not have missing data. +**/ + +template < typename Representation_of_topology > +void fill_in_missing_data( std::vector< Representation_of_topology* >& data , std::vector< bool >& is_the_position_filled_in ) +{ + bool dbg = false; + + //first check if at least two positions are filled in: + size_t number_of_positions_that_are_filled_in = 0; + for ( size_t i = 0 ; i != is_the_position_filled_in.size() ; ++i ) + { + if ( is_the_position_filled_in[i] )++number_of_positions_that_are_filled_in; + } + + if ( number_of_positions_that_are_filled_in < 2 ) + { + std::cerr << "There are too few positions filled in to do extrapolation / interpolation. The program will now terminate.\n"; + throw "There are too few positions filled in to do extrapolation / interpolation. The program will now terminate.\n"; + } + + for ( size_t i = 0 ; i != data.size() ; ++i ) + { + if ( !is_the_position_filled_in[i] ) + { + //This position is not filled in. Find the next position which is nonzero. + size_t j = 1; + while ( (is_the_position_filled_in[i+j] == false) && ( i+j != data.size() ) )++j; + if ( dbg ) + { + std::cout << "The position number : " << i << " is not filled in. The next filled-in position is : " << i+j << endl; + } + if ( i != 0 ) + { + //this is not the first position of the data: + if ( i + j != data.size() ) + { + //this is not the last position of the data either + for ( size_t k = 0 ; k != j ; ++k ) + { + double weight1 = double(j-k)/(double)(j+1); + double weight2 = double(k+1)/(double)(j+1); + data[i+k] = new Representation_of_topology(weight1*(*data[i-1]) + weight2*(*data[i+j])); + is_the_position_filled_in[i+k] = true; + if ( dbg ) + { + cerr << "We fill in a position : " << i+k << " with: position " << i-1 << " with weight " << weight1 << " and position " << i+j << " with weight " << weight2 << endl; + } + } + } + } + else + { + //this is the first position of the data, i.e. i == 0. + while ( is_the_position_filled_in[i] == 0 )++i; + } + + } + } + + + if ( is_the_position_filled_in[0] == false ) + { + //find the first nonzero (then, we know that the second one will be nonzero too, since we filled it in above): + size_t i = 0; + while ( is_the_position_filled_in[i] == false )++i; + //the data at position i is declared. Since, we made sure that all other positions, except maybe a few first and a few last, are declared too. Therefore, if we find a + //first declared, then the next one will be declared too. So, we can do a telescopic declaration backward and forward. + + for ( size_t j = i ; j != 0 ; --j ) + { + data[j-1] = new Representation_of_topology(2*(*data[j]) + (-1)*(*data[j+1])); + is_the_position_filled_in[j-1] = true; + if ( dbg ) + { + cerr << "Filling in a position : " << j-1 << " by using: " << j << " and " << j+1 <<endl; + } + } + } + + if ( is_the_position_filled_in[ is_the_position_filled_in.size()-1 ] == false ) + { + //first the first nonzero (then, we know that the second one will be nonzero too): + size_t i = is_the_position_filled_in.size()-1; + while ( is_the_position_filled_in[i] == false )--i; + + for ( size_t j = i+1 ; j != data.size() ; ++j ) + { + data[j] = new Representation_of_topology(2*(*data[j-1]) + (-1)*(*data[j-2])); + is_the_position_filled_in[j] = true; + if ( dbg ) + { + cerr << "Filling in a position : " << j << " by using: " << j-1 << " and " << j-2 <<endl; + } + } + } +}//fill_in_missing_data + +#endif |