Adding a (first version of a) procedure that allows filling in missing data. The process is based on linear approximation for agiven representation of persistence.

git-svn-id: svn+ssh://scm.gforge.inria.fr/svnroot/gudhi/branches/gudhi_stat@1813 636b058d-ea47-450e-bf9e-a15bfbe3eedb Former-commit-id: d86cb3e540e67ea254aa067c7a4490bea4ad855b
author: pdlotko <pdlotko@636b058d-ea47-450e-bf9e-a15bfbe3eedb> 2016-12-02 09:56:24 +0000
committer: pdlotko <pdlotko@636b058d-ea47-450e-bf9e-a15bfbe3eedb> 2016-12-02 09:56:24 +0000
commit: d76207c6fab90c7a55a5851a9da03e617ad66883 (patch)
tree: aea2e0ff70c26cedb99f5d29c2d21c8642b8958a
parent: 05c4c86f272bfa4f50a512f3db112557e81eb30d (diff)
3 files changed, 153 insertions, 11 deletions
diff --git a/src/Gudhi_stat/include/gudhi/fill_in_missing_data.h b/src/Gudhi_stat/include/gudhi/fill_in_missing_data.h
new file mode 100644
index 00000000..eb948438
--- /dev/null
+++ b/src/Gudhi_stat/include/gudhi/fill_in_missing_data.h
@@ -0,0 +1,136 @@
+/*    This file is part of the Gudhi Library. The Gudhi library
+ *    (Geometric Understanding in Higher Dimensions) is a generic C++
+ *    library for computational topology.
+ *
+ *    Author(s):       Pawel Dlotko
+ *
+ *    Copyright (C) 2015  INRIA (France)
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation, either version 3 of the License, or
+ *    (at your option) any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef FILL_IN_MISSING_DATA_H
+#define FILL_IN_MISSING_DATA_H
+
+/**
+ * Quite often in biological sciences we are facing a problem of missing data. We may have for instance a number of sequences of observations made in between times A and B in a discrete 
+ * collection of times A = t1, t2,...,tn = B. But quite typically some of the observations may be missing. Then quite often it is hard to estimate the values in the missing times. 
+ * The procedure below assumes that we compute some topological descriptor of the observations we are given. Most typically this will be some type of persistence homology representation.
+ * The the values in the missing points are filled in by linear interpolation and extrapolation. The procedure below have minimal requirements: it assumes that we have two elements that 
+ * filled in. The rest will be filled in by using them.
+ * The fill-in process is done based on the idea of linear approximation. Let us assume that we have two positions A and B which are filled in with proper objects of a type Representation_of_topology.
+ * Any intermediate time step can be interpolated by taking a linear combination t*A + (1-t)*B, where t \in [0,1]. 
+ * If the missing data are located at the beginning of at the end of vector of Representation_of_topology, then we use the following extrapolation scheme:
+ * First we make sure that there are no missing data except at the very beginning and at the very end of a vector of data. Then, we pick two constitutive closest filled-in data A and B and we extrapolate
+ * by using a formula: t*A-(t-1)*B, where t is a natural number > 1. 
+ * Note that both vector of data and the vector is_the_position_filled_in are modified by this procedure. Upon successful termination of the procedure, the vector is_the_position_filled_in has only 'true' entries,
+ * and the vector data do not have missing data. 
+**/  
+
+template < typename Representation_of_topology >
+void fill_in_missing_data( std::vector< Representation_of_topology* >& data , std::vector< bool >& is_the_position_filled_in )
+{
+	bool dbg = false;
+	
+	//first check if at least two positions are filled in:
+	size_t number_of_positions_that_are_filled_in = 0;
+	for ( size_t i = 0 ; i != is_the_position_filled_in.size() ; ++i )
+	{
+		if ( is_the_position_filled_in[i] )++number_of_positions_that_are_filled_in;
+	}
+	
+	if ( number_of_positions_that_are_filled_in < 2 )
+	{
+		std::cerr << "There are too few positions filled in to do extrapolation / interpolation. The program will now terminate.\n";
+		throw "There are too few positions filled in to do extrapolation / interpolation. The program will now terminate.\n";
+	}
+	
+	for ( size_t i = 0 ; i != data.size() ; ++i )
+	{
+		if ( !is_the_position_filled_in[i] )
+		{				
+			//This position is not filled in. Find the next position which is nonzero.
+			size_t j = 1;
+			while ( (is_the_position_filled_in[i+j] == false) && ( i+j != data.size() ) )++j;
+			if ( dbg )
+			{
+				std::cout << "The position number : " << i << " is not filled in. The next filled-in position is : " << i+j << endl;
+			}
+			if ( i != 0 )
+			{
+				//this is not the first position of the data:
+				if ( i + j != data.size() )
+				{
+					//this is not the last position of the data either
+					for ( size_t k = 0 ; k != j ; ++k )
+					{
+						double weight1 = double(j-k)/(double)(j+1);
+						double weight2 = double(k+1)/(double)(j+1);							
+						data[i+k] = new Representation_of_topology(weight1*(*data[i-1]) + weight2*(*data[i+j]));
+						is_the_position_filled_in[i+k] = true;
+						if ( dbg )
+						{
+							cerr << "We fill in a position : " << i+k << " with: position " << i-1 << " with weight " << weight1 << " and position " << i+j << " with weight " << weight2 << endl;
+						}
+					}
+				}					
+			}
+			else
+			{
+				//this is the first position of the data, i.e. i == 0.
+				while ( is_the_position_filled_in[i] == 0 )++i;
+			}
+			
+		}
+	}					
+		
+	
+	if ( is_the_position_filled_in[0] == false )
+	{
+		//find the first nonzero (then, we know that the second one will be nonzero too, since we filled it in above):
+		size_t i = 0;
+		while ( is_the_position_filled_in[i] == false )++i;
+		//the data at position i is declared. Since, we made sure that all other positions, except maybe a few first and a few last, are declared too. Therefore, if we find a 
+		//first declared, then the next one will be declared too. So, we can do a telescopic declaration backward and forward.
+				
+		for ( size_t j = i ; j != 0 ; --j )
+		{
+			data[j-1] = new Representation_of_topology(2*(*data[j]) + (-1)*(*data[j+1]));
+			is_the_position_filled_in[j-1] = true;
+			if ( dbg )
+			{
+				cerr << "Filling in a position : " << j-1 << " by using: " << j << " and " << j+1 <<endl;
+			}
+		}
+	}
+	
+	if ( is_the_position_filled_in[ is_the_position_filled_in.size()-1 ] == false )
+	{
+		//first the first nonzero (then, we know that the second one will be nonzero too):
+		size_t i = is_the_position_filled_in.size()-1;
+		while ( is_the_position_filled_in[i] == false )--i;
+		
+		for ( size_t j = i+1 ; j != data.size() ; ++j )
+		{
+			data[j] = new Representation_of_topology(2*(*data[j-1]) + (-1)*(*data[j-2]));
+			is_the_position_filled_in[j] = true;
+			if ( dbg )
+			{
+				cerr << "Filling in a position : " << j << " by using: " << j-1 << " and " << j-2 <<endl;
+			}
+		}
+	}
+}//fill_in_missing_data
+
+#endif
diff --git a/src/Gudhi_stat/include/gudhi/multiplicative_bootstrap.h b/src/Gudhi_stat/include/gudhi/multiplicative_bootstrap.h
index fc7a8091..daac8f2a 100644
--- a/src/Gudhi_stat/include/gudhi/multiplicative_bootstrap.h
+++ b/src/Gudhi_stat/include/gudhi/multiplicative_bootstrap.h
@@ -56,7 +56,7 @@ public:
 	double operator()( const TopologicalObject& obj )const
 	{
 		TopologicalObject* empty = new TopologicalObject;	
-		double dist = empty->distance( obj );   
+		double dist = empty->distance( obj , -1 );   
 		delete empty;
 		return dist;
 	}
@@ -67,8 +67,6 @@ public:
 /**
 * This is a generic function to perform multiplicative bootstrap.
 **/
-
-
 template < typename TopologicalObject , typename OperationOnObjects , typename NormOnObjects >
 double multiplicative_bootstrap( const std::vector< TopologicalObject* >& topological_objects , size_t number_of_bootstrap_operations , const OperationOnObjects& oper , const NormOnObjects& norm ,  double quantile = 0.95 )
 {
@@ -99,14 +97,19 @@ double multiplicative_bootstrap( const std::vector< TopologicalObject* >& topolo
 		{
 			std::cout << "Still : " << number_of_bootstrap_operations-it_no << " tests to go. \n The subsampled vector consist of points number : ";
 		}				
+		
+		
 		//and compute the intermediate characteristic:		
 		TopologicalObject result;
 		for ( size_t i = 0 ; i != topological_objects.size() ; ++i )
-		{
-			double rand_variable = norm_distribution( generator );				
-			result += rand_variable*oper(*(topological_objects[i]) , average);
+		{				
+			double rand_variable = norm_distribution( generator );
+			result = result + rand_variable*oper(*(topological_objects[i]) , average);
 		}
-		result = result*(1.0/sqrt( topological_objects.size() ));									
+		//HERE THE NORM SEEMS TO BE MISSING!!
+		result = result.abs();
+		result = result*(1.0/sqrt( topological_objects.size() ));											
+		//NEED TO TAKE MAX
 		vector_of_intermediate_characteristics[it_no] = norm(result);		
 	}
 	#ifdef GUDHI_USE_TBB
diff --git a/src/Gudhi_stat/include/gudhi/permutation_test.h b/src/Gudhi_stat/include/gudhi/permutation_test.h
index ac3a7f53..7f8f7552 100644
--- a/src/Gudhi_stat/include/gudhi/permutation_test.h
+++ b/src/Gudhi_stat/include/gudhi/permutation_test.h
@@ -41,7 +41,7 @@ namespace Gudhi_stat
 template <typename Representation_of_persistence>
 double permutation_test( const std::vector<Representation_of_persistence*>& data_1 , const std::vector<Representation_of_persistence*>& data_2 , size_t number_of_permutations , double exponent = 1 )
 {	
-	bool dbg = true;
+	bool dbg = false;
 	try
 	{
 		Representation_of_persistence av_1;// = new Representation_of_persistence;		
@@ -53,6 +53,8 @@ double permutation_test( const std::vector<Representation_of_persistence*>& data
 		
 		double counter = 0;
 		
+		
+		//TODO -- TBB. QUestion, why it uses memory. Where is the leak??
 		for  ( size_t i = 0 ; i != number_of_permutations ; ++i )
 		{					
 			std::vector<Representation_of_persistence*> all_data;
@@ -65,10 +67,11 @@ double permutation_test( const std::vector<Representation_of_persistence*>& data
 		
 		
 			Representation_of_persistence av_1;// = new Representation_of_persistence;		
-			Representation_of_persistence av_2;// = new Representation_of_persistence;		
+			Representation_of_persistence av_2;// = new Representation_of_persistence;	
+				
 			av_1.compute_average( first_part );					
 			av_2.compute_average( second_part );			
-			double distance_after_permutations = av_1.distance( av_2 , exponent );
+			double distance_after_permutations = av_1.distance( av_2 , exponent );			
 			//delete av_1;
 			//delete av_2;
 			if ( distance_after_permutations > initial_distance )++counter;
@@ -78,7 +81,7 @@ double permutation_test( const std::vector<Representation_of_persistence*>& data
 				std::cerr << "distance_after_permutations : " << distance_after_permutations << std::endl;
 				std::cerr << "initial_distance : " << initial_distance << std::endl;
 				std::cerr << "counter : " << counter << std::endl;
-			}
+			}			
 		}
 		return counter / (double)number_of_permutations;
 	}
author	pdlotko <pdlotko@636b058d-ea47-450e-bf9e-a15bfbe3eedb>	2016-12-02 09:56:24 +0000
committer	pdlotko <pdlotko@636b058d-ea47-450e-bf9e-a15bfbe3eedb>	2016-12-02 09:56:24 +0000
commit	d76207c6fab90c7a55a5851a9da03e617ad66883 (patch)
tree	aea2e0ff70c26cedb99f5d29c2d21c8642b8958a
parent	05c4c86f272bfa4f50a512f3db112557e81eb30d (diff)