summaryrefslogtreecommitdiff
path: root/src/Gudhi_stat/include/gudhi/bootstrap.h
blob: 83edb2e1f8230efc4b7ccd82b6dd714a385957a7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/*    This file is part of the Gudhi Library. The Gudhi library
 *    (Geometric Understanding in Higher Dimensions) is a generic C++
 *    library for computational topology.
 *
 *    Author(s):       Pawel Dlotko
 *
 *    Copyright (C) 2015  INRIA (France)
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef BOOTSTRAP_H
#define BOOTSTRAP_H


#ifdef GUDHI_USE_TBB
#include <tbb/parallel_sort.h>
#endif

#include <vector>
#include <algorithm>
#include <omp.h>
#include <random>
#include <ctime>

namespace Gudhi
{
namespace Gudhi_stat
{

/**
* This is a generic function to perform bootstrap.
* In this function we assume that there is a class to compute characteristic of collection of points (PointCloudCharacteristics) and that it stores coordinates of all points. It allows to compute the characteristic
* of the whole point cloud (by using CharacteristicFunction) or of it proper subset of the whole point cloud (given the list of numers of points in the subset). 
* Both functionalities will be used in this implementation.
* The characteristic of point cloud, may be the poit cloud itself, its persistence diagram in a fixed dimension, or anything else. We only assume that space of points characteristics is a metric space
* and that we can compute a distance between two characteristics of collections of points by using DistanceBetweenPointsCharacteristics function. 
**/


template < typename PointCloudCharacteristics , typename CharacteristicFunction , typename DistanceBetweenPointsCharacteristics >
double bootstrap( size_t number_of_points , CharacteristicFunction f , DistanceBetweenPointsCharacteristics distance , size_t number_of_repetitions , size_t size_of_subsample , double quantile = 0.95 )
{
	bool dbg = false;
	
	if ( size_of_subsample >= number_of_points )
	{
		std::cerr << "Size of subsample is greater or equal to the number of points. The bootstrap procedure do not make sense in this case. \n";
		return 0;
	}
	
	//initialization of a random number generator:
    std::srand ( unsigned ( std::time(0) ) );
	
	//we will shuffle the vector of numbers 0,1,2,...,points.size()-1 in order to pick a subset of a size size_of_subsample
	std::vector< size_t > numbers_to_sample( number_of_points );	
	for ( size_t i = 0 ; i != number_of_points ; ++i )
	{			
		numbers_to_sample[i] = i;
	}	
	
	//now we compute the characteristic od all the points:	
	PointCloudCharacteristics characteristic_of_all_points = f( numbers_to_sample );
	
	//vector to keep the distances between characteristic_of_points and characteristic_of_subsample:
	std::vector< double > vector_of_distances( number_of_repetitions , 0 );
	


	#ifdef GUDHI_USE_TBB
    tbb::parallel_for ( tbb::blocked_range<size_t>(0, number_of_repetitions), [&](const tbb::blocked_range<size_t>& range) 
    {
    for  ( size_t it_no = range.begin() ;  it_no != range.end() ; ++it_no )
	#else
	for ( size_t it_no = 0 ;  it_no < number_of_repetitions ; ++it_no )
	#endif	
	{
		if ( dbg )
		{
			std::cout << "Still : " << number_of_repetitions-it_no << " tests to go. \n The subsampled vector consist of points number : ";
		}
		//do a random shuffle of vector_of_characteristics_of_poits
		std::random_shuffle( numbers_to_sample.begin() , numbers_to_sample.end() );
		
		//construct a vector< PointType > of a size size_of_subsample:
		std::vector< size_t > subsampled_points;
		subsampled_points.reserve( size_of_subsample );
		for ( size_t i = 0 ; i != size_of_subsample ; ++i )
		{
			subsampled_points.push_back( numbers_to_sample[i] );
			if ( dbg )std::cout << numbers_to_sample[i] << " , ";
		}
		
		
		//now we can compute characteristic of subsampled_points:
		PointCloudCharacteristics characteristic_of_subsampled_points = f( subsampled_points );
		if ( dbg )std::cout << std::endl << "Characteristic of subsampled points computed.\n";					
		
		//and now we compute distance between characteristic_of_points and characteristic_of_subsample. Note that subsampled points go first, and this is neded, since sometimes all points are not needed.
		double dist = distance( characteristic_of_subsampled_points , characteristic_of_all_points );
		
		if ( dbg ) std::cout << "The distance between characteristic of all points and the characteristic of subsample is : " << dist << std::endl;
		
		vector_of_distances[it_no] = dist;		
	}
	#ifdef GUDHI_USE_TBB
    }
    );
	#endif
		
	size_t position_of_quantile = floor(quantile*vector_of_distances.size());
	if ( position_of_quantile ) --position_of_quantile;
	if ( dbg )
	{
		std::cout << "position_of_quantile : " << position_of_quantile << ", and here is the array : " << std::endl;
		for ( size_t i = 0 ; i != vector_of_distances.size() ; ++i )
		{
			std::cout << vector_of_distances[i] << std::endl;
		}
		std::cout << std::endl;
	}
	  
	//now we need to sort the vector_of_distances and find the quantile:
	std::nth_element (vector_of_distances.begin(), vector_of_distances.begin()+position_of_quantile, vector_of_distances.end());
	
	if ( dbg )std::cout << "Result : " << vector_of_distances[ position_of_quantile ] << std::endl;
	
	return vector_of_distances[ position_of_quantile ];
	
}//bootstrap



}//namespace Gudhi_stat
}//namespace Gudhi

#endif