src/Subsampling/include/gudhi/choose_n_farthest_points.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149

/*    This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
 *    See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
 *    Author(s):       Siargey Kachanovich
 *
 *    Copyright (C) 2016 Inria
 *
 *    Modification(s):
 *      - YYYY/MM Author: Description of the modification
 */

#ifndef CHOOSE_N_FARTHEST_POINTS_H_
#define CHOOSE_N_FARTHEST_POINTS_H_

#include <boost/range.hpp>

#include <gudhi/Null_output_iterator.h>

#include <iterator>
#include <vector>
#include <random>
#include <limits>  // for numeric_limits<>

namespace Gudhi {

namespace subsampling {

/**
 *  \ingroup subsampling
 */
enum : std::size_t {
/**
 *  Argument for `choose_n_farthest_points` to indicate that the starting point should be picked randomly.
 */
  random_starting_point = std::size_t(-1)
};

/** 
 *  \ingroup subsampling
 *  \brief Subsample by a greedy strategy of iteratively adding the farthest point from the
 *  current chosen point set to the subsampling. 
 *  \details
 *  The iteration starts with the landmark `starting point` or, if `starting point==random_starting_point`,
 *  with a random landmark.
 *  It chooses `final_size` points from a random access range
 *  `input_pts` (or the number of input points if `final_size` is larger)
 *  and outputs them in the output iterator `output_it`. It also
 *  outputs the distance from each of those points to the set of previous
 *  points in `dist_it`.
 *  \tparam Distance must provide an operator() that takes 2 points (value type of the range)
 *  and returns their distance (or some more general proximity measure) as a `double`.
 *  \tparam Point_range Random access range of points.
 *  \tparam PointOutputIterator Output iterator whose value type is the point type.
 *  \tparam DistanceOutputIterator Output iterator for distances.
 * @param[in] dist A distance function.
 * @param[in] input_pts The input points.
 * @param[in] final_size The size of the subsample to compute.
 * @param[in] starting_point The seed in the farthest point algorithm.
 * @param[out] output_it The output iterator for points.
 * @param[out] dist_it The optional output iterator for distances.
 *
 * \warning Older versions of this function took a CGAL kernel as argument. Users need to replace `k` with
 * `k.squared_distance_d_object()` in the first argument of every call to `choose_n_farthest_points`.
 *  
 */
template < typename Distance,
typename Point_range,
typename PointOutputIterator,
typename DistanceOutputIterator = Null_output_iterator>
void choose_n_farthest_points(Distance dist,
                              Point_range const &input_pts,
                              std::size_t final_size,
                              std::size_t starting_point,
                              PointOutputIterator output_it,
                              DistanceOutputIterator dist_it = {}) {
  std::size_t nb_points = boost::size(input_pts);
  if (final_size > nb_points)
    final_size = nb_points;

  // Tests to the limit
  if (final_size < 1)
    return;

  if (starting_point == random_starting_point) {
    // Choose randomly the first landmark
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<std::size_t> dis(0, nb_points - 1);
    starting_point = dis(gen);
  }

  // FIXME: don't hard-code the type as double. For Epeck_d, we also want to handle types that do not have an infinity.
  static_assert(std::numeric_limits<double>::has_infinity, "the number type needs to support infinity()");

  *output_it++ = input_pts[starting_point];
  *dist_it++ = std::numeric_limits<double>::infinity();
  if (final_size == 1) return;

  std::vector<std::size_t> points(nb_points);  // map from remaining points to indexes in input_pts
  std::vector< double > dist_to_L(nb_points);  // vector of current distances to L from points
  for(std::size_t i = 0; i < nb_points; ++i) {
    points[i] = i;
    dist_to_L[i] = dist(input_pts[i], input_pts[starting_point]);
  }
  // The indirection through points makes the program a bit slower. Some alternatives:
  // - the original code never removed points and counted on them not
  //   reappearing because of a self-distance of 0. This causes unnecessary
  //   computations when final_size is large. It also causes trouble if there are
  //   input points at distance 0 from each other.
  // - copy input_pts and update the local copy when removing points.

  std::size_t curr_max_w = starting_point;

  for (std::size_t current_number_of_landmarks = 1; current_number_of_landmarks != final_size; current_number_of_landmarks++) {
    std::size_t latest_landmark = points[curr_max_w];
    // To remove the latest landmark at index curr_max_w, replace it
    // with the last point and reduce the length of the vector.
    std::size_t last = points.size() - 1;
    if (curr_max_w != last) {
      points[curr_max_w] = points[last];
      dist_to_L[curr_max_w] = dist_to_L[last];
    }
    points.pop_back();

    // Update distances to L.
    std::size_t i = 0;
    for (auto p : points) {
      double curr_dist = dist(input_pts[p], input_pts[latest_landmark]);
      if (curr_dist < dist_to_L[i])
        dist_to_L[i] = curr_dist;
      ++i;
    }
    // choose the next landmark
    curr_max_w = 0;
    double curr_max_dist = dist_to_L[curr_max_w];  // used for defining the furthest point from L
    for (i = 1; i < points.size(); i++)
      if (dist_to_L[i] > curr_max_dist) {
        curr_max_dist = dist_to_L[i];
        curr_max_w = i;
      }
    *output_it++ = input_pts[points[curr_max_w]];
    *dist_it++ = dist_to_L[curr_max_w];
  }
}

}  // namespace subsampling

}  // namespace Gudhi

#endif  // CHOOSE_N_FARTHEST_POINTS_H_