src/python/gudhi/point_cloud/dtm.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179

# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
# Author(s):       Marc Glisse
#
# Copyright (C) 2020 Inria
#
# Modification(s):
#   - YYYY/MM Author: Description of the modification

from .knn import KNearestNeighbors
import numpy as np

__author__ = "Marc Glisse"
__copyright__ = "Copyright (C) 2020 Inria"
__license__ = "MIT"


class DistanceToMeasure:
    """
    Class to compute the distance to the empirical measure defined by a point set, as introduced in :cite:`dtm`.
    """

    def __init__(self, k, q=2, **kwargs):
        """
        Args:
            k (int): number of neighbors (possibly including the point itself).
            q (float): order used to compute the distance to measure. Defaults to 2.
            kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that
                metric="neighbors" means that :func:`transform` expects an array with the distances
                to the k nearest neighbors.
        """
        self.k = k
        self.q = q
        self.params = kwargs

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def fit(self, X, y=None):
        """
        Args:
            X (numpy.array): coordinates for mass points.
        """
        if self.params.setdefault("metric", "euclidean") != "neighbors":
            self.knn = KNearestNeighbors(
                self.k, return_index=False, return_distance=True, sort_results=False, **self.params
            )
            self.knn.fit(X)
        return self

    def transform(self, X):
        """
        Args:
            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed",
                or distances to the k nearest neighbors if metric is "neighbors" (if the array has more
                than k columns, the remaining ones are ignored).

        Returns:
            numpy.array: a 1-d array with, for each point of X, its distance to the measure defined
            by the argument of :func:`fit`.
        """
        if self.params["metric"] == "neighbors":
            distances = X[:, : self.k]
        else:
            distances = self.knn.transform(X)
        distances = distances ** self.q
        dtm = distances.sum(-1) / self.k
        dtm = dtm ** (1.0 / self.q)
        # We compute too many powers, 1/p in knn then q in dtm, 1/q in dtm then q or some log in the caller.
        # Add option to skip the final root?
        return dtm


class DTMDensity:
    """
    Density estimator based on the distance to the empirical measure defined by a point set, as defined
    in :cite:`dtmdensity`. Note that this implementation only renormalizes when asked, and the renormalization
    only works for a Euclidean metric, so in other cases the total measure may not be 1.

    .. note:: When the dimension is high, using it as an exponent can quickly lead to under- or overflows.
        We recommend using a small fixed value instead in those cases, even if it won't have the same nice
        theoretical properties as the dimension.
    """

    def __init__(self, k=None, weights=None, q=None, dim=None, normalize=False, n_samples=None, **kwargs):
        """
        Args:
            k (int): number of neighbors (possibly including the point itself). Optional if it can be guessed
                from weights or metric="neighbors".
            weights (numpy.array): weights of each of the k neighbors, optional. They are supposed to sum to 1.
            q (float): order used to compute the distance to measure. Defaults to dim.
            dim (float): final exponent representing the dimension. Defaults to the dimension, and must be specified
                when the dimension cannot be read from the input (metric is "neighbors" or "precomputed").
            normalize (bool): normalize the density so it corresponds to a probability measure on ℝᵈ.
                Only available for the Euclidean metric, defaults to False.
            n_samples (int): number of sample points used for fitting. Only needed if `normalize` is True and
                metric is "neighbors".
            kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that
                metric="neighbors" means that :func:`transform` expects an array with the distances to
                the k nearest neighbors.
        """
        if weights is None:
            self.k = k
            if k is None:
                assert kwargs.get("metric") == "neighbors", 'Must specify k or weights, unless metric is "neighbors"'
                self.weights = None
            else:
                self.weights = np.full(k, 1.0 / k)
        else:
            self.weights = weights
            self.k = len(weights)
            assert k is None or k == self.k, "k differs from the length of weights"
        self.q = q
        self.dim = dim
        self.params = kwargs
        self.normalize = normalize
        self.n_samples = n_samples

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def fit(self, X, y=None):
        """
        Args:
            X (numpy.array): coordinates for mass points.
        """
        if self.params.setdefault("metric", "euclidean") != "neighbors":
            self.knn = KNearestNeighbors(
                self.k, return_index=False, return_distance=True, sort_results=False, **self.params
            )
            self.knn.fit(X)
            if self.params["metric"] != "precomputed":
                self.n_samples = len(X)
        return self

    def transform(self, X):
        """
        Args:
            X (numpy.array): coordinates for query points, or distance matrix if metric is "precomputed",
                or distances to the k nearest neighbors if metric is "neighbors" (if the array has more
                than k columns, the remaining ones are ignored).
        """
        q = self.q
        dim = self.dim
        if dim is None:
            assert self.params["metric"] not in {
                "neighbors",
                "precomputed",
            }, "dim not specified and cannot guess the dimension"
            dim = len(X[0])
        if q is None:
            q = dim
        k = self.k
        weights = self.weights
        if self.params["metric"] == "neighbors":
            distances = np.asarray(X)
            if weights is None:
                k = distances.shape[1]
                weights = np.full(k, 1.0 / k)
            else:
                distances = distances[:, :k]
        else:
            distances = self.knn.transform(X)
        distances = distances ** q
        dtm = (distances * weights).sum(-1)
        if self.normalize:
            dtm /= (np.arange(1, k + 1) ** (q / dim) * weights).sum()
        density = dtm ** (-dim / q)
        if self.normalize:
            import math

            if self.params["metric"] == "precomputed":
                self.n_samples = len(X[0])
            # Volume of d-ball
            Vd = math.pi ** (dim / 2) / math.gamma(dim / 2 + 1)
            density /= self.n_samples * Vd
        return density
        # We compute too many powers, 1/p in knn then q in dtm, d/q in dtm then whatever in the caller.
        # Add option to skip the final root?