From d86676e247bfa6f29b625a9a5752bf2a2fab438f Mon Sep 17 00:00:00 2001 From: Marc Glisse Date: Mon, 11 May 2020 21:31:34 +0200 Subject: Normalize the density if asked --- src/python/gudhi/point_cloud/dtm.py | 38 +++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'src/python/gudhi/point_cloud') diff --git a/src/python/gudhi/point_cloud/dtm.py b/src/python/gudhi/point_cloud/dtm.py index ef6eef05..2f30908d 100644 --- a/src/python/gudhi/point_cloud/dtm.py +++ b/src/python/gudhi/point_cloud/dtm.py @@ -74,19 +74,27 @@ class DistanceToMeasure: class DTMDensity: """ Density estimator based on the distance to the empirical measure defined by a point set, as defined - in :cite:`dtmdensity`. Note that this implementation does not renormalize so the total measure is not 1, - see the reference for suitable normalization factors in the Euclidean case. + in :cite:`dtmdensity`. Note that this implementation only renormalizes when asked, and the renormalization + only works for a Euclidean metric, so in other cases the total measure may not be 1. """ - def __init__(self, k=None, weights=None, q=None, dim=None, **kwargs): + def __init__(self, k=None, weights=None, q=None, dim=None, normalize=False, n_samples=None, **kwargs): """ Args: k (int): number of neighbors (possibly including the point itself). - weights (numpy.array): weights of each of the k neighbors, optional. + weights (numpy.array): weights of each of the k neighbors, optional. They are supposed to sum to 1. q (float): order used to compute the distance to measure. Defaults to dim. dim (float): final exponent representing the dimension. Defaults to the dimension, and must be specified when the dimension cannot be read from the input (metric="neighbors" or metric="precomputed"). - kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNN`, except that metric="neighbors" means that + + .. note:: when the dimension is high, using it as an exponent can quickly lead to under- or overflows. + We recommend using a small fixed value instead in those cases, even if it won't have the same nice + theoretical properties as the dimension. + normalize (bool): normalize the density so it corresponds to a probability measure on ℝᵈ. + Only available for the Euclidean metric, defaults to False. + n_samples (int): number of sample points used for fitting. Only needed if `normalize` is True and + metric is "neighbors". + kwargs: same parameters as :class:`~gudhi.point_cloud.knn.KNearestNeighbors`, except that metric="neighbors" means that :func:`transform` expects an array with the distances to the k nearest neighbors. """ if weights is None: @@ -100,6 +108,8 @@ class DTMDensity: self.q = q self.dim = dim self.params = kwargs + self.normalize=normalize + self.n_samples=n_samples def fit_transform(self, X, y=None): return self.fit(X).transform(X) @@ -110,8 +120,10 @@ class DTMDensity: X (numpy.array): coordinates for mass points. """ if self.params.setdefault("metric", "euclidean") != "neighbors": - self.knn = KNN(self.k, return_index=False, return_distance=True, sort_results=False, **self.params) + self.knn = KNearestNeighbors(self.k, return_index=False, return_distance=True, sort_results=False, **self.params) self.knn.fit(X) + if self.params["metric"] != "precomputed": + self.n_samples = len(X) return self def transform(self, X): @@ -136,7 +148,17 @@ class DTMDensity: else: distances = self.knn.transform(X) distances = distances ** q - dtm = (distances * weights).sum(-1) - return dtm ** (-dim / q) + dtm = (distances * self.weights).sum(-1) + if self.normalize: + dtm /= (np.arange(1, self.k + 1) ** (q / dim) * self.weights).sum() + density = dtm ** (-dim / q) + if self.normalize: + import math + if self.params["metric"] == "precomputed": + self.n_samples = len(X[0]) + # Volume of d-ball + Vd = math.pi ** (dim / 2) / math.gamma(dim / 2 + 1) + density /= self.n_samples * Vd + return density # We compute too many powers, 1/p in knn then q in dtm, d/q in dtm then whatever in the caller. # Add option to skip the final root? -- cgit v1.2.3