author    mathieu <mathieu.carriere3@gmail.com>  2019-09-09 16:17:28 -0400
committer mathieu <mathieu.carriere3@gmail.com>  2019-09-09 16:17:28 -0400
commit    5e8144e8ddb4a39f9f4e60a9fafb5cc57f96acac (patch)
tree      81922ed73c09df253ddb78440d8eb5be298961c3 /src
parent    c18017f78779239cf17dc1a9b0d00a9b8f075a2d (diff)
reviews
Diffstat (limited to 'src')
-rw-r--r--  src/cython/sktda/kernel_methods.py    2
-rw-r--r--  src/cython/sktda/metrics.py          36
-rw-r--r--  src/cython/sktda/preprocessing.py   100
-rw-r--r--  src/cython/sktda/vector_methods.py  126
4 files changed, 149 insertions(+), 115 deletions(-)
diff --git a/src/cython/sktda/kernel_methods.py b/src/cython/sktda/kernel_methods.py
index 6e3f6d5e..d90bf164 100644
--- a/src/cython/sktda/kernel_methods.py
+++ b/src/cython/sktda/kernel_methods.py
@@ -101,7 +101,7 @@ class PersistenceWeightedGaussianKernel(BaseEstimator, TransformerMixin):
W = np.matmul(self.ws_[i][:,np.newaxis], self.ws_[j][np.newaxis,:])
E = (1./(np.sqrt(2*np.pi)*self.bandwidth)) * np.exp(-np.square(pairwise_distances(self.diagrams_[i], self.diagrams_[j]))/(2*np.square(self.bandwidth)))
Xfit[i,j] = np.sum(np.multiply(W, E))
- Xfit[j,i] = X[i,j]
+ Xfit[j,i] = Xfit[i,j]
else:
ws = [ np.array([self.weight(Xp[i][j,:]) for j in range(Xp[i].shape[0])]) for i in range(len(Xp)) ]
if self.kernel_approx is not None:
diff --git a/src/cython/sktda/metrics.py b/src/cython/sktda/metrics.py
index 5dc219cd..18db432a 100644
--- a/src/cython/sktda/metrics.py
+++ b/src/cython/sktda/metrics.py
@@ -145,7 +145,7 @@ class BottleneckDistance(BaseEstimator, TransformerMixin):
class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
"""
-    This is a class for computing the persistence Fisher distance matrix from a list of persistence diagrams. The persistence Fisher distance is defined as the original Fisher distance between the probability distributions obtained by convolving the persistence diagrams with a Gaussian kernel. See papers.nips.cc/paper/8205-persistence-fisher-kernel-a-riemannian-manifold-kernel-for-persistence-diagrams for more details.
+    This is a class for computing the persistence Fisher distance matrix from a list of persistence diagrams. The persistence Fisher distance is defined as the original Fisher distance between the probability distributions obtained by convolving the persistence diagrams with a Gaussian kernel. See http://papers.nips.cc/paper/8205-persistence-fisher-kernel-a-riemannian-manifold-kernel-for-persistence-diagrams for more details.
"""
def __init__(self, bandwidth=1., kernel_approx=None):
"""
@@ -190,8 +190,12 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
if self.kernel_approx is not None:
Z = np.concatenate([self.approx_[i], self.approx_diagonal_[i], self.approx_[j], self.approx_diagonal_[j]], axis=0)
U, V = np.sum(np.concatenate([self.approx_[i], self.approx_diagonal_[j]], axis=0), axis=0), np.sum(np.concatenate([self.approx_[j], self.approx_diagonal_[i]], axis=0), axis=0)
- vectori, vectorj = np.matmul(Z, U.T), np.matmul(Z, V.T)
- vectori, vectorj = vectori/np.sum(vectori), vectorj/np.sum(vectorj)
+ vectori, vectorj = np.abs(np.matmul(Z, U.T)), np.abs(np.matmul(Z, V.T))
+ vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
+ if vectori_sum != 0:
+ vectori = vectori/vectori_sum
+ if vectorj_sum != 0:
+ vectorj = vectorj/vectorj_sum
Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
Xfit[j,i] = Xfit[i,j]
else:
@@ -199,7 +203,11 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
U, V = np.concatenate([self.diagrams_[i], self.diagonal_projections_[j]], axis=0), np.concatenate([self.diagrams_[j], self.diagonal_projections_[i]], axis=0)
vectori = np.sum(np.exp(-np.square(pairwise_distances(Z,U))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
vectorj = np.sum(np.exp(-np.square(pairwise_distances(Z,V))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
- vectori, vectorj = vectori/np.sum(vectori), vectorj/np.sum(vectorj)
+ vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
+ if vectori_sum != 0:
+ vectori = vectori/vectori_sum
+ if vectorj_sum != 0:
+ vectorj = vectorj/vectorj_sum
Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
Xfit[j,i] = Xfit[i,j]
else:
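The pattern added in both branches guards the normalization against an all-zero vector (which arises, e.g., for empty diagrams), so no division by zero occurs. The same idea in isolation:

    import numpy as np

    def normalize_safely(v):
        # Normalize a nonnegative vector to unit mass; leave an
        # all-zero vector (e.g. from an empty diagram) untouched.
        s = np.sum(v)
        return v / s if s != 0 else v

    print(normalize_safely(np.array([1.0, 3.0])))  # [0.25 0.75]
    print(normalize_safely(np.zeros(3)))           # [0. 0. 0.], no division by zero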
@@ -213,20 +221,22 @@ class PersistenceFisherDistance(BaseEstimator, TransformerMixin):
if self.kernel_approx is not None:
Z = np.concatenate([approx[i], approx_diagonal[i], self.approx_[j], self.approx_diagonal_[j]], axis=0)
U, V = np.sum(np.concatenate([approx[i], self.approx_diagonal_[j]], axis=0), axis=0), np.sum(np.concatenate([self.approx_[j], approx_diagonal[i]], axis=0), axis=0)
- vectori, vectorj = np.matmul(Z, U.T), np.matmul(Z, V.T)
- if np.sum(vectori) != 0:
- vectori = vectori/np.sum(vectori)
- if np.sum(vectorj) != 0:
- vectorj = vectorj/np.sum(vectorj)
+ vectori, vectorj = np.abs(np.matmul(Z, U.T)), np.abs(np.matmul(Z, V.T))
+ vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
+ if vectori_sum != 0:
+ vectori = vectori/vectori_sum
+ if vectorj_sum != 0:
+ vectorj = vectorj/vectorj_sum
Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
else:
Z = np.concatenate([X[i], diagonal_projections[i], self.diagrams_[j], self.diagonal_projections_[j]], axis=0)
U, V = np.concatenate([X[i], self.diagonal_projections_[j]], axis=0), np.concatenate([self.diagrams_[j], diagonal_projections[i]], axis=0)
vectori = np.sum(np.exp(-np.square(pairwise_distances(Z,U))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
vectorj = np.sum(np.exp(-np.square(pairwise_distances(Z,V))/(2 * np.square(self.bandwidth)))/(self.bandwidth * np.sqrt(2*np.pi)), axis=1)
- if np.sum(vectori) != 0:
- vectori = vectori/np.sum(vectori)
- if np.sum(vectorj) != 0:
- vectorj = vectorj/np.sum(vectorj)
+ vectori_sum, vectorj_sum = np.sum(vectori), np.sum(vectorj)
+ if vectori_sum != 0:
+ vectori = vectori/vectori_sum
+ if vectorj_sum != 0:
+ vectorj = vectorj/vectorj_sum
Xfit[i,j] = np.arccos( min(np.dot(np.sqrt(vectori), np.sqrt(vectorj)), 1.) )
return Xfit
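End-to-end, the class is used like any scikit-learn transformer. A hypothetical usage sketch, assuming the package is importable as sktda:

    import numpy as np
    from sktda.metrics import PersistenceFisherDistance

    diagrams = [np.array([[0.0, 1.0], [0.3, 0.8]]),
                np.array([[0.1, 0.6]])]
    PF = PersistenceFisherDistance(bandwidth=0.1)
    D = PF.fit(diagrams).transform(diagrams)
    # D is the pairwise distance matrix, with zeros on the diagonal.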
diff --git a/src/cython/sktda/preprocessing.py b/src/cython/sktda/preprocessing.py
index b2bc68f6..512b02f3 100644
--- a/src/cython/sktda/preprocessing.py
+++ b/src/cython/sktda/preprocessing.py
@@ -13,7 +13,7 @@ from sklearn.preprocessing import StandardScaler
class BirthPersistenceTransform(BaseEstimator, TransformerMixin):
"""
- This is a class for the affine transformation (x,y) -> (x,y-x) to be applied on persistence diagrams. It is a particular scaler for persistence diagram that can be given as input for the DiagramPreprocessor class.
+ This is a class for the affine transformation (x,y) -> (x,y-x) to be applied on persistence diagrams.
"""
def __init__(self):
"""
@@ -26,8 +26,8 @@ class BirthPersistenceTransform(BaseEstimator, TransformerMixin):
Fit the BirthPersistenceTransform class on a list of persistence diagrams (this function actually does nothing but is useful when BirthPersistenceTransform is included in a scikit-learn Pipeline).
Parameters:
- X (n x 2 numpy array): input persistence diagram.
- y (n x 1 array): persistence diagram label (unused).
+        X (list of n x 2 numpy arrays): input persistence diagrams.
+ y (n x 1 array): persistence diagram labels (unused).
"""
return self
@@ -36,16 +36,56 @@ class BirthPersistenceTransform(BaseEstimator, TransformerMixin):
Apply the BirthPersistenceTransform function on the persistence diagrams.
Parameters:
- X (n x 2 numpy array): input persistence diagram.
+ X (list of n x 2 numpy array): input persistence diagrams.
Returns:
- Xfit (n x 2 numpy array): transformed persistence diagram.
+ Xfit (list of n x 2 numpy array): transformed persistence diagrams.
"""
- Xfit = np.matmul(X, np.array([[1., -1.],[0., 1.]]))
+ Xfit = []
+ for diag in X:
+ new_diag = np.empty(diag.shape)
+ np.copyto(new_diag, diag)
+ new_diag[:,1] = new_diag[:,1] - new_diag[:,0]
+ Xfit.append(new_diag)
return Xfit
+class Clamping(BaseEstimator, TransformerMixin):
+ """
+ This is a class for clamping values. It can be used as a parameter for the DiagramScaler class, for instance if you want to clamp abscissae or ordinates of persistence diagrams.
+ """
+ def __init__(self, limit=np.inf):
+ """
+ Constructor for the Clamping class.
+
+ Attributes:
+ limit (double): clamping value (default np.inf).
+ """
+ self.limit = limit
+
+ def fit(self, X, y=None):
+ """
+        Fit the Clamping class on a list of lists of values (this function actually does nothing but is useful when Clamping is included in a scikit-learn Pipeline).
+
+ Parameters:
+ X (list of numpy arrays of size n): input values.
+ y (n x 1 array): value labels (unused).
+ """
+ return self
+
+ def transform(self, X):
+ """
+ Clamp each list of values individually.
-class DiagramPreprocessor(BaseEstimator, TransformerMixin):
+ Parameters:
+        X (list of numpy arrays of size n): input list of lists of values.
+
+ Returns:
+        Xfit (list of numpy arrays of size n): output list of lists of values.
+ """
+ Xfit = [np.where(L >= self.limit, self.limit * np.ones(L.shape), L) for L in X]
+ return Xfit
+
+class DiagramScaler(BaseEstimator, TransformerMixin):
"""
This is a class for preprocessing persistence diagrams with a given list of scalers, such as those included in scikit-learn.
"""
@@ -116,6 +156,7 @@ class Padding(BaseEstimator, TransformerMixin):
X (list of n x 2 or n x 1 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
+ self.max_pts = max([len(diag) for diag in X])
return self
def transform(self, X):
@@ -130,12 +171,9 @@ class Padding(BaseEstimator, TransformerMixin):
"""
if self.use:
Xfit, num_diag = [], len(X)
- max_card = max([len(diag) for diag in X])
for diag in X:
- [num_pts, dim] = diag.shape
- diag_pad = np.zeros([max_card, dim+1])
- diag_pad[:num_pts,:dim] = diag
- diag_pad[:num_pts, dim] = np.ones(num_pts)
+ diag_pad = np.pad(diag, ((0,max(0, self.max_pts - diag.shape[0])), (0,1)), "constant", constant_values=((0,0),(0,0)))
+ diag_pad[:diag.shape[0],2] = np.ones(diag.shape[0])
Xfit.append(diag_pad)
else:
Xfit = X
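The rewrite pads each diagram to the cardinality self.max_pts recorded at fit time (instead of recomputing the maximum in transform) and appends a third 0/1 column flagging the original points. The np.pad call in isolation, on a toy diagram:

    import numpy as np

    diag, max_pts = np.array([[0.0, 1.0]]), 3
    # Zero rows up to max_pts, plus one extra zero column on the right.
    diag_pad = np.pad(diag, ((0, max(0, max_pts - diag.shape[0])), (0, 1)),
                      "constant", constant_values=((0, 0), (0, 0)))
    diag_pad[:diag.shape[0], 2] = np.ones(diag.shape[0])  # flag real points
    # [[0., 1., 1.],
    #  [0., 0., 0.],
    #  [0., 0., 0.]]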
@@ -143,9 +181,9 @@ class Padding(BaseEstimator, TransformerMixin):
class ProminentPoints(BaseEstimator, TransformerMixin):
"""
- This is a class for removing points that are close or far from the diagonal in persistence diagrams.
+ This is a class for removing points that are close or far from the diagonal in persistence diagrams. If persistence diagrams are n x 2 numpy arrays (i.e. persistence diagrams with ordinary features), points are ordered and thresholded by distance-to-diagonal. If persistence diagrams are n x 1 numpy arrays (i.e. persistence diagrams with essential features), points are not ordered and thresholded by first coordinate.
"""
- def __init__(self, use=False, num_pts=10, threshold=-1, location="upper", point_type="finite"):
+ def __init__(self, use=False, num_pts=10, threshold=-1, location="upper"):
"""
Constructor for the ProminentPoints class.
@@ -154,13 +192,11 @@ class ProminentPoints(BaseEstimator, TransformerMixin):
location (string): either "upper" or "lower" (default "upper"). Whether to keep the points that are far away ("upper") or close ("lower") to the diagonal.
num_pts (int): cardinality threshold (default 10). If location == "upper", keep the top **num_pts** points that are the farthest away from the diagonal. If location == "lower", keep the top **num_pts** points that are the closest to the diagonal.
threshold (double): distance-to-diagonal threshold (default -1). If location == "upper", keep the points that are at least at a distance **threshold** from the diagonal. If location == "lower", keep the points that are at most at a distance **threshold** from the diagonal.
- point_type (string): either "finite" if persistence diagrams are n x 2 numpy arrays, or "essential" if persistence diagrams are n x 1 numpy arrays (default "finite"). If "finite", points are ordered and thresholded by distance-to-diagonal. If "essential", points are ordered and thresholded by first coordinate.
"""
self.num_pts = num_pts
self.threshold = threshold
self.use = use
self.location = location
- self.point_type = point_type
def fit(self, X, y=None):
"""
@@ -174,7 +210,7 @@ class ProminentPoints(BaseEstimator, TransformerMixin):
def transform(self, X):
"""
- If location == "upper", first select the top **num_pts** points that are the farthest away from the diagonal, then select and return from these points the ones that are at least at distance **threshold** from the diagonal for each persistence diagram individually. If location == "lower", first select the top **num_pts** points that are the closest to the diagonal, then select and return from these points the ones that are at most at distance **threshold** from the diagonal for each persistence diagram individually. If point_type == "essential", do the same with first coordinate instead of distance-to-diagonal.
+ If location == "upper", first select the top **num_pts** points that are the farthest away from the diagonal, then select and return from these points the ones that are at least at distance **threshold** from the diagonal for each persistence diagram individually. If location == "lower", first select the top **num_pts** points that are the closest to the diagonal, then select and return from these points the ones that are at most at distance **threshold** from the diagonal for each persistence diagram individually.
Parameters:
X (list of n x 2 or n x 1 numpy arrays): input persistence diagrams.
@@ -186,16 +222,16 @@ class ProminentPoints(BaseEstimator, TransformerMixin):
Xfit, num_diag = [], len(X)
for i in range(num_diag):
diag = X[i]
- if self.point_type == "finite":
+ if diag.shape[1] >= 2:
if diag.shape[0] > 0:
- pers = np.abs(np.matmul(diag[:,:2], [-1., 1.]))
+ pers = np.abs(diag[:,1] - diag[:,0])
idx_thresh = pers >= self.threshold
- thresh_diag, thresh_pers = diag[idx_thresh.flatten()], pers[idx_thresh.flatten()]
+ thresh_diag, thresh_pers = diag[idx_thresh], pers[idx_thresh]
sort_index = np.flip(np.argsort(thresh_pers, axis=None), 0)
if self.location == "upper":
new_diag = thresh_diag[sort_index[:min(self.num_pts, thresh_diag.shape[0])],:]
if self.location == "lower":
- new_diag = np.concatenate( [ thresh_diag[sort_index[min(self.num_pts, thresh_diag.shape[0]):],:], diag[~idx_thresh.flatten()] ], axis=0)
+ new_diag = np.concatenate( [ thresh_diag[sort_index[min(self.num_pts, thresh_diag.shape[0]):],:], diag[~idx_thresh] ], axis=0)
else:
new_diag = diag
@@ -203,11 +239,11 @@ class ProminentPoints(BaseEstimator, TransformerMixin):
if diag.shape[0] > 0:
birth = diag[:,:1]
idx_thresh = birth >= self.threshold
- thresh_diag, thresh_birth = diag[idx_thresh.flatten()], birth[idx_thresh.flatten()]
+ thresh_diag, thresh_birth = diag[idx_thresh], birth[idx_thresh]
if self.location == "upper":
new_diag = thresh_diag[:min(self.num_pts, thresh_diag.shape[0]),:]
if self.location == "lower":
- new_diag = np.concatenate( [ thresh_diag[min(self.num_pts, thresh_diag.shape[0]):,:], diag[~idx_thresh.flatten()] ], axis=0)
+ new_diag = np.concatenate( [ thresh_diag[min(self.num_pts, thresh_diag.shape[0]):,:], diag[~idx_thresh] ], axis=0)
else:
new_diag = diag
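With point_type gone, ProminentPoints now dispatches on the array width: diagrams with at least two columns are thresholded by distance to the diagonal, one-column diagrams by first coordinate. A hypothetical usage sketch, assuming the sktda import path:

    import numpy as np
    from sktda.preprocessing import ProminentPoints

    diags = [np.array([[0.0, 0.1], [0.0, 5.0], [1.0, 1.2]])]
    # Among points with persistence >= 0.5, keep at most 2.
    pp = ProminentPoints(use=True, num_pts=2, threshold=0.5, location="upper")
    pp.fit_transform(diags)  # [array([[0., 5.]])] -- only one point passes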
@@ -254,21 +290,9 @@ class DiagramSelector(BaseEstimator, TransformerMixin):
if self.use:
Xfit, num_diag = [], len(X)
if self.point_type == "finite":
- for i in range(num_diag):
- diag = X[i]
- if diag.shape[0] != 0:
- idx_fin = diag[:,1] != self.limit
- Xfit.append(diag[idx_fin,:])
- else:
- Xfit.append(diag)
- if self.point_type == "essential":
- for i in range(num_diag):
- diag = X[i]
- if diag.shape[0] != 0:
- idx_ess = diag[:,1] == self.limit
- Xfit.append(np.delete(diag,1,1)[idx_ess,:])
- else:
- Xfit.append(np.delete(diag,1,1))
+ Xfit = [ diag[diag[:,1] < self.limit] if diag.shape[0] != 0 else diag for diag in X]
+ else:
+ Xfit = [ diag[diag[:,1] == self.limit, 0:1] if diag.shape[0] != 0 else diag for diag in X]
else:
Xfit = X
return Xfit
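The selector is now two list comprehensions: finite points are the rows with death strictly below limit, essential points those with death equal to limit (keeping only the birth column). A hypothetical sketch, assuming the sktda import path and that limit is a constructor parameter:

    import numpy as np
    from sktda.preprocessing import DiagramSelector

    diags = [np.array([[0.0, 1.0], [0.5, np.inf]])]
    DiagramSelector(use=True, limit=np.inf, point_type="finite").fit_transform(diags)
    # [array([[0., 1.]])]
    DiagramSelector(use=True, limit=np.inf, point_type="essential").fit_transform(diags)
    # [array([[0.5]])]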
diff --git a/src/cython/sktda/vector_methods.py b/src/cython/sktda/vector_methods.py
index 99c2f420..3862f815 100644
--- a/src/cython/sktda/vector_methods.py
+++ b/src/cython/sktda/vector_methods.py
@@ -8,7 +8,7 @@ from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.neighbors import DistanceMetric
-from .preprocessing import DiagramPreprocessor, BirthPersistenceTransform
+from .preprocessing import DiagramScaler, BirthPersistenceTransform
#############################################
# Finite Vectorization methods ##############
@@ -40,15 +40,15 @@ class PersistenceImage(BaseEstimator, TransformerMixin):
y (n x 1 array): persistence diagram labels (unused).
"""
if np.isnan(np.array(self.im_range)).any():
- new_X = DiagramPreprocessor(use=True, scalers=[([0,1], BirthPersistenceTransform())]).fit_transform(X)
- pre = DiagramPreprocessor(use=True, scalers=[([0,1], MinMaxScaler())]).fit(new_X,y)
- [mx,my],[Mx,My] = pre.scalers[0][1].data_min_, pre.scalers[0][1].data_max_
+ new_X = BirthPersistenceTransform().fit_transform(X)
+ pre = DiagramScaler(use=True, scalers=[([0], MinMaxScaler()), ([1], MinMaxScaler())]).fit(new_X,y)
+ [mx,my],[Mx,My] = [pre.scalers[0][1].data_min_[0], pre.scalers[1][1].data_min_[0]], [pre.scalers[0][1].data_max_[0], pre.scalers[1][1].data_max_[0]]
self.im_range = np.where(np.isnan(np.array(self.im_range)), np.array([mx, Mx, my, My]), np.array(self.im_range))
return self
def transform(self, X):
"""
- Compute the persistence image for each persistence diagram individually and concatenate the results.
+ Compute the persistence image for each persistence diagram individually and store the results in a single numpy array.
Parameters:
X (list of n x 2 numpy arrays): input persistence diagrams.
@@ -57,13 +57,13 @@ class PersistenceImage(BaseEstimator, TransformerMixin):
Xfit (numpy array with shape (number of diagrams) x (number of pixels = **resolution[0]** x **resolution[1]**)): output persistence images.
"""
num_diag, Xfit = len(X), []
- new_X = DiagramPreprocessor(use=True, scalers=[([0,1], BirthPersistenceTransform())]).fit_transform(X)
+ new_X = BirthPersistenceTransform().fit_transform(X)
for i in range(num_diag):
diagram, num_pts_in_diag = new_X[i], X[i].shape[0]
- w = np.ones(num_pts_in_diag)
+ w = np.empty(num_pts_in_diag)
for j in range(num_pts_in_diag):
w[j] = self.weight(diagram[j,:])
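Downstream, the fit/transform pair works as before, with any NaN entries of im_range now inferred per coordinate by the two MinMaxScaler instances. A hypothetical usage sketch, assuming the sktda import path and constructor parameter names matching the attributes used above; note the weight is evaluated on birth-persistence coordinates:

    import numpy as np
    from sktda.vector_methods import PersistenceImage

    diags = [np.array([[0.0, 1.0], [0.2, 0.7]])]
    PI = PersistenceImage(bandwidth=0.1,
                          weight=lambda p: p[1],  # weight by persistence
                          resolution=[20, 20])    # im_range inferred in fit()
    img = PI.fit(diags).transform(diags)          # shape (1, 400)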
@@ -81,29 +81,29 @@ class Landscape(BaseEstimator, TransformerMixin):
"""
This is a class for computing persistence landscapes from a list of persistence diagrams. A persistence landscape is a collection of 1D piecewise-linear functions computed from the rank function associated to the persistence diagram. These piecewise-linear functions are then sampled uniformly on a given range and the corresponding vectors of samples are concatenated and returned. See http://jmlr.org/papers/v16/bubenik15a.html for more details.
"""
- def __init__(self, num_landscapes=5, resolution=100, ls_range=[np.nan, np.nan]):
+ def __init__(self, num_landscapes=5, resolution=100, sample_range=[np.nan, np.nan]):
"""
Constructor for the Landscape class.
Attributes:
num_landscapes (int): number of piecewise-linear functions to output (default 5).
resolution (int): number of samples for all piecewise-linear functions (default 100).
- ls_range ([double, double]): minimum and maximum of all piecewise-linear function domains, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
+ sample_range ([double, double]): minimum and maximum of all piecewise-linear function domains, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
"""
- self.num_landscapes, self.resolution, self.ls_range = num_landscapes, resolution, ls_range
+ self.num_landscapes, self.resolution, self.sample_range = num_landscapes, resolution, sample_range
def fit(self, X, y=None):
"""
- Fit the Landscape class on a list of persistence diagrams: if any of the values in **ls_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
+ Fit the Landscape class on a list of persistence diagrams: if any of the values in **sample_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
Parameters:
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- if np.isnan(np.array(self.ls_range)).any():
- pre = DiagramPreprocessor(use=True, scalers=[([0,1], MinMaxScaler())]).fit(X,y)
- [mx,my],[Mx,My] = pre.scalers[0][1].data_min_, pre.scalers[0][1].data_max_
- self.ls_range = np.where(np.isnan(np.array(self.ls_range)), np.array([mx, My]), np.array(self.ls_range))
+ if np.isnan(np.array(self.sample_range)).any():
+ pre = DiagramScaler(use=True, scalers=[([0], MinMaxScaler()), ([1], MinMaxScaler())]).fit(X,y)
+ [mx,my],[Mx,My] = [pre.scalers[0][1].data_min_[0], pre.scalers[1][1].data_min_[0]], [pre.scalers[0][1].data_max_[0], pre.scalers[1][1].data_max_[0]]
+ self.sample_range = np.where(np.isnan(np.array(self.sample_range)), np.array([mx, My]), np.array(self.sample_range))
return self
def transform(self, X):
@@ -117,7 +117,7 @@ class Landscape(BaseEstimator, TransformerMixin):
Xfit (numpy array with shape (number of diagrams) x (number of samples = **num_landscapes** x **resolution**)): output persistence landscapes.
"""
num_diag, Xfit = len(X), []
- x_values = np.linspace(self.ls_range[0], self.ls_range[1], self.resolution)
+ x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
step_x = x_values[1] - x_values[0]
for i in range(num_diag):
@@ -131,19 +131,19 @@ class Landscape(BaseEstimator, TransformerMixin):
events.append([])
for j in range(num_pts_in_diag):
- [px,py] = diagram[j,:]
- min_idx = np.minimum(np.maximum(np.ceil((px - self.ls_range[0]) / step_x).astype(int), 0), self.resolution)
- mid_idx = np.minimum(np.maximum(np.ceil((0.5*(py+px) - self.ls_range[0]) / step_x).astype(int), 0), self.resolution)
- max_idx = np.minimum(np.maximum(np.ceil((py - self.ls_range[0]) / step_x).astype(int), 0), self.resolution)
+ [px,py] = diagram[j,:2]
+ min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+ mid_idx = np.minimum(np.maximum(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+ max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
if min_idx < self.resolution and max_idx > 0:
- landscape_value = self.ls_range[0] + min_idx * step_x - px
+ landscape_value = self.sample_range[0] + min_idx * step_x - px
for k in range(min_idx, mid_idx):
events[k].append(landscape_value)
landscape_value += step_x
- landscape_value = py - self.ls_range[0] - mid_idx * step_x
+ landscape_value = py - self.sample_range[0] - mid_idx * step_x
for k in range(mid_idx, max_idx):
events[k].append(landscape_value)
landscape_value -= step_x
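With ls_range renamed to sample_range, typical usage is unchanged. A hypothetical sketch, assuming the sktda import path:

    import numpy as np
    from sktda.vector_methods import Landscape

    diags = [np.array([[0.0, 1.0], [0.2, 0.8]])]
    LS = Landscape(num_landscapes=2, resolution=50)  # sample_range inferred in fit()
    L = LS.fit(diags).transform(diags)               # shape (1, 2 * 50)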
@@ -163,29 +163,29 @@ class Silhouette(BaseEstimator, TransformerMixin):
"""
This is a class for computing persistence silhouettes from a list of persistence diagrams. A persistence silhouette is computed by taking a weighted average of the collection of 1D piecewise-linear functions given by the persistence landscapes, and then by uniformly sampling this average on a given range. Finally, the corresponding vector of samples is returned. See https://arxiv.org/abs/1312.0308 for more details.
"""
- def __init__(self, weight=lambda x: 1, resolution=100, sh_range=[np.nan, np.nan]):
+ def __init__(self, weight=lambda x: 1, resolution=100, sample_range=[np.nan, np.nan]):
"""
Constructor for the Silhouette class.
Attributes:
weight (function): weight function for the persistence diagram points (default constant function, i.e. lambda x: 1). This function must be defined on 2D points, i.e. on lists or numpy arrays of the form [p_x,p_y].
resolution (int): number of samples for the weighted average (default 100).
- sh_range ([double, double]): minimum and maximum for the weighted average domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
+ sample_range ([double, double]): minimum and maximum for the weighted average domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
"""
- self.weight, self.resolution, self.sh_range = weight, resolution, sh_range
+ self.weight, self.resolution, self.sample_range = weight, resolution, sample_range
def fit(self, X, y=None):
"""
- Fit the Silhouette class on a list of persistence diagrams: if any of the values in **sh_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
+ Fit the Silhouette class on a list of persistence diagrams: if any of the values in **sample_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
Parameters:
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- if np.isnan(np.array(self.sh_range)).any():
- pre = DiagramPreprocessor(use=True, scalers=[([0,1], MinMaxScaler())]).fit(X,y)
- [mx,my],[Mx,My] = pre.scalers[0][1].data_min_, pre.scalers[0][1].data_max_
- self.sh_range = np.where(np.isnan(np.array(self.sh_range)), np.array([mx, My]), np.array(self.sh_range))
+ if np.isnan(np.array(self.sample_range)).any():
+ pre = DiagramScaler(use=True, scalers=[([0], MinMaxScaler()), ([1], MinMaxScaler())]).fit(X,y)
+ [mx,my],[Mx,My] = [pre.scalers[0][1].data_min_[0], pre.scalers[1][1].data_min_[0]], [pre.scalers[0][1].data_max_[0], pre.scalers[1][1].data_max_[0]]
+ self.sample_range = np.where(np.isnan(np.array(self.sample_range)), np.array([mx, My]), np.array(self.sample_range))
return self
def transform(self, X):
@@ -199,7 +199,7 @@ class Silhouette(BaseEstimator, TransformerMixin):
Xfit (numpy array with shape (number of diagrams) x (**resolution**)): output persistence silhouettes.
"""
num_diag, Xfit = len(X), []
- x_values = np.linspace(self.sh_range[0], self.sh_range[1], self.resolution)
+ x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
step_x = x_values[1] - x_values[0]
for i in range(num_diag):
@@ -213,20 +213,20 @@ class Silhouette(BaseEstimator, TransformerMixin):
for j in range(num_pts_in_diag):
- [px,py] = diagram[j,:]
+ [px,py] = diagram[j,:2]
weight = weights[j] / total_weight
- min_idx = np.minimum(np.maximum(np.ceil((px - self.sh_range[0]) / step_x).astype(int), 0), self.resolution)
- mid_idx = np.minimum(np.maximum(np.ceil((0.5*(py+px) - self.sh_range[0]) / step_x).astype(int), 0), self.resolution)
- max_idx = np.minimum(np.maximum(np.ceil((py - self.sh_range[0]) / step_x).astype(int), 0), self.resolution)
+ min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+ mid_idx = np.minimum(np.maximum(np.ceil((0.5*(py+px) - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+ max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
if min_idx < self.resolution and max_idx > 0:
- silhouette_value = self.sh_range[0] + min_idx * step_x - px
+ silhouette_value = self.sample_range[0] + min_idx * step_x - px
for k in range(min_idx, mid_idx):
sh[k] += weight * silhouette_value
silhouette_value += step_x
- silhouette_value = py - self.sh_range[0] - mid_idx * step_x
+ silhouette_value = py - self.sample_range[0] - mid_idx * step_x
for k in range(mid_idx, max_idx):
sh[k] += weight * silhouette_value
silhouette_value -= step_x
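Silhouette follows the same renaming. A hypothetical sketch with a persistence-based weight, assuming the sktda import path; here the weight is evaluated on raw (birth, death) points:

    import numpy as np
    from sktda.vector_methods import Silhouette

    diags = [np.array([[0.0, 1.0], [0.2, 0.8]])]
    SH = Silhouette(weight=lambda p: p[1] - p[0],  # weight by persistence
                    resolution=100)                # sample_range inferred in fit()
    sh = SH.fit(diags).transform(diags)            # shape (1, 100)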
@@ -241,28 +241,28 @@ class BettiCurve(BaseEstimator, TransformerMixin):
"""
This is a class for computing Betti curves from a list of persistence diagrams. A Betti curve is a 1D piecewise-constant function obtained from the rank function. It is sampled uniformly on a given range and the vector of samples is returned. See https://www.researchgate.net/publication/316604237_Time_Series_Classification_via_Topological_Data_Analysis for more details.
"""
- def __init__(self, resolution=100, bc_range=[np.nan, np.nan]):
+ def __init__(self, resolution=100, sample_range=[np.nan, np.nan]):
"""
Constructor for the BettiCurve class.
Attributes:
resolution (int): number of samples for the piecewise-constant function (default 100).
- bc_range ([double, double]): minimum and maximum of the piecewise-constant function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
+ sample_range ([double, double]): minimum and maximum of the piecewise-constant function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method.
"""
- self.resolution, self.bc_range = resolution, bc_range
+ self.resolution, self.sample_range = resolution, sample_range
def fit(self, X, y=None):
"""
- Fit the BettiCurve class on a list of persistence diagrams: if any of the values in **bc_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
+ Fit the BettiCurve class on a list of persistence diagrams: if any of the values in **sample_range** is numpy.nan, replace it with the corresponding value computed on the given list of persistence diagrams.
Parameters:
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- if np.isnan(np.array(self.bc_range)).any():
- pre = DiagramPreprocessor(use=True, scalers=[([0,1], MinMaxScaler())]).fit(X,y)
- [mx,my],[Mx,My] = pre.scalers[0][1].data_min_, pre.scalers[0][1].data_max_
- self.bc_range = np.where(np.isnan(np.array(self.bc_range)), np.array([mx, My]), np.array(self.bc_range))
+ if np.isnan(np.array(self.sample_range)).any():
+ pre = DiagramScaler(use=True, scalers=[([0], MinMaxScaler()), ([1], MinMaxScaler())]).fit(X,y)
+ [mx,my],[Mx,My] = [pre.scalers[0][1].data_min_[0], pre.scalers[1][1].data_min_[0]], [pre.scalers[0][1].data_max_[0], pre.scalers[1][1].data_max_[0]]
+ self.sample_range = np.where(np.isnan(np.array(self.sample_range)), np.array([mx, My]), np.array(self.sample_range))
return self
def transform(self, X):
@@ -276,7 +276,7 @@ class BettiCurve(BaseEstimator, TransformerMixin):
Xfit (numpy array with shape (number of diagrams) x (**resolution**)): output Betti curves.
"""
num_diag, Xfit = len(X), []
- x_values = np.linspace(self.bc_range[0], self.bc_range[1], self.resolution)
+ x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
step_x = x_values[1] - x_values[0]
for i in range(num_diag):
@@ -285,9 +285,9 @@ class BettiCurve(BaseEstimator, TransformerMixin):
bc = np.zeros(self.resolution)
for j in range(num_pts_in_diag):
- [px,py] = diagram[j,:]
- min_idx = np.minimum(np.maximum(np.ceil((px - self.bc_range[0]) / step_x).astype(int), 0), self.resolution)
- max_idx = np.minimum(np.maximum(np.ceil((py - self.bc_range[0]) / step_x).astype(int), 0), self.resolution)
+ [px,py] = diagram[j,:2]
+ min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+ max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
for k in range(min_idx, max_idx):
bc[k] += 1
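BettiCurve gets the same rename. A hypothetical usage sketch, assuming the sktda import path:

    import numpy as np
    from sktda.vector_methods import BettiCurve

    diags = [np.array([[0.0, 1.0], [0.2, 0.8]])]
    BC = BettiCurve(resolution=100)      # sample_range inferred in fit()
    bc = BC.fit(diags).transform(diags)  # shape (1, 100)
    # bc[0][k] counts the points whose [birth, death) range covers x_values[k].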
@@ -301,7 +301,7 @@ class Entropy(BaseEstimator, TransformerMixin):
"""
This is a class for computing persistence entropy. Persistence entropy is a statistic for persistence diagrams inspired by Shannon entropy. This statistic can also be used to compute a feature vector, called the entropy summary function. See https://arxiv.org/pdf/1803.08304.pdf for more details.
"""
- def __init__(self, mode="scalar", normalized=True, resolution=100, ent_range=[np.nan, np.nan]):
+ def __init__(self, mode="scalar", normalized=True, resolution=100, sample_range=[np.nan, np.nan]):
"""
Constructor for the Entropy class.
@@ -309,9 +309,9 @@ class Entropy(BaseEstimator, TransformerMixin):
mode (string): what entropy to compute: either "scalar" for computing the entropy statistics, or "vector" for computing the entropy summary functions (default "scalar").
normalized (bool): whether to normalize the entropy summary function (default True). Used only if **mode** = "vector".
resolution (int): number of samples for the entropy summary function (default 100). Used only if **mode** = "vector".
- ent_range ([double, double]): minimum and maximum of the entropy summary function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method. Used only if **mode** = "vector".
+ sample_range ([double, double]): minimum and maximum of the entropy summary function domain, of the form [x_min, x_max] (default [numpy.nan, numpy.nan]). It is the interval on which samples will be drawn uniformly. If one of the values is numpy.nan, it can be computed from the persistence diagrams with the fit() method. Used only if **mode** = "vector".
"""
- self.mode, self.normalized, self.resolution, self.ent_range = mode, normalized, resolution, ent_range
+ self.mode, self.normalized, self.resolution, self.sample_range = mode, normalized, resolution, sample_range
def fit(self, X, y=None):
"""
@@ -321,10 +321,10 @@ class Entropy(BaseEstimator, TransformerMixin):
X (list of n x 2 numpy arrays): input persistence diagrams.
y (n x 1 array): persistence diagram labels (unused).
"""
- if np.isnan(np.array(self.ent_range)).any():
- pre = DiagramPreprocessor(use=True, scalers=[([0,1], MinMaxScaler())]).fit(X,y)
- [mx,my],[Mx,My] = pre.scalers[0][1].data_min_, pre.scalers[0][1].data_max_
- self.ent_range = np.where(np.isnan(np.array(self.ent_range)), np.array([mx, My]), np.array(self.ent_range))
+ if np.isnan(np.array(self.sample_range)).any():
+ pre = DiagramScaler(use=True, scalers=[([0], MinMaxScaler()), ([1], MinMaxScaler())]).fit(X,y)
+ [mx,my],[Mx,My] = [pre.scalers[0][1].data_min_[0], pre.scalers[1][1].data_min_[0]], [pre.scalers[0][1].data_max_[0], pre.scalers[1][1].data_max_[0]]
+ self.sample_range = np.where(np.isnan(np.array(self.sample_range)), np.array([mx, My]), np.array(self.sample_range))
return self
def transform(self, X):
@@ -338,14 +338,14 @@ class Entropy(BaseEstimator, TransformerMixin):
Xfit (numpy array with shape (number of diagrams) x (1 if **mode** = "scalar" else **resolution**)): output entropy.
"""
num_diag, Xfit = len(X), []
- x_values = np.linspace(self.ent_range[0], self.ent_range[1], self.resolution)
+ x_values = np.linspace(self.sample_range[0], self.sample_range[1], self.resolution)
step_x = x_values[1] - x_values[0]
- new_X = DiagramPreprocessor(use=True, scalers=[([0,1], BirthPersistenceTransform())]).fit_transform(X)
+ new_X = BirthPersistenceTransform().fit_transform(X)
for i in range(num_diag):
orig_diagram, diagram, num_pts_in_diag = X[i], new_X[i], X[i].shape[0]
- new_diagram = DiagramPreprocessor(use=True, scalers=[([1], MaxAbsScaler())]).fit_transform([diagram])[0]
+ new_diagram = DiagramScaler(use=True, scalers=[([1], MaxAbsScaler())]).fit_transform([diagram])[0]
if self.mode == "scalar":
ent = - np.sum( np.multiply(new_diagram[:,1], np.log(new_diagram[:,1])) )
@@ -354,9 +354,9 @@ class Entropy(BaseEstimator, TransformerMixin):
else:
ent = np.zeros(self.resolution)
for j in range(num_pts_in_diag):
- [px,py] = orig_diagram[j,:]
- min_idx = np.minimum(np.maximum(np.ceil((px - self.ent_range[0]) / step_x).astype(int), 0), self.resolution)
- max_idx = np.minimum(np.maximum(np.ceil((py - self.ent_range[0]) / step_x).astype(int), 0), self.resolution)
+ [px,py] = orig_diagram[j,:2]
+ min_idx = np.minimum(np.maximum(np.ceil((px - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
+ max_idx = np.minimum(np.maximum(np.ceil((py - self.sample_range[0]) / step_x).astype(int), 0), self.resolution)
for k in range(min_idx, max_idx):
ent[k] += (-1) * new_diagram[j,1] * np.log(new_diagram[j,1])
if self.normalized:
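Both modes work through the renamed sample_range. A hypothetical usage sketch, assuming the sktda import path:

    import numpy as np
    from sktda.vector_methods import Entropy

    diags = [np.array([[0.0, 1.0], [0.2, 0.8]])]
    Entropy(mode="scalar").fit(diags).transform(diags)
    # shape (1, 1): one entropy statistic per diagram
    Entropy(mode="vector", resolution=50).fit(diags).transform(diags)
    # shape (1, 50): the (normalized) entropy summary function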
@@ -411,7 +411,7 @@ class TopologicalVector(BaseEstimator, TransformerMixin):
for i in range(num_diag):
diagram, num_pts_in_diag = X[i], X[i].shape[0]
- pers = 0.5 * np.matmul(diagram, np.array([[-1.0],[1.0]]))
+ pers = 0.5 * (diagram[:,1]-diagram[:,0])
min_pers = np.minimum(pers,np.transpose(pers))
distances = DistanceMetric.get_metric("chebyshev").pairwise(diagram)
vect = np.flip(np.sort(np.triu(np.minimum(distances, min_pers)), axis=None), 0)
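The last change replaces a matrix product against [[-1.],[1.]] (which yields an n x 1 column) with an elementwise difference (which yields a flat length-n array); the half-persistence values themselves are identical. A quick check on a toy diagram:

    import numpy as np

    diagram = np.array([[0.0, 1.0], [0.2, 0.8]])
    old = 0.5 * np.matmul(diagram, np.array([[-1.0], [1.0]]))  # shape (2, 1)
    new = 0.5 * (diagram[:, 1] - diagram[:, 0])                # shape (2,)
    assert np.allclose(old.flatten(), new)  # same values, different shape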