summaryrefslogtreecommitdiff
path: root/src/python/gudhi/point_cloud
diff options
context:
space:
mode:
authorMarc Glisse <marc.glisse@inria.fr>2020-04-12 21:57:51 +0200
committerMarc Glisse <marc.glisse@inria.fr>2020-04-12 21:57:51 +0200
commit83a1bc1fb6124a35d515f4836d2e830f3dbdf0e7 (patch)
tree09f15b27fac2f7279788c1d0db3c03cdeb12b4f7 /src/python/gudhi/point_cloud
parentf9a933862050ca95b3a96d7a8572d62f7f2205a9 (diff)
Parallelize the "precomputed" case of knn
It is supposed to be possible to compile numpy with openmp, but it looks like it isn't done in any of the usual packages. It may be possible to refactor that code so there is less redundancy.
Diffstat (limited to 'src/python/gudhi/point_cloud')
-rw-r--r--src/python/gudhi/point_cloud/knn.py78
1 files changed, 62 insertions, 16 deletions
diff --git a/src/python/gudhi/point_cloud/knn.py b/src/python/gudhi/point_cloud/knn.py
index 6642a3c2..f6870517 100644
--- a/src/python/gudhi/point_cloud/knn.py
+++ b/src/python/gudhi/point_cloud/knn.py
@@ -115,25 +115,71 @@ class KNearestNeighbors:
if metric == "precomputed":
# scikit-learn could handle that, but they insist on calling fit() with an unused square array, which is too unnatural.
- X = numpy.array(X)
if self.return_index:
- neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
- if self.params.get("sort_results", True):
- X = numpy.take_along_axis(X, neighbors, axis=-1)
- ngb_order = numpy.argsort(X, axis=-1)
- neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+ n_jobs = self.params.get("n_jobs", 1)
+ # Supposedly numpy can be compiled with OpenMP and handle this, but nobody does that?!
+ if n_jobs == 1:
+ neighbors = numpy.argpartition(X, k - 1)[:, 0:k]
+ if self.params.get("sort_results", True):
+ X = numpy.take_along_axis(X, neighbors, axis=-1)
+ ngb_order = numpy.argsort(X, axis=-1)
+ neighbors = numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+ else:
+ ngb_order = neighbors
+ if self.return_distance:
+ distances = numpy.take_along_axis(X, ngb_order, axis=-1)
+ return neighbors, distances
+ else:
+ return neighbors
else:
- ngb_order = neighbors
- if self.return_distance:
- distances = numpy.take_along_axis(X, ngb_order, axis=-1)
- return neighbors, distances
- else:
- return neighbors
+ from joblib import Parallel, delayed, effective_n_jobs
+ from sklearn.utils import gen_even_slices
+
+ slices = gen_even_slices(len(X), effective_n_jobs(-1))
+ parallel = Parallel(backend="threading", n_jobs=-1)
+ if self.params.get("sort_results", True):
+
+ def func(M):
+ neighbors = numpy.argpartition(M, k - 1)[:, 0:k]
+ Y = numpy.take_along_axis(M, neighbors, axis=-1)
+ ngb_order = numpy.argsort(Y, axis=-1)
+ return numpy.take_along_axis(neighbors, ngb_order, axis=-1)
+
+ else:
+
+ def func(M):
+ return numpy.argpartition(M, k - 1)[:, 0:k]
+
+ neighbors = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
+ if self.return_distance:
+ distances = numpy.take_along_axis(X, neighbors, axis=-1)
+ return neighbors, distances
+ else:
+ return neighbors
if self.return_distance:
- distances = numpy.partition(X, k - 1)[:, 0:k]
- if self.params.get("sort_results"):
- # partition is not guaranteed to sort the lower half, although it often does
- distances.sort(axis=-1)
+ n_jobs = self.params.get("n_jobs", 1)
+ if n_jobs == 1:
+ distances = numpy.partition(X, k - 1)[:, 0:k]
+ if self.params.get("sort_results"):
+ # partition is not guaranteed to sort the lower half, although it often does
+ distances.sort(axis=-1)
+ else:
+ from joblib import Parallel, delayed, effective_n_jobs
+ from sklearn.utils import gen_even_slices
+
+ if self.params.get("sort_results"):
+
+ def func(M):
+ # Not partitioning in place, because we should not modify the user's array?
+ r = numpy.partition(M, k - 1)[:, 0:k]
+ r.sort(axis=-1)
+ return r
+
+ else:
+ func = lambda M: numpy.partition(M, k - 1)[:, 0:k]
+ slices = gen_even_slices(len(X), effective_n_jobs(-1))
+ parallel = Parallel(backend="threading", n_jobs=-1)
+ distances = numpy.concatenate(parallel(delayed(func)(X[s]) for s in slices))
return distances
return None