summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/python/CMakeLists.txt2
-rw-r--r--src/python/gudhi/datasets/remote.py85
-rw-r--r--src/python/test/test_remote_datasets.py22
3 files changed, 109 insertions, 0 deletions
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 98f2b85f..6f117588 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -542,6 +542,8 @@ if(PYTHONINTERP_FOUND)
add_gudhi_py_test(test_dtm_rips_complex)
endif()
+ # Fetch remote datasets
+ add_gudhi_py_test(test_remote_datasets)
# Set missing or not modules
set(GUDHI_MODULES ${GUDHI_MODULES} "python" CACHE INTERNAL "GUDHI_MODULES")
diff --git a/src/python/gudhi/datasets/remote.py b/src/python/gudhi/datasets/remote.py
new file mode 100644
index 00000000..27076785
--- /dev/null
+++ b/src/python/gudhi/datasets/remote.py
@@ -0,0 +1,85 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Hind Montassif
+#
+# Copyright (C) 2021 Inria
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+import hashlib
+
+from os.path import join, exists
+from os import makedirs
+
+from urllib.request import urlretrieve
+
+
+def _checksum_sha256(file_path):
+    """
+    Return the SHA256 hex digest of the content of a file.
+
+    The file is opened in binary mode and consumed in blocks of 4096 bytes,
+    so it is never loaded in memory as a whole.
+
+    Parameters
+    ----------
+    file_path: string
+        Path of the file to hash.
+
+    Returns
+    -------
+    string
+        The hexadecimal SHA256 digest of the bytes stored in file_path.
+    """
+    block_size = 4096
+    digest = hashlib.sha256()
+    with open(file_path, "rb") as stream:
+        # iter() with a sentinel stops at the first empty read, i.e. at end of file
+        for block in iter(lambda: stream.read(block_size), b""):
+            digest.update(block)
+    return digest.hexdigest()
+
+def fetch(url, filename, dirname = "remote_datasets", checksum_flag = False, file_checksum = None):
+    """
+    Fetch the wanted dataset from the given url and save it in dirname/filename.
+
+    Parameters
+    ----------
+    url : string
+        The url to fetch the dataset from.
+    filename : string
+        The name of the file the downloaded content is saved to.
+    dirname : string
+        The directory to save the file to. It is created if it does not exist yet.
+        Default is 'remote_datasets'.
+    checksum_flag : boolean
+        To set if the user wants the file checksum to be verified. Default is 'False'.
+        Note that if checksum_flag is set to 'True', the file_checksum must be provided.
+    file_checksum : string
+        The expected SHA256 hex digest, checked against the one computed on the downloaded file.
+        To be considered, checksum_flag must be set to 'True'.
+        Default is 'None'.
+
+    Returns
+    -------
+    file_path: string
+        Full path of the created file.
+
+    Raises
+    ------
+    ValueError
+        If checksum_flag is 'True' and file_checksum is 'None'. Nothing is created or downloaded in that case.
+    IOError
+        If checksum_flag is 'True' and the SHA256 digest of the downloaded file differs from file_checksum.
+        The downloaded file is left in place.
+
+    Errors raised while creating the directory or downloading the file are not caught here.
+    """
+    # Reject inconsistent arguments first, so that a usage error costs neither a download
+    # nor a directory creation.
+    if checksum_flag and file_checksum is None:
+        raise ValueError("The file checksum must be provided - different from None - for the check to be performed.")
+
+    # exist_ok avoids the race between testing for the directory and creating it
+    makedirs(dirname, exist_ok=True)
+
+    file_path = join(dirname, filename)
+
+    urlretrieve(url, file_path)
+
+    if checksum_flag:
+        checksum = _checksum_sha256(file_path)
+        if file_checksum != checksum:
+            raise IOError("{} has a SHA256 checksum : {}, "
+                          "different from expected : {}. "
+                          "The file may be corrupted or the given url may be wrong !".format(file_path, checksum,
+                                                                                             file_checksum))
+
+    return file_path
diff --git a/src/python/test/test_remote_datasets.py b/src/python/test/test_remote_datasets.py
new file mode 100644
index 00000000..c4e752a7
--- /dev/null
+++ b/src/python/test/test_remote_datasets.py
@@ -0,0 +1,22 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Hind Montassif
+#
+# Copyright (C) 2021 Inria
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+
+from gudhi.datasets import remote
+
+def test_fetch_remote_datasets():
+    """
+    Download two datasets with remote.fetch, without and then with SHA256 verification,
+    and check the returned path each time. This test needs network access.
+    """
+    # Imported locally: the expected path is built the same way fetch builds it,
+    # so the comparison does not depend on the path separator of the platform.
+    from os.path import join
+
+    base_url = "https://raw.githubusercontent.com/Hind-M/gudhi-data/main/"
+    # (file name, expected SHA256 hex digest of the file content)
+    datasets = (
+        ("spiral_2d.csv", '37530355d980d957c4ec06b18c775f90a91e446107d06c6201c9b4000b077f38'),
+        ("sphere3D_pts_on_grid.off", '32f96d2cafb1177f0dd5e0a019b6ff5658e14a619a7815ae55ad0fc5e8bd3f88'),
+    )
+
+    # Test files download from given urls
+    for filename, _ in datasets:
+        assert join("remote_datasets", filename) == remote.fetch(base_url + filename, filename)
+
+    # Test files download with checksums provided
+    for filename, sha256 in datasets:
+        assert join("remote_datasets", filename) == remote.fetch(base_url + filename, filename,
+                                                                 checksum_flag = True, file_checksum = sha256)