summaryrefslogtreecommitdiff
path: root/src/python/gudhi/datasets
diff options
context:
space:
mode:
authorHind-M <hind.montassif@gmail.com>2021-06-01 18:39:31 +0200
committerHind-M <hind.montassif@gmail.com>2021-06-01 18:39:31 +0200
commitf0c12fbdce04d09bf13b141d549e5e385c64caad (patch)
treecd7d11539462a1c93c0af82c8a61bc1fcd58bc4b /src/python/gudhi/datasets
parent07b103c924ff57de1f5aea354186a5392bfc01e2 (diff)
First version allowing to fetch remote datasets
Diffstat (limited to 'src/python/gudhi/datasets')
-rw-r--r--src/python/gudhi/datasets/remote.py85
1 files changed, 85 insertions, 0 deletions
diff --git a/src/python/gudhi/datasets/remote.py b/src/python/gudhi/datasets/remote.py
new file mode 100644
index 00000000..27076785
--- /dev/null
+++ b/src/python/gudhi/datasets/remote.py
@@ -0,0 +1,85 @@
+# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
+# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
+# Author(s): Hind Montassif
+#
+# Copyright (C) 2021 Inria
+#
+# Modification(s):
+# - YYYY/MM Author: Description of the modification
+
+import hashlib
+
+from os.path import join, exists
+from os import makedirs
+
+from urllib.request import urlretrieve
+
+
def _checksum_sha256(file_path):
    """
    Return the SHA256 hex digest of the content of a file.

    Parameters
    ----------
    file_path: string
        Full path of the file to hash.

    Returns
    -------
    The hexadecimal SHA256 digest of the content of file_path, as a string.
    """
    block_size = 4096
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # Feed the hash 4K at a time so the file is never loaded in memory as a whole.
        # iter() stops on the empty bytes object that read() returns at end of file.
        for block in iter(lambda: stream.read(block_size), b""):
            digest.update(block)
    return digest.hexdigest()
+
def fetch(url, filename, dirname="remote_datasets", checksum_flag=False, file_checksum=None):
    """
    Fetch the wanted dataset from the given url and save it as dirname/filename.

    Parameters
    ----------
    url : string
        The url to fetch the dataset from.
    filename : string
        The name under which the downloaded file is saved inside dirname.
    dirname : string
        The directory to save the file to. It is created (with any missing parent)
        if it does not exist yet. Default is 'remote_datasets'.
    checksum_flag : boolean
        To set if the user wants the file checksum to be verified. Default is 'False'.
        Note that if checksum_flag is set to 'True', the file_checksum must be provided.
    file_checksum : string
        The expected sha256 checksum (hexadecimal, compared case-insensitively) to check
        against the one computed on the downloaded file.
        To be considered, checksum_flag must be set to 'True'.
        Default is 'None'.

    Returns
    -------
    file_path: string
        Full path of the created file.

    Raises
    ------
    ValueError
        If checksum_flag is 'True' while file_checksum is 'None'. This is detected
        before any directory is created or any data is downloaded.
    IOError
        If the sha256 checksum of the downloaded file differs from file_checksum.
        The downloaded file is left in place in that case.
    """
    # Validate the arguments before doing any I/O, so that an invalid call does not
    # cost a download nor leave an unchecked file behind.
    if checksum_flag and file_checksum is None:
        raise ValueError("The file checksum must be provided - different from None - for the check to be performed.")

    # exist_ok avoids the race between testing for the directory and creating it.
    makedirs(dirname, exist_ok=True)

    file_path = join(dirname, filename)

    urlretrieve(url, file_path)

    if checksum_flag:
        checksum = _checksum_sha256(file_path)
        # hexdigest() is always lowercase: normalise the expected value so that an
        # uppercase digest is not reported as a mismatch.
        if file_checksum.lower() != checksum:
            raise IOError("{} has a SHA256 checksum : {}, "
                          "different from expected : {}. "
                          "The file may be corrupted or the given url may be wrong !".format(file_path, checksum,
                                                                                             file_checksum))

    return file_path