diff options
author | Hind-M <hind.montassif@gmail.com> | 2021-06-01 18:39:31 +0200 |
---|---|---|
committer | Hind-M <hind.montassif@gmail.com> | 2021-06-01 18:39:31 +0200 |
commit | f0c12fbdce04d09bf13b141d549e5e385c64caad (patch) | |
tree | cd7d11539462a1c93c0af82c8a61bc1fcd58bc4b /src/python/gudhi/datasets/remote.py | |
parent | 07b103c924ff57de1f5aea354186a5392bfc01e2 (diff) |
First version that allows fetching remote datasets
Diffstat (limited to 'src/python/gudhi/datasets/remote.py')
-rw-r--r-- | src/python/gudhi/datasets/remote.py | 85 |
1 files changed, 85 insertions, 0 deletions
# diff --git a/src/python/gudhi/datasets/remote.py b/src/python/gudhi/datasets/remote.py
# new file mode 100644 index 00000000..27076785 --- /dev/null +++ b/src/python/gudhi/datasets/remote.py @@ -0,0 +1,85 @@
# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
# Author(s):       Hind Montassif
#
# Copyright (C) 2021 Inria
#
# Modification(s):
#   - YYYY/MM Author: Description of the modification

import hashlib

from os.path import join, exists
from os import makedirs, remove

from urllib.request import urlretrieve


def _checksum_sha256(file_path):
    """
    Compute the file checksum using sha256

    Parameters
    ----------
    file_path: string
        Full path of the file to hash.

    Returns
    -------
    The hex digest (lowercase hexadecimal string) of the content of file_path
    """
    sha256_hash = hashlib.sha256()
    chunk_size = 4096
    with open(file_path, "rb") as f:
        # Read and update the hash in blocks of 4K so that the whole file
        # never has to be held in memory; read() returns b"" at end of file.
        for buffer in iter(lambda: f.read(chunk_size), b""):
            sha256_hash.update(buffer)
    return sha256_hash.hexdigest()


def fetch(url, filename, dirname = "remote_datasets", checksum_flag = False, file_checksum = None):
    """
    Fetch the wanted dataset from the given url and save it in file_path

    Parameters
    ----------
    url : string
        The url to fetch the dataset from
    filename : string
        The name of the file to create inside dirname
    dirname : string
        The directory to save the file to. It is created (with its parents) if it does not exist.
        An empty string means the current working directory.
        Default is 'remote_datasets'.
    checksum_flag : boolean
        To set if the user wants the file checksum. Default is 'False'.
        Note that if checksum_flag is set to 'True', the file_checksum must be provided.
    file_checksum : string
        The file checksum using sha256 to check against the one computed on the downloaded file.
        The comparison ignores letter case and surrounding whitespace.
        To be considered, checksum_flag must be set to 'True'.
        Default is 'None'.

    Returns
    -------
    file_path: string
        Full path of the created file.

    Raises
    ------
    ValueError
        If checksum_flag is 'True' and file_checksum is 'None'. Raised before anything is downloaded.
    IOError
        If the checksum of the downloaded file differs from file_checksum.
        The downloaded file is removed in that case.
    """
    # Validate the arguments first: there is no point in downloading a file
    # whose verification is requested but cannot be performed.
    if checksum_flag and file_checksum is None:
        raise ValueError("The file checksum must be provided - different from None - for the check to be performed.")

    # exist_ok avoids the race between an existence test and the creation.
    # makedirs("") raises, and an empty dirname simply means "current directory".
    if dirname:
        makedirs(dirname, exist_ok=True)

    file_path = join(dirname, filename)

    urlretrieve(url, file_path)

    if checksum_flag:
        checksum = _checksum_sha256(file_path)
        # hexdigest() is always lowercase: normalise the expected value so that
        # an upper-case or newline-terminated checksum is not rejected.
        expected = str(file_checksum).strip().lower()
        if expected != checksum:
            # Do not leave a file that failed verification on disk, where it
            # could later be mistaken for a valid dataset.
            if exists(file_path):
                remove(file_path)
            raise IOError("{} has a SHA256 checksum : {}, "
                          "different from expected : {}. "
                          "The file may be corrupted or the given url may be wrong !".format(file_path, checksum,
                                                                                             file_checksum))

    return file_path