summaryrefslogtreecommitdiff
path: root/src/python/gudhi/datasets/remote.py
blob: b266467dc32b6176d24782a2c68eaf06e77bb604 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
# Author(s):       Hind Montassif
#
# Copyright (C) 2021 Inria
#
# Modification(s):
#   - YYYY/MM Author: Description of the modification

import hashlib

from os import makedirs, remove
from os.path import join, exists

from urllib.request import urlretrieve


def _checksum_sha256(file_path):
    """
    Compute the file checksum using sha256

    Parameters
    ----------
    file_path: string
        Full path of the created file.

    Returns
    -------
        The hex digest of file_path
    """
    sha256_hash = hashlib.sha256()
    chunk_size = 4096
    with open(file_path,"rb") as f:
        # Read and update hash string value in blocks of 4K
        while True:
            buffer = f.read(chunk_size)
            if not buffer:
                break
            sha256_hash.update(buffer)
    return sha256_hash.hexdigest()

def fetch(url, filename, dirname = "remote_datasets", file_checksum = None):
    """
    Fetch the wanted dataset from the given url and save it in file_path

    Parameters
    ----------
    url : string
        The url to fetch the dataset from.
    filename : string
        The name to give to downloaded file.
    dirname : string
        The directory to save the file to. Default is "remote_datasets".
    file_checksum : string
        The file checksum using sha256 to check against the one computed on the downloaded file.
        Default is 'None'.

    Returns
    -------
    file_path: string
        Full path of the created file.

    Raises
    ------
    IOError
        If `file_checksum` is given and does not match the checksum of the
        downloaded file. The corrupted file is removed before raising.
    """

    file_path = join(dirname, filename)

    # Check that an existing file does not already exist at file_path
    # (an already-present file is trusted and its checksum is NOT re-verified)
    if not exists(file_path):
        # exist_ok avoids a race when several processes create the directory
        # between an exists() check and makedirs()
        makedirs(dirname, exist_ok=True)

        urlretrieve(url, file_path)

        if file_checksum is not None:
            checksum = _checksum_sha256(file_path)
            if file_checksum != checksum:
                # Remove the corrupted download: otherwise the next call would
                # find the file at file_path and return it without verification
                remove(file_path)
                raise IOError("{} has a SHA256 checksum : {}, "
                            "different from expected : {}."
                            "The file may be corrupted or the given url may be wrong !".format(file_path, checksum,
                                                        file_checksum))

    return file_path

def fetch_spiral_2d(filename = "spiral_2d.csv", dirname = "remote_datasets"):
    """
    Fetch spiral_2d.csv remotely

    Parameters
    ----------
    filename : string
        The name to give to downloaded file. Default is "spiral_2d.csv"
    dirname : string
        The directory to save the file to. Default is "remote_datasets".

    Returns
    -------
    file_path: string
        Full path of the created file.
    """
    # Canonical location of the dataset in the GUDHI data repository,
    # together with the expected sha256 of its content.
    spiral_url = "https://raw.githubusercontent.com/GUDHI/gudhi-data/main/points/spiral_2d.csv"
    spiral_checksum = '37530355d980d957c4ec06b18c775f90a91e446107d06c6201c9b4000b077f38'
    return fetch(spiral_url, filename, dirname, spiral_checksum)