-
Notifications
You must be signed in to change notification settings - Fork 32
/
url_utils.py
115 lines (83 loc) · 3.54 KB
/
url_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#
# url_utils.py
#
# Frequently-used functions for downloading or manipulating URLs
#
#%% Imports
import re
import urllib
import os
import tempfile
from urllib.parse import urlparse
# pip install progressbar2
import progressbar
# Cached path of the temp folder used for downloads; None until the first call
# to get_temp_folder(), which populates it.
ai4e_utils_temp_dir = None

# Length limit (in characters) that download_url() applies to the temp folder path
# plus the auto-generated filename, truncating the filename if necessary.
max_path_len = 255
#%% Download functions
class DownloadProgressBar():
    """
    Callable that renders a console progress bar; intended to be passed as the
    report hook of urllib.request.urlretrieve, which calls it with
    (block_num, block_size, total_size).

    https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
    """

    def __init__(self):
        # The bar is created lazily on the first callback, because the total
        # size is only known at that point.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        """
        Update the bar after a block has been transferred.

        Args:
            block_num: number of blocks transferred so far
            block_size: size of each block, in bytes
            total_size: total size of the download in bytes; a negative value
                (or None) means the size is unknown
        """
        # urlretrieve reports -1 when the server does not provide a size
        size_known = (total_size is not None) and (total_size >= 0)

        if self.pbar is None:
            max_value = total_size if size_known else progressbar.UnknownLength
            self.pbar = progressbar.ProgressBar(max_value=max_value)
            self.pbar.start()

        downloaded = block_num * block_size

        if size_known and downloaded >= total_size:
            self.pbar.finish()
        else:
            # With an unknown total we can never tell that we are done, so we
            # just keep reporting the running byte count.
            self.pbar.update(downloaded)
def get_temp_folder(preferred_name='ai4eutils'):
    """
    Return the folder used for temporary downloads, creating it if necessary.

    The folder is chosen on the first call (a subfolder of the system temp
    directory named [preferred_name]) and cached in the module-level variable
    ai4e_utils_temp_dir; on later calls the cached folder is returned and
    [preferred_name] is ignored.

    Args:
        preferred_name: name of the subfolder to create within the system temp
            directory; only used on the first call

    Returns:
        The path to the temp folder.
    """
    global ai4e_utils_temp_dir

    if ai4e_utils_temp_dir is None:
        ai4e_utils_temp_dir = os.path.join(tempfile.gettempdir(), preferred_name)

    # Always make sure the folder exists, not just on the first call: the cached
    # folder may have been removed since it was created.
    os.makedirs(ai4e_utils_temp_dir, exist_ok=True)

    return ai4e_utils_temp_dir
def download_url(url, destination_filename=None, progress_updater=None,
                 force_download=False, verbose=True):
    """
    Download a URL to a file.  If no file is specified, creates a temporary file,
    with a semi-best-effort to avoid filename collisions.

    Prints some diagnostic information and makes sure to omit SAS tokens from printouts.

    Args:
        url: the URL to download; everything from the first '?' onward is
            treated as a SAS token / query string, and is omitted from printouts
            and from the auto-generated filename
        destination_filename: the file to write to; if None, a filename derived
            from the URL is used, within the folder returned by get_temp_folder().
            Missing parent folders are created.
        progress_updater: None or False for no progress reporting, True to use
            a DownloadProgressBar, or a callable accepting
            (block_num, block_size, total_size)
        force_download: if False, an existing destination file is re-used
            rather than downloaded again
        verbose: whether to print diagnostic information

    Returns:
        The filename to which the URL was downloaded (or at which it was
        already present).
    """
    # "import urllib" by itself does not guarantee that the "request" submodule
    # has been loaded, so import it explicitly before using it.
    import urllib.request

    # Map True/False to a default progress bar / no progress reporting
    if isinstance(progress_updater, bool):
        progress_updater = DownloadProgressBar() if progress_updater else None

    url_no_sas = url.split('?')[0]

    if destination_filename is None:

        target_folder = get_temp_folder()

        # This does not guarantee uniqueness, hence "semi-best-effort"
        url_as_filename = re.sub(r'\W+', '', url_no_sas)

        # Keep the trailing characters when truncating.  Note that the path
        # separator added by os.path.join is not counted here.
        n_folder_chars = len(target_folder)
        if len(url_as_filename) + n_folder_chars > max_path_len:
            print('Warning: truncating filename target to {} characters'.format(max_path_len))
            url_as_filename = url_as_filename[-1*(max_path_len-n_folder_chars):]

        destination_filename = os.path.join(target_folder, url_as_filename)

    if (not force_download) and os.path.isfile(destination_filename):

        if verbose:
            print('Bypassing download of already-downloaded file {}'.format(
                os.path.basename(url_no_sas)))

    else:

        if verbose:
            print('Downloading file {} to {}'.format(
                os.path.basename(url_no_sas), destination_filename), end='')

        # urlretrieve does not create missing folders
        target_dir = os.path.dirname(destination_filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        urllib.request.urlretrieve(url, destination_filename, progress_updater)
        assert os.path.isfile(destination_filename)

        n_bytes = os.path.getsize(destination_filename)
        if verbose:
            print('...done, {} bytes.'.format(n_bytes))

    return destination_filename
def download_relative_filename(url, output_base, verbose=False):
    """
    Download a URL to output_base, preserving relative path

    For example, downloading "https://host/a/b/c.jpg" with output_base "/data"
    writes to "/data/a/b/c.jpg".

    Args:
        url: the URL to download; its path component must start with '/'
        output_base: the folder under which the URL's path is re-created
        verbose: passed through to download_url

    Returns:
        The filename to which the URL was downloaded.
    """
    p = urlparse(url)

    # remove the leading '/'
    assert p.path.startswith('/'), 'URL has no path component: {}'.format(url.split('?')[0])
    relative_filename = p.path[1:]

    destination_filename = os.path.join(output_base, relative_filename)

    # The relative path may contain folders that do not exist yet under output_base
    target_dir = os.path.dirname(destination_filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    download_url(url, destination_filename, verbose=verbose)

    return destination_filename