"""MegaScraper: command-line tool to download images from a single website.
MegaScraper allows to recursively download all images from a given
website starting from a single URL (i.e., the seed). The user can
provide regexes for both the pages to download images from and
for the images URLs, so that only the relevant resources are actually
downloaded.
Additionally, the user can provide min width and height for the
images, and decide on how the output folder should be organized.
Examples:
>>> python mega_scraper.py -s https://mugshots.com/ -mp 2 -hm 10
"""

import argparse
import os
import re
import urllib.parse

import bs4
import PIL.Image
import requests
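
# An empty pattern matches every string, so by default the following
# regexes filter nothing out.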
_DEFAULT_REGEX_PAGES = r''
_DEFAULT_REGEX_IMAGES = r''
_DEFAULT_MIN_WIDTH = 0
_DEFAULT_MIN_HEIGHT = 0
_DEFAULT_OUTPUT_FOLDERPATH = 'scraped'
_DEFAULT_OUTPUT_STRUCTURE = 'flat'
_DEFAULT_OUTPUT_NAMING = 'keep'
_DEFAULT_IMAGES_PER_FOLDER = 100
_DEFAULT_FOLDER_INITIAL_NUM = 1
_DEFAULT_MAX_PAGES = 99999
_DEFAULT_HOW_MANY = 99999 # Number of images to download per call to download().
_OUTPUT_STRUCTURE_VALUES = ('flat', 'grouped')
_OUTPUT_NAMING_VALUES = ('keep', 'numerical')


class MegaScraper(object):
    """Scraper that downloads images from a website iteratively."""
    def __init__(
            self,
            seed: str,
            regex_pages: str = _DEFAULT_REGEX_PAGES,
            regex_images: str = _DEFAULT_REGEX_IMAGES,
            min_width: int = _DEFAULT_MIN_WIDTH,
            min_height: int = _DEFAULT_MIN_HEIGHT,
            output_folderpath: str = _DEFAULT_OUTPUT_FOLDERPATH,
            output_structure: str = _DEFAULT_OUTPUT_STRUCTURE,
            output_naming: str = _DEFAULT_OUTPUT_NAMING,
            images_per_folder: int = _DEFAULT_IMAGES_PER_FOLDER,
            folder_initial_num: int = _DEFAULT_FOLDER_INITIAL_NUM):
"""Constructor for MegaScraper.
Args:
seed: the URL to start the navigation from. For MegaScraper
to work effectively, all URLs on the site must be
reachable statrinf from the seed.
regex_pages: only pages that match this regex will be
downloaded resources from. Note that pages that don't
match this regex will still be scraped for URLs.
regex_images: only images URLs that match this regex
will be considered.
min_width: minimum width for an image to be downloaded.
max_height: minimum height for an image to be downloaded.
output_folderpath: the folder where to download the images.
output_structure: one between ('flat', 'grouped'). If 'flat',
then all the images are downloaded in the same folder. If
'grouped', then the images are grouped into sub-folders.
output_naming: one between ('keep', 'numerical'). If 'keep',
then the original name of the file is used. If 'numerical',
then images are renamed '1.jpg', '2.jpg', etc.
images_per_folder: if output_structure is 'grouped', then this
parameter dictates how many images should be grouped
in a single folder.
folder_initial_num: if output_structure is 'grouped', then
this paramter dictates the number-name of the first folder.
The numbers are always padded to have length 4.
"""
        assert output_structure in _OUTPUT_STRUCTURE_VALUES
        assert output_naming in _OUTPUT_NAMING_VALUES
        assert images_per_folder > 0
        assert folder_initial_num > 0
        self._seed = seed
        self._parsed_seed = urllib.parse.urlparse(seed)
        self._root = '{u.scheme}://{u.netloc}'.format(u=self._parsed_seed)
        print('--The root of the seed appears to be: {}'.format(self._root))
        self._regex_pages = regex_pages
        self._regex_images = regex_images
        self._min_width = min_width
        self._min_height = min_height
        self._output_folderpath = output_folderpath
        self._output_structure = output_structure
        self._output_naming = output_naming
        self._images_per_folder = images_per_folder
        self._folder_initial_num = folder_initial_num
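        # Crawl bookkeeping: pages already visited, pages discovered but
        # not yet fetched, image URLs found so far, and image URLs that
        # have already been handled by download().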
        self._explored = set()
        self._unexplored = set([seed])
        self._images_urls = set()
        self._downloaded = set()
        self._downloaded_idx = 1

    def _extract_unexplored_pages(self, soup: bs4.BeautifulSoup) -> set:
        """Extracts all unexplored pages (URLs) from a soup.

        Args:
            soup: the BeautifulSoup to extract pages from.

        Returns:
            A set of strings, each being the URL of an unexplored page.
        """
        unexplored_pages = set()
        elements = soup.find_all('a')
        for element in elements:
            href = element.get('href')
            if href:
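                # Root-relative links (e.g. '/page') are resolved
                # against the scheme://netloc of the seed.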
                if href.startswith('/'):
                    href = self._root + href
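                # Keep only same-domain pages that haven't been
                # explored yet.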
                href_netloc = urllib.parse.urlparse(href).netloc
                if (href not in self._explored
                        and href_netloc == self._parsed_seed.netloc):
                    unexplored_pages.add(href)
        return unexplored_pages

    def _extract_images_urls(self, soup: bs4.BeautifulSoup) -> set:
        """Extracts all image URLs from a soup.

        Args:
            soup: the BeautifulSoup to extract image URLs from.

        Returns:
            A set of strings, each being the URL of an image.
        """
        images_urls = set()
        elements = soup.find_all('img')
        for element in elements:
            src = element.get('src')
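            # GIFs are skipped outright, presumably because they are
            # mostly animations or layout spacers rather than photos.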
            if src and not src.endswith('.gif'):
                if src.startswith('/'):
                    src = self._root + src
                if re.search(self._regex_images, src):
                    images_urls.add(src)
        return images_urls

    def scrape(self, max_pages: int = _DEFAULT_MAX_PAGES) -> set:
        """Scrapes all image URLs from unexplored pages.

        Note that this method only scrapes image URLs; it doesn't
        download the images themselves. After calling scrape(),
        download() must be used to actually download the resources.
        There is no need to store the output of scrape(), as the image
        URLs are saved within the object itself.

        Args:
            max_pages: the maximum number of pages to scrape URLs from.

        Returns:
            A set of strings, each being the URL of an image. If scrape()
            is called twice, the sets are going to be different, as
            MegaScraper() memorizes which URLs have been returned before.
        """
        assert isinstance(max_pages, int)
        assert max_pages >= 1
        num_pages = 0
        images_urls = set()
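        # Pop pages from the frontier in arbitrary order (sets are
        # unordered), collecting new links and, for matching pages,
        # image URLs.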
        while self._unexplored:
            url = self._unexplored.pop()
            print('--Processing the following URL: {}'.format(url))
            # The 30s timeout is an arbitrary safeguard against requests
            # that would otherwise hang forever.
            html = requests.get(url, timeout=30).content
            soup = bs4.BeautifulSoup(html, features='html.parser')
            self._unexplored.update(self._extract_unexplored_pages(soup))
            if re.search(self._regex_pages, url):
                images_urls.update(self._extract_images_urls(soup))
            self._explored.add(url)
            num_pages += 1
            if num_pages == max_pages:
                break
        new_images_urls = images_urls.difference(self._images_urls)
        self._images_urls.update(images_urls)
        return new_images_urls

    def download(self, how_many: int = _DEFAULT_HOW_MANY) -> None:
        """Downloads the images from the URLs scraped via scrape().

        Args:
            how_many: how many images to download.
        """
        assert isinstance(how_many, int)
        if not os.path.exists(self._output_folderpath):
            os.makedirs(self._output_folderpath)
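        # Each iteration consumes one scraped URL, so images rejected
        # for their size still count against how_many.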
        for _ in range(how_many):
            images_to_download = self._images_urls.difference(self._downloaded)
            if not images_to_download:
                break
            image_url = images_to_download.pop()
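            # stream=True exposes the raw response body so PIL can decode
            # it directly; the try/except below keeps a dead link or a
            # corrupt file from aborting the whole run.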
            try:
                image = PIL.Image.open(
                    requests.get(image_url, stream=True, timeout=30).raw)
            except (requests.RequestException, OSError) as error:
                raw_message = '--The following image could not be fetched: {} ({})'
                print(raw_message.format(image_url, error))
                self._downloaded.add(image_url)
                continue
            self._downloaded.add(image_url)
            if image.width >= self._min_width and image.height >= self._min_height:
                # Decides the name of the downloaded image.
                if self._output_naming == 'keep':
                    # Use the last path segment, dropping any query string.
                    image_filename = os.path.basename(
                        urllib.parse.urlparse(image_url).path)
                elif self._output_naming == 'numerical':
                    image_filename = str(self._downloaded_idx) + '.jpg'
                    # JPEG can't store alpha or palettes, so convert
                    # e.g. RGBA PNGs before saving with a .jpg extension.
                    if image.mode not in ('RGB', 'L'):
                        image = image.convert('RGB')
                # Decides the folder structure (if any) of the output.
                if self._output_structure == 'flat':
                    image_filepath = os.path.join(
                        self._output_folderpath, image_filename)
                elif self._output_structure == 'grouped':
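                    # Integer-divide the running image index so that every
                    # images_per_folder images share one folder, offset by
                    # folder_initial_num and zero-padded to 4 digits.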
                    image_foldername = str(
                        (self._downloaded_idx - 1) // self._images_per_folder
                        + self._folder_initial_num).zfill(4)
                    image_folderpath = os.path.join(
                        self._output_folderpath, image_foldername)
                    if not os.path.exists(image_folderpath):
                        os.makedirs(image_folderpath)
                    image_filepath = os.path.join(image_folderpath, image_filename)
                # Finally, saves the image.
                image.save(image_filepath)
                self._downloaded_idx += 1
                raw_message = '--The following image was downloaded successfully: {}'
                print(raw_message.format(image_url))
            else:
                raw_message = '--The following image was skipped because of its size: {}'
                print(raw_message.format(image_url))


def parse_args() -> argparse.Namespace:
    """Parses the arguments provided.

    Only relevant if MegaScraper is used via the command-line interface.
    If you're importing mega_scraper.py to use it as a library, feel
    free to ignore this function.

    For a description of the args, please see the docstring for
    MegaScraper().

    Returns:
        The args as a dict-like structure (Namespace).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', '-s', type=str, required=True)
    parser.add_argument('--regex_pages', '-rp', type=str,
                        default=_DEFAULT_REGEX_PAGES)
    parser.add_argument('--regex_images', '-ri', type=str,
                        default=_DEFAULT_REGEX_IMAGES)
    parser.add_argument('--min_width', '-mw', type=int,
                        default=_DEFAULT_MIN_WIDTH)
    parser.add_argument('--min_height', '-mh', type=int,
                        default=_DEFAULT_MIN_HEIGHT)
    parser.add_argument('--output_folderpath', '-of', type=str,
                        default=_DEFAULT_OUTPUT_FOLDERPATH)
    parser.add_argument('--output_structure', '-os', type=str,
                        default=_DEFAULT_OUTPUT_STRUCTURE,
                        choices=_OUTPUT_STRUCTURE_VALUES)
    parser.add_argument('--output_naming', '-on', type=str,
                        default=_DEFAULT_OUTPUT_NAMING,
                        choices=_OUTPUT_NAMING_VALUES)
    parser.add_argument('--images_per_folder', '-if', type=int,
                        default=_DEFAULT_IMAGES_PER_FOLDER)
    parser.add_argument('--folder_initial_num', '-fn', type=int,
                        default=_DEFAULT_FOLDER_INITIAL_NUM)
    parser.add_argument('--max_pages', '-mp', type=int,
                        default=_DEFAULT_MAX_PAGES)
    parser.add_argument('--how_many', '-hm', type=int,
                        default=_DEFAULT_HOW_MANY)
    return parser.parse_args()


def main():
    """Main for MegaScraper."""
    args = parse_args()
    scraper = MegaScraper(
        seed=args.seed,
        regex_pages=args.regex_pages,
        regex_images=args.regex_images,
        min_width=args.min_width,
        min_height=args.min_height,
        output_folderpath=args.output_folderpath,
        output_structure=args.output_structure,
        output_naming=args.output_naming,
        images_per_folder=args.images_per_folder,
        folder_initial_num=args.folder_initial_num)
    scraper.scrape(args.max_pages)
    scraper.download(args.how_many)


if __name__ == '__main__':
    main()