# band_page_scraper.py
# Scrape each band's page (plus full comment, recommendations, and
# discography) from metal-archives into an existing sqlite3 database.
import argparse, os, re, collections
import sqlite3 as lite
from pprint import pprint
import logging
logger = logging.getLogger(__name__)
import bs4, tqdm
from baseScraper import BaseScraper
from utils import *
# Flags mirroring the section headers of a metal-archives members table
# (Current, Current (Live), Past, ...).  All six fields are booleans.
# `defaults` lets callers name only the flags they need, e.g.
# MemberStatus(current=True), while remaining backward-compatible with the
# existing positional six-argument construction.
MemberStatus = collections.namedtuple(
    'MemberStatus',
    ('current',
     'current_live',
     'past',
     'past_live',
     'last_known',
     'last_known_live',
     ),
    defaults=(False,) * 6,
)
class BandPageScraper(BaseScraper):
def __init__(self,
database_filename,
only_if_not_scraped=False,
offset=0,
limit=-1,
reviews_gt=0,
order_by_reviews=False,
order_by_insert_date=False,
skip_band_page=False,
skip_full_comment=False,
skip_recommendations=False,
skip_discography=False,
no_store=False):
"""
Params:
database - the sqlit3 database, already populated with basic band info
only_if_not_scraped - only scrape pages that haven't been previously scraped
limit - only scrape this many pages, then exit
offset - start `offset` rows into the band query
reviews_gt - scrape the band page only if the band has strictly greater than
reviews_gt reviews
order_by_reviews - scrape pages in order of decreasing number of reviews,
so that we scrape the more popular bands first. Default is
whatever the optimizer chooses (probably by band_id, but
that's not guaranteed).
order_by_insert_date - scrape pages in order of increasing insert_date (oldest first)
skip_band_page - skip requesting the band's page; also enables skip_full_comment
skip_full_comment - skip requesting the band's full comment/read more text
skip_recommendations - skip requesting the band's recommended/similar bands
skip_discography - skip requesting the band's discography
no_store - don't actually store anything in the database, but still do all the
requests and parsing
TODO
update - deprecate only_if_not_scraped and instead make default behavior to
get pages that haven't been gotten before, and then update pages in order
of delta between insert date and last modified date? Or just insert date?
Something. Maybe make another column in the database?
Do update if insert_date <= some date. Gotta do this query for each thing
that page requested.
How does --limit work with individual queries? Maybe make the query be
an OR of a bunch of things and limit that? band page needs updated OR comment
OR recommendations OR discography? Then limit would be the same for all,
and if something needs many pages it'll get them all? Dunno.. perhaps
this should really be a band_page_scraper, full_comment_scraper,
recommendations_scraper, and discography_scraper.
"""
super().__init__()
if not os.path.isfile(database_filename):
raise ValueError("database file {} doesn't exist".format(database_filename))
self.database_filename = database_filename
self.only_if_not_scraped = bool(only_if_not_scraped)
self.limit = int(limit)
self.offset = int(offset)
self.reviews_gt = int(reviews_gt)
self.order_by_reviews = bool(order_by_reviews)
self.order_by_insert_date = bool(order_by_insert_date)
self.skip_band_page = bool(skip_band_page)
self.skip_full_comment = bool(skip_full_comment) or self.skip_band_page
self.skip_recommendations = bool(skip_recommendations)
self.skip_discography = bool(skip_discography)
self.no_store = bool(no_store)
self.soup_features = 'html5lib'
self.date_re = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
self.added_on_re = re.compile(r'Added on: (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})')
self.modified_on_re = re.compile(r'Last modified on: (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})')
def run(self):
"""
Call .getBandPage() for each band found in the query
"""
with lite.connect(self.database_filename, isolation_level='IMMEDIATE') as self.connection:
# How many do we need to do?
if self.only_if_not_scraped:
n_do_query = 'select count(band_id) from Bands where modified_date is null'
if self.reviews_gt >= 0:
n_do_query += f' and (select count(*) from Reviews where Reviews.band_id=Bands.band_id) > {self.reviews_gt}'
else:
n_do_query = 'select count(band_id) from Bands'
if self.reviews_gt >= 0:
n_do_query += f' where (select count(*) from Reviews where Reviews.band_id=Bands.band_id) > {self.reviews_gt}'
n_queried = self.connection.execute(n_do_query).fetchall()[0][0]
if self.limit >= 0:
logger.debug('Invoking limit of %d pages', self.limit)
n_do = min(n_queried, self.limit)
else:
n_do = n_queried
if self.offset > 0:
logger.debug('Invoking offset of %d pages', self.offset)
if self.offset + n_do > n_queried:
n_do = max(0, n_queried - self.offset)
logger.info('Gonna scrape %d band pages', n_do)
# Okay, now do the stuff
if self.only_if_not_scraped:
query = 'select band_id,band_url from Bands where modified_date is null'
if self.reviews_gt >= 0:
query += f' and (select count(*) from Reviews where Reviews.band_id=Bands.band_id) > {self.reviews_gt}'
else:
query = 'select band_id,band_url from Bands'
if self.reviews_gt >= 0:
query += f' where (select count(*) from Reviews where Reviews.band_id=Bands.band_id) > {self.reviews_gt}'
if self.order_by_reviews:
query += ' order by (select count(*) from Reviews where Reviews.band_id=Bands.band_id) desc'
elif self.order_by_insert_date:
query += ' order by insert_date asc'
#TODO JMF 10 Mar 2019: this should use lite's parameter substitution
if self.limit >= 0:
query += ' limit {}'.format(self.limit)
if self.offset >= 0:
if self.limit < 0:
query += ' limit {}'.format(n_queried)
query += ' offset {}'.format(self.offset)
logger.debug('query = %s', repr(query))
for band_id,band_url in tqdm.tqdm(self.connection.execute(query), total=n_do):
if self.order_by_reviews:
num_reviews = self.connection.execute('select count(*) from Reviews where band_id=?', (band_id,)).fetchall()[0][0]
logger.debug('num_reviews = %d', num_reviews)
# Request and scrape the band's page
if not self.skip_band_page:
band_dict, artist_dict_list, band_lineup_dict_list, label_dict =\
self.getBandPage(band_id, band_url)
else:
band_dict = {}
artist_dict_list = []
band_lineup_dict_list = []
label_dict = {}
# Request the full band comment/read more text
if not self.skip_full_comment:
comment_body = self.getBandsFullComment(band_id)
if not band_dict: # if we didn't get the band page
band_dict['band_id'] = band_id
band_dict['comment'] = comment_body
# Get similar bands
if not self.skip_recommendations:
similar_band_dict_list = self.getSimilarBands(band_id)
else:
similar_band_dict_list = []
# Get discography
if not self.skip_discography:
album_dict_list = self.getBandsDiscography(band_id)
else:
album_dict_list = []
# Store in database
if not self.no_store:
self.storeInDatabase(band_id,
band_dict=band_dict,
artist_dicts=artist_dict_list,
bandlineup_dicts=band_lineup_dict_list,
label_dict=label_dict,
similarity_dicts=similar_band_dict_list,
album_dicts=album_dict_list)
self.finalDatabaseStuff()
self.close()
def getBandPage(self, band_id, band_url):
"""
GET the band page from metal-archives and parse it using bs4.
This returns dicts and lists of dicts suitable for inserting into
the Bands, Artists, BandLineup, and Labels tables of the database.
Though they appear on the band's page, the following each require a
separate request:
- getting the full band comment
- getting the (full) album list (aka discography)
- getting similar bands/recommendations
"""
# dict mapping table name to a dict (or list of dicts) of column: data.
# We'll populate this dict first, and then store all the data at once,
# storing the modified_date last, since that is the sentinel for a fully
# scraped band page. Since we're gonna be pausing between requests, speed
# isn't a concern so we'll just store one band scrape at a time.
#store_in_db = {}
# get the band page
logger.debug('GET band page for band_id=%d', band_id)
response = self.sessionGet(band_url)
if response.status_code != 200:
raise RuntimeError('Got response status {}, bailing.'.format(response.status_code))
band_soup = bs4.BeautifulSoup(response.text, self.soup_features)
added_on, modified_on, lyrical_themes, label_dict = self.scrapeWhatsOnBandPage(band_id, band_soup)
band_dict = {'band_id': band_id,
'added_date': added_on,
'modified_date': modified_on,
'themes': lyrical_themes,
}
# get the lineup from the band page
artists, band_lineup_entries = self.getBandsLineup(band_id, band_soup)
return band_dict, artists, band_lineup_entries, label_dict
def scrapeWhatsOnBandPage(self, band_id, soup):
"""
Scrape what's available on the band page without making any more requests
These things are
- the lyrical themes
- the added date
- the modified date
- the band's label (returned as a dict of label_id, label, label_url)
Getting the full comment will require a request
"""
# get the band name; check that ID in url matches band_id
band_name_list = soup.find_all('h1', 'band_name')
if len(band_name_list) != 1:
raise RuntimeError('Got bad band_name_list = {} for band_id={}'.format(band_name_list, band_id))
band_name_tag = band_name_list[0]
band_id_check = get_band_id_from_band_url(band_name_tag.a.get('href'))
if band_id != band_id_check:
raise RuntimeError('Got incorrect band_id={} from band name href (band_id={} in db)'.format(
band_id_check, band_id))
# Get the lyrical themes
stats_div = soup.find('div', {'id': 'band_stats'})
right_stuff = stats_div.find('dl', 'float_right')
dd_list = right_stuff.find_all('dd')
lyrical_themes = dd_list[1].text
# Get the current label
label_tag = dd_list[2].a
if label_tag:
label = label_tag.text
label_url = label_tag.get('href')
label_id = get_label_id_from_label_url(label_url)
label_dict = {'label_id': label_id,
'label': label,
'label_url': label_url}
else: # this label doesn't have a page on metal-archives
label_dict = {}
# Get the added/modified dates
audit_div = soup.find('div', {'id': 'auditTrail'})
td_list = audit_div.find_all('td')
added_on_td = td_list[2]
added_on_match = re.match(self.added_on_re, added_on_td.text)
if not added_on_match:
raise RuntimeError("Didn't find added on date for band_id={}".format(band_id))
added_on = added_on_match.group(1)
modified_on_td = td_list[3]
modified_on_match = re.match(self.modified_on_re, modified_on_td.text)
if not modified_on_match:
raise RuntimeError("Didn't find modified on date for band_id={}".format(band_id))
modified_on = modified_on_match.group(1)
return added_on, modified_on, lyrical_themes, label_dict
def getBandsFullComment(self, band_id):
"""
GET the band's "read-more"/full comment.
Return the <body> (with <body> tags removed).
"""
logger.debug('GET band read-more page for band_id=%d', band_id)
read_more_url = 'https://www.metal-archives.com/band/read-more/id/' + str(band_id)
response = self.sessionGet(read_more_url)
if response.status_code != 200:
raise RuntimeError('Got response status {}, bailing.'.format(response.status_code))
soup = bs4.BeautifulSoup(response.text, self.soup_features)
return ''.join(map(str, soup.body.children))
def getBandsLineup(self, band_id, soup):
"""
Parse the band's complete members table to get basic info about
artists and create dicts with info for storing into the Artists and BandLineup
tables.
Split-up and name-changed bands don't have a complete lineup, but instead show
the current lineup under the title of "Last Known Lineup". In these cases, we'll
use the current lineup.
"""
logger.debug('Scraping band page for members')
# figure out which tabs we have
band_members_tag = soup.find('div', {'id': 'band_members'})
tab_links = band_members_tag.ul.find_all('li')
have_complete = False
have_current = False
current_tab_name = ''
for link in tab_links:
href = link.a.get('href')
if href == '#band_tab_members_all':
have_complete = True
elif href == '#band_tab_members_current':
have_current = True
current_tab_name = link.a.text
if have_complete:
logger.debug('Got complete members table')
complete_members_tag = band_members_tag.find('div', {'id': 'band_tab_members_all'})
return self.parseMembersTable(band_id, complete_members_tag.table)
logger.debug('No complete members table (e.g. if band has no past/live members, '
'is split-up, on-hold, or changed name')
if have_current:
logger.debug('Got a current members table...')
current_members_tag = band_members_tag.find('div', {'id': 'band_tab_members_current'})
if current_tab_name == 'Current lineup':
logger.debug("it's the band's current lineup")
member_status = MemberStatus(True,False,False,False,False,False)
elif current_tab_name == 'Last known lineup':
logger.debug("it's the band's last known lineup (split-up, on-hold, or changed name)")
member_status = MemberStatus(False,False,False,False,True,False)
else:
raise NotImplementedError(f"Didn't handle current_tab_name={current_tab_name}")
return self.parseMembersTable(band_id, current_members_tag.table, member_status)
raise NotImplementedError('No complete or current members table...')
def parseMembersTable(self, band_id, table, member_status=None):
"""
Parse a band's member table, which could be the complete lineup or current/last known lineup.
"""
def get_class(row):
classes = row.get('class')
if not classes:
raise RuntimeError('Band member row without a class: row={}'.format(row))
if len(classes) != 1:
raise RuntimeError('Band member row with zero or 2+ classes: row={}'.format(row))
return str(classes[0])
members = []
if not table: # members table is empty
return [], []
rows = table.tbody.find_all('tr')
i = 0
while i < len(rows):
row = rows[i]
c = get_class(row)
if c == 'lineupHeaders':
text = str(row.td.text).strip().rstrip()
text = re.sub(r'\s+', ' ', text)
if text == 'Current':
member_status = MemberStatus(True,False,False,False,False,False)
elif text == 'Current (Live)':
member_status = MemberStatus(False,True,False,False,False,False)
elif text == 'Past':
member_status = MemberStatus(False,False,True,False,False,False)
elif text == 'Past (Live)':
member_status = MemberStatus(False,False,False,True,False,False)
elif text == 'Last known':
member_status = MemberStatus(False,False,False,False,True,False)
elif text == 'Last known (Live)':
member_status = MemberStatus(False,False,False,False,False,True)
else:
raise NotImplementedError('Unhandled lineupHeaders text={}'.format(text))
i += 1
continue
elif c == 'lineupRow':
if member_status is None:
raise NotImplementedError("Didn't properly handle getting member status "
"from member tab name")
artist_tag = row.td
a_tags = artist_tag.find_all('a')
if len(a_tags) == 0: # no artist page for this memer
i += 1
if i < len(rows):
if get_class(rows[i]) == 'lineupBandsRow': # skip the see also if it exists
i += 1
continue
elif len(a_tags) > 1:
raise NotImplementedError("Unhandled case... multiple artists pages?")
a_tag = a_tags[0]
artist = a_tag.text
artist_url = a_tag.get('href')
artist_id = get_artist_id_from_artist_url(artist_url)
member = {'artist_id': artist_id,
'artist': artist,
'artist_url': artist_url,
'status': member_status,
}
i += 1
if i < len(rows):
if get_class(rows[i]) == 'lineupBandsRow':
# this member has a see also row
# We likely don't need to do anything with this though,
# since the BandLineup table will show these connections
# once the table is built up.
i += 1
members.append(member)
continue
elif c == 'lineupBandsRow':
raise RuntimeError('Bug in implementation of member list iteration')
else:
raise NotImplementedError("Didn't code up handling for band member row class={}".format(c))
raise RuntimeError('Bug in implementation of member list iteration')
# Check if we have the same artist twice in the members list
members_dict = {}
for i, member in enumerate(members):
artist_id = member['artist_id']
if artist_id in members_dict:
member0 = members_dict[artist_id]
status0 = member0['status']
status1 = member['status']
status = MemberStatus(current=status0.current or status1.current,
current_live=status0.current_live or status1.current_live,
past=status0.past or status1.past,
past_live=status0.past_live or status1.past_live,
last_known=status0.last_known or status1.last_known,
last_known_live=status0.last_known_live or status1.last_known_live,
)
member0['status'] = status
else:
members_dict[artist_id] = member
# Now build a list of artists and band-lineup suitable for entry into the db
artists = []
band_lineup_entries = []
for member in members_dict.values():
artists.append({'artist_id': member['artist_id'],
'artist': member['artist'],
'artist_url': member['artist_url']})
status = member['status']
band_lineup_entries.append({'band_id': band_id,
'artist_id': member['artist_id'],
'current_member': status.current,
'current_live_member': status.current_live,
'past_member': status.past,
'past_live_member': status.past_live,
'last_known_member': status.last_known,})
return artists, band_lineup_entries
def getSimilarBands(self, band_id):
"""
GET similar bands via /band/ajax-recommendations/id/ and parse the table into
a list of dicts suitable for storage into Similarities table.
"""
logger.debug('GET band recommendations for band_id=%d', band_id)
similar_bands_url = 'https://www.metal-archives.com/band/ajax-recommendations/id/' + str(band_id)
params = {'showMoreSimilar': 1}
response = self.sessionGet(similar_bands_url, params=params)
if response.status_code != 200:
raise RuntimeError('Got response status {}, bailing.'.format(response.status_code))
soup = bs4.BeautifulSoup(response.text, self.soup_features)
table = soup.find('table', {'id': 'artist_list'})
recommendations = []
for row in table.tbody.find_all('tr'):
cells = row.find_all('td')
if len(cells) < 4:
# end of table, verify, then break
tag = cells[0]
if tag.get('id') == 'show_more' or tag.get('id') == 'no_artists':
break
raise NotImplementedError("Incomplete handling of shorter cells")
artist_tag = cells[0]
similar_to_id = get_band_id_from_band_url(artist_tag.a.get('href'))
score = int(cells[3].text)
recommendations.append({'band_id': band_id,
'similar_to_id': similar_to_id,
'score': score})
return recommendations
def getBandsDiscography(self, band_id):
"""
GET a band's discography via /band/discography/id/{band_id}/tab/all and parse the table into
a list of dicts suitable for storage into Albums table.
"""
logger.debug('GET band discography for band_id=%d', band_id)
discog_url = f'https://www.metal-archives.com/band/discography/id/{band_id}/tab/all'
response = self.sessionGet(discog_url)
if response.status_code != 200:
raise RuntimeError('Got response status {}, bailing.'.format(response.status_code))
soup = bs4.BeautifulSoup(response.text, self.soup_features)
table = soup.find('table', {'class': 'display discog'})
if not table:
raise RuntimeError("Didn't get the discography table")
albums = []
for row in table.tbody.find_all('tr'):
cells = row.find_all('td')
album_tag = cells[0].a
album = album_tag.text
album_url = album_tag.get('href')
album_id = get_album_id_from_album_url(album_url)
type_tag = cells[1]
type_str = type_tag.text
year_tag = cells[2]
year = int(year_tag.text)
albums.append({'band_id': band_id,
'album_id': album_id,
'album': album,
'album_url': album_url,
'type': type_str,
'release_date': year,
})
return albums
def storeInDatabase(self,
band_id,
band_dict=None,
artist_dicts=[],
bandlineup_dicts=[],
label_dict=None,
similarity_dicts=[],
album_dicts=[],
):
"""
Store things in the database.
"""
def make_str(keys):
return ','.join(keys)
#def make_qmark_eq(keys):
# return ','.join(map(lambda k: f"{k}=?", keys))
def make_named(keys):
return ','.join(map(lambda k: f":{k}", keys))
def make_named_eq(keys):
return ','.join(map(lambda k: f"{k}=:{k}", keys))
def where(ids):
return 'where ' + ' and '.join(map(lambda k: f"{k}=:{k}", ids))
def do_dicts_stuff(dicts, table, ids):
for d in dicts:
query = f"select {make_str(ids)} from {table} {where(ids)}"
#logger.debug(f'select {table} query: "%s"', query)
cur.execute(query, {k:v for k,v in d.items() if k in ids})
found_id = cur.fetchall()
if found_id:
query = f"update {table} set {make_named_eq(d.keys())} {where(ids)}"
else:
query = f"insert into {table} ({make_str(d.keys())},insert_date) values ({make_named(d.keys())},datetime('now'))"
#logger.debug(f'insert/update {table} query: "%s"', query)
cur.execute(query, d)
cur = self.connection.cursor()
if band_dict:
table = 'Bands'
ids = ('band_id',)
do_dicts_stuff((band_dict,), table, ids)
if artist_dicts:
table = 'Artists'
ids = ('artist_id',)
do_dicts_stuff(artist_dicts, table, ids)
if label_dict:
table = 'Labels'
ids = ('label_id',)
do_dicts_stuff((label_dict,), table, ids)
if bandlineup_dicts:
table = 'BandLineup'
ids = ('band_id', 'artist_id')
do_dicts_stuff(bandlineup_dicts, table, ids)
if similarity_dicts:
table = 'Similarities'
ids = ('band_id', 'similar_to_id')
do_dicts_stuff(similarity_dicts, table, ids)
if album_dicts:
table = 'Albums'
ids = ('album_id', 'band_id')
do_dicts_stuff(album_dicts, table, ids)
cur.close()
self.connection.commit()
    def finalDatabaseStuff(self):
        """
        Placeholder for post-scrape database work, such as populating the
        BandLabel table.  Currently unimplemented: run() calls this once
        after the scrape loop, and for now it only logs a warning.
        """
        logger.warning('final db stuff is not implemented')
# Command-line entry point: parse options, configure logging, and run the
# scraper.  Fix: grammar in the --reviews-gt help string ("only if that
# have" -> "only if they have").
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Scrape the band page for each band in the db,'
                                                 ' also making a few other GET requests and scrapes.')
    parser.add_argument('database', type=str,
                        help='Filename of sqlite3 database; must already exist '
                             'and have basic band info')
    parser.add_argument('--only-if-not-scraped', action='store_true',
                        help="Only scrape pages that haven't previously been scraped "
                             "(as witnessed by `modified_date` being NULL, comment being null, etc.)")
    parser.add_argument('--limit', type=int, default=-1,
                        help='After scraping --limit pages, exit; useful for dev/test')
    parser.add_argument('--offset', type=int, default=0,
                        help='Start --offset rows into the band query; useful for dev/test')
    parser.add_argument('--reviews-gt', type=int, default=0,
                        help='Scrape pages only if they have strictly greater than --reviews-gt reviews')
    parser.add_argument('--order-by-reviews', action='store_true',
                        help='Scrape pages in order of decreasing number of reviews '
                             '(more popular bands first)')
    parser.add_argument('--order-by-insert-date', action='store_true',
                        help='Scrape pages in order of increasing insert date '
                             '(older entries first)')
    parser.add_argument('--skip-band-page', action='store_true',
                        help="Skip requesting the band's page; also enables --skip-full-comment")
    parser.add_argument('--skip-full-comment', action='store_true',
                        help="Skip requesting the band's full comment/read more")
    parser.add_argument('--skip-recommendations', action='store_true',
                        help="Skip requesting the band's recommended/similar bands")
    parser.add_argument('--skip-discography', action='store_true',
                        help="Skip requesting the band's discography")
    parser.add_argument('--no-store', action='store_true',
                        help="Don't actually store anything in the database")
    #subparsers?
    parser.add_argument('--logging-level', type=int, default=logging.WARNING,
                        help="Set the logging level")
    args = parser.parse_args()
    # tqdmForLogging comes from utils — presumably a tqdm-aware stream that
    # keeps log lines from clobbering the progress bar; confirm in utils.py.
    logging.basicConfig(stream=tqdmForLogging, level=args.logging_level)
    scraper = BandPageScraper(args.database,
                              only_if_not_scraped=args.only_if_not_scraped,
                              limit=args.limit,
                              offset=args.offset,
                              reviews_gt=args.reviews_gt,
                              order_by_reviews=args.order_by_reviews,
                              order_by_insert_date=args.order_by_insert_date,
                              skip_band_page=args.skip_band_page,
                              skip_full_comment=args.skip_full_comment,
                              skip_recommendations=args.skip_recommendations,
                              skip_discography=args.skip_discography,
                              no_store=args.no_store,
                              )
    scraper.run()