#!/usr/bin/env python3
#
# Copyright (C) 2016-2017 Dmitry Marakasov <amdmi3@amdmi3.ru>
#
# This file is part of repology
#
# repology is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# repology is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with repology. If not, see <http://www.gnu.org/licenses/>.
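"""Link checker for Repology.

Reads batches of links from the Repology database, checks them over
HTTP (HEAD with a GET fallback) and writes the resulting status,
permanent redirect target and content size back to the database.

Example invocation (options are defined in ParseArguments below):

    ./repology-linkchecker.py --jobs 4 --age 7
"""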

import argparse
import datetime
import multiprocessing
import os
import re
import socket
import time
import urllib.parse

import requests

from repology.config import config
from repology.database import Database
from repology.logger import FileLogger, StderrLogger
from repology.querymgr import QueryManager


USER_AGENT = 'repology-linkchecker/0 (+{}/bots)'.format(config['REPOLOGY_HOME'])


def GetHTTPLinkStatus(url, timeout):
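    """Check a single URL and return a (url, status, redirect, size, location) tuple."""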
    try:
        response = requests.head(url, allow_redirects=True, headers={'User-Agent': USER_AGENT}, timeout=timeout)

        # fall back to GET
        if response.status_code != 200:
            response = requests.get(url, allow_redirects=True, headers={'User-Agent': USER_AGENT}, timeout=timeout)

        redirect = None
        size = None
        location = None

        # handle redirect chain
        if response.history:
            redirect = response.history[0].status_code

            # resolve permanent (and only permanent!) redirect chain
            for h in response.history:
                if h.status_code == 301:
                    location = urllib.parse.urljoin(h.url, h.headers.get('location'))

        # handle size
        if response.status_code == 200:
            content_length = response.headers.get('content-length')
            if content_length:
                size = int(content_length)

        return (url, response.status_code, redirect, size, location)
    except KeyboardInterrupt:
        raise
    except requests.Timeout:
        return (url, Database.linkcheck_status_timeout, None, None, None)
    except requests.TooManyRedirects:
        return (url, Database.linkcheck_status_too_many_redirects, None, None, None)
    except requests.ConnectionError:
        # check for DNS error additionally
        try:
            parsed = urllib.parse.urlparse(url)
            socket.gethostbyname(parsed.hostname)
        except socket.gaierror:
            return (url, Database.linkcheck_status_dns_error, None, None, None)
        except:
            pass
        return (url, Database.linkcheck_status_cannot_connect, None, None, None)
    except requests.exceptions.InvalidURL:
        return (url, Database.linkcheck_status_invalid_url, None, None, None)
    except:
        return (url, Database.linkcheck_status_unknown_error, None, None, None)


def GetLinkStatuses(urls, delay, timeout):
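    """Check a list of URLs sequentially, pausing between consecutive requests to the same host."""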
    results = []
    prev_host = None
    for url in urls:
        # XXX: add support for gentoo mirrors, skip for now
        if not url.startswith('http://') and not url.startswith('https://'):
            continue

        host = urllib.parse.urlparse(url).hostname
        if host and host == prev_host:
            time.sleep(delay)

        results.append(GetHTTPLinkStatus(url, timeout))

        prev_host = host

    return results


def LinkProcessingWorker(readqueue, writequeue, workerid, options, logger):
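    """Worker process: take packs of URLs from readqueue, check them and push results to writequeue."""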
    logger = logger.GetPrefixed('worker{}: '.format(workerid))
    logger.Log('Worker spawned')

    def Slicer(pack):
        for i in range(0, len(pack), options.packsize):
            yield pack[i:i + options.packsize]

    while True:
        pack = readqueue.get()
        if pack is None:
            logger.Log('Worker exiting')
            return

        logger.Log('Processing {} url(s) ({} .. {})'.format(len(pack), pack[0], pack[-1]))

        for subpack in Slicer(pack):
            writequeue.put(GetLinkStatuses(subpack, delay=options.delay, timeout=options.timeout))

        logger.Log('Done processing {} url(s) ({} .. {})'.format(len(pack), pack[0], pack[-1]))


def LinkUpdatingWorker(queue, options, querymgr, logger):
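    """Writer process: take check results from the queue and store them in the database."""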
    database = Database(options.dsn, querymgr, readonly=False, application_name='repology-linkchecker/writer')

    logger = logger.GetPrefixed('writer: ')
    logger.Log('Writer spawned')

    while True:
        pack = queue.get()
        if pack is None:
            logger.Log('Writer exiting')
            return

        for url, status, redirect, size, location in pack:
            database.update_link_status(url=url, status=status, redirect=redirect, size=size, location=location)

        database.commit()

        logger.Log('Updated {} url(s) ({} .. {})'.format(len(pack), pack[0][0], pack[-1][0]))


def ParseArguments():
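    """Parse and return command line arguments."""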
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dsn', default=config['DSN'], help='database connection params')
    parser.add_argument('--logfile', help='path to log file (log to stderr by default)')
    parser.add_argument('--sql-dir', default=config['SQL_DIR'], help='path to directory with sql queries')

    parser.add_argument('--timeout', type=float, default=60.0, help='timeout for link requests in seconds')
    parser.add_argument('--delay', type=float, default=3.0, help='delay between requests to one host')
    parser.add_argument('--age', type=int, default=0, help='min age for recheck in days')
    parser.add_argument('--packsize', type=int, default=128, help='pack size for link processing')
    parser.add_argument('--maxpacksize', type=int, help='max pack size for link processing (useful to skip large hosts)')
    parser.add_argument('--jobs', type=int, default=1, help='number of parallel jobs')

    parser.add_argument('--unchecked', action='store_true', help='only process unchecked (newly discovered) links')
    parser.add_argument('--checked', action='store_true', help='only process old (already checked) links')
    parser.add_argument('--failed', action='store_true', help='only process links that were checked and failed')
    parser.add_argument('--succeeded', action='store_true', help='only process links that were checked and succeeded')
    parser.add_argument('--prefix', help='only process links with specified prefix')

    return parser.parse_args()


def Main():
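    """Main loop: fetch packs of URLs from the database and distribute them to worker processes."""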
    options = ParseArguments()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=True, autocommit=True, application_name='repology-linkchecker/reader')

    readqueue = multiprocessing.Queue(10)
    writequeue = multiprocessing.Queue(10)

    writer = multiprocessing.Process(target=LinkUpdatingWorker, args=(writequeue, options, querymgr, logger))
    writer.start()

    processpool = [multiprocessing.Process(target=LinkProcessingWorker, args=(readqueue, writequeue, i, options, logger)) for i in range(options.jobs)]
    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.get_links_for_check(
            after=prev_url,
            prefix=options.prefix,  # no limit by default
            limit=options.packsize,
            recheck_age=datetime.timedelta(seconds=options.age * 60 * 60 * 24),
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded
        )
        if not urls:
            logger.Log('  No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into the same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.get_links_for_check(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=datetime.timedelta(seconds=options.age * 60 * 60 * 24),
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded
            )

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log('Skipping {} urls ({}..{}), exceeds max pack size'.format(len(urls), urls[0], urls[-1]))
        else:
            readqueue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    # close workers
    for process in processpool:
        readqueue.put(None)
    for process in processpool:
        process.join()

    # close writer
    writequeue.put(None)
    writer.join()

    logger.Log('Done')

    return 0


if __name__ == '__main__':
    os.sys.exit(Main())