Merge pull request #129 from nanos/cache-robots
Cache robots.txt for each run of the script, to reduce load on the server
nanos authored Jun 25, 2024
2 parents ac8044d + 7b9896b commit dec718d
Showing 1 changed file with 19 additions and 9 deletions.
find_posts.py: 19 additions & 9 deletions (28 changes)
@@ -1009,16 +1009,25 @@ def can_fetch(user_agent, url):
     parsed_uri = urlparse(url)
     robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
 
-    try:
-        # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
-        robotsTxt = get(robots, ignore_robots_txt=True)
-        if robotsTxt.status_code in (401, 403):
-            return False
-        elif robotsTxt.status_code != 200:
+    if robots in ROBOTS_TXT:
+        if isinstance(ROBOTS_TXT[robots], bool):
+            return ROBOTS_TXT[robots]
+        else:
+            robotsTxt = ROBOTS_TXT[robots]
+    else:
+        try:
+            # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
+            robotsTxt = get(robots, ignore_robots_txt=True)
+            if robotsTxt.status_code in (401, 403):
+                ROBOTS_TXT[robots] = False
+                return False
+            elif robotsTxt.status_code != 200:
+                ROBOTS_TXT[robots] = True
+                return True
+            robotsTxt = robotsTxt.text
+            ROBOTS_TXT[robots] = robotsTxt
+        except Exception as ex:
             return True
-        robotsTxt = robotsTxt.text
-    except Exception as ex:
-        return True
 
     robotParser = urllib.robotparser.RobotFileParser()
     robotParser.parse(robotsTxt.splitlines())
@@ -1394,6 +1403,7 @@ def set_server_apis(server):
     SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
     RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
 
+    ROBOTS_TXT = {}
 
     seen_urls = OrderedSet([])
     if os.path.exists(SEEN_URLS_FILE):
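For reference, below is a minimal, self-contained sketch of the caching pattern this commit introduces. It mirrors the diff above, but uses plain requests in place of the script's own get() wrapper (which, per the comment in the diff, exists so the User-Agent can be changed); the timeout value and the snake_case local names are illustrative only.

# Sketch only: plain requests instead of the script's get() wrapper,
# and a hypothetical timeout; the cache layout mirrors the diff above.
import urllib.robotparser
from urllib.parse import urlparse

import requests

ROBOTS_TXT = {}  # per-run cache: robots.txt URL -> body text, or a bool verdict


def can_fetch(user_agent, url):
    parsed_uri = urlparse(url)
    robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

    if robots in ROBOTS_TXT:
        # Cache hit: either a final allow/deny decision or the robots.txt body.
        if isinstance(ROBOTS_TXT[robots], bool):
            return ROBOTS_TXT[robots]
        robots_txt = ROBOTS_TXT[robots]
    else:
        try:
            response = requests.get(robots, timeout=5)
            if response.status_code in (401, 403):
                ROBOTS_TXT[robots] = False   # robots.txt actively refused: deny
                return False
            elif response.status_code != 200:
                ROBOTS_TXT[robots] = True    # no usable robots.txt: allow
                return True
            robots_txt = response.text
            ROBOTS_TXT[robots] = robots_txt  # cache the body for later calls this run
        except Exception:
            return True

    robot_parser = urllib.robotparser.RobotFileParser()
    robot_parser.parse(robots_txt.splitlines())
    return robot_parser.can_fetch(user_agent, url)

Within a single run, only the first can_fetch() call for a given host performs a network request for robots.txt; later calls reuse either the cached body or the cached boolean verdict, which is the load reduction described in the commit message.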
