
Commit

Merge pull request #130 from nanos/cache-robots-on-disk
Cache robots.txt for 24 hours on disk to reduce load on servers
nanos authored Jun 27, 2024
2 parents 009fbe5 + 40b624a commit e0faafb
Showing 1 changed file with 34 additions and 13 deletions.
find_posts.py: 34 additions & 13 deletions (47 changes)
@@ -20,7 +20,7 @@
 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
 
-VERSION = "7.1.1"
+VERSION = "7.1.2"
 
 argparser=argparse.ArgumentParser()
 
@@ -1028,19 +1028,31 @@ def can_fetch(user_agent, url):
         else:
             robotsTxt = ROBOTS_TXT[robots]
     else:
-        try:
-            # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
-            robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
-            if robotsTxt.status_code in (401, 403):
-                ROBOTS_TXT[robots] = False
-                return False
-            elif robotsTxt.status_code != 200:
-                ROBOTS_TXT[robots] = True
-                return True
-            robotsTxt = robotsTxt.text
+        robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
+        if os.path.exists(robotsCachePath):
+            with open(robotsCachePath, "r", encoding="utf-8") as f:
+                logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}")
+                robotsTxt = f.read()
             ROBOTS_TXT[robots] = robotsTxt
-        except Exception as ex:
-            return True
+
+        else:
+            try:
+                # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
+                robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
+                if robotsTxt.status_code in (401, 403):
+                    ROBOTS_TXT[robots] = False
+                    return False
+                elif robotsTxt.status_code != 200:
+                    ROBOTS_TXT[robots] = True
+                    return True
+                robotsTxt = robotsTxt.text
+                ROBOTS_TXT[robots] = robotsTxt
+
+                with open(robotsCachePath, "w", encoding="utf-8") as f:
+                    f.write(robotsTxt)
+
+            except Exception as ex:
+                return True
 
     robotParser = urllib.robotparser.RobotFileParser()
     robotParser.parse(robotsTxt.splitlines())
Expand Down Expand Up @@ -1480,6 +1492,15 @@ def set_server_apis(server):
     else:
         seen_hosts = ServerList({})
 
+    # Delete any old robots.txt files so we can re-download them
+    for file_name in os.listdir(arguments.state_dir):
+        file_path = os.path.join(arguments.state_dir,file_name)
+        if file_name.startswith('robots-') and os.path.isfile(file_path):
+            if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24:
+                logger.debug(f"Removing cached robots.txt file {file_name}")
+                os.remove(file_path)
+
+
     if(isinstance(arguments.access_token, str)):
         setattr(arguments, 'access_token', [arguments.access_token])
 
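Read together, the hunks above give FediFetcher a per-host robots.txt cache on disk. Inside can_fetch(), a robots-<hostname> file under the state directory is reused when it exists; only on a cache miss is robots.txt fetched over the network and then written to that file. The following is a minimal standalone sketch of that pattern, not the project's exact code: requests stands in for FediFetcher's internal get() helper, state_dir is passed in instead of arguments.state_dir, and the final call into RobotFileParser.can_fetch() is assumed rather than shown in the diff.

import os
import urllib.parse
import urllib.robotparser

import requests  # stand-in for FediFetcher's internal get() helper (assumption)

ROBOTS_TXT = {}  # in-memory cache for the current run, keyed by robots.txt URL


def can_fetch(user_agent, url, state_dir):
    parsed_uri = urllib.parse.urlparse(url)
    robots = f"{parsed_uri.scheme}://{parsed_uri.netloc}/robots.txt"

    if robots in ROBOTS_TXT:
        if isinstance(ROBOTS_TXT[robots], bool):
            return ROBOTS_TXT[robots]
        robots_txt = ROBOTS_TXT[robots]
    else:
        cache_path = os.path.join(state_dir, f"robots-{parsed_uri.netloc}")
        if os.path.exists(cache_path):
            # Cache hit: reuse the copy written to disk by an earlier run.
            with open(cache_path, "r", encoding="utf-8") as f:
                robots_txt = f.read()
            ROBOTS_TXT[robots] = robots_txt
        else:
            # Cache miss: fetch once with our own User-Agent, then persist it.
            try:
                resp = requests.get(robots, headers={"User-Agent": user_agent}, timeout=2)
            except Exception:
                return True
            if resp.status_code in (401, 403):
                ROBOTS_TXT[robots] = False
                return False
            if resp.status_code != 200:
                ROBOTS_TXT[robots] = True
                return True
            robots_txt = resp.text
            ROBOTS_TXT[robots] = robots_txt
            with open(cache_path, "w", encoding="utf-8") as f:
                f.write(robots_txt)

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(robots_txt.splitlines())
    return parser.can_fetch(user_agent, url)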

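The second hunk gives the disk cache its 24-hour lifetime: on startup, any robots-* file in the state directory older than a day is removed, so the next can_fetch() call re-downloads it. The same sweep as a small helper function, again with state_dir standing in for arguments.state_dir:

import os
import time

ROBOTS_CACHE_MAX_AGE = 60 * 60 * 24  # one day, in seconds


def expire_robots_cache(state_dir):
    # Delete per-host robots.txt cache files older than 24 hours so the
    # next run fetches fresh copies.
    cutoff = time.time() - ROBOTS_CACHE_MAX_AGE
    for file_name in os.listdir(state_dir):
        file_path = os.path.join(state_dir, file_name)
        if file_name.startswith("robots-") and os.path.isfile(file_path):
            if os.path.getmtime(file_path) < cutoff:
                os.remove(file_path)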