Add support for sitemap index #65

Merged: 2 commits, Oct 5, 2020
README.md: 6 additions, 0 deletions
@@ -83,6 +83,12 @@ $ python3 main.py --domain https://blog.lesite.us --num-workers 4
$ python3 main.py --domain https://blog.lesite.us --auth
```

#### Output sitemap index file
***Sitemaps with more than 50,000 URLs must be split into multiple sitemap files of at most 50,000 URLs each, referenced from a sitemap index file. Outputting an index requires specifying an output file, and an index is only written when a crawl finds more than 50,000 URLs:***
```
$ python3 main.py --domain https://blog.lesite.us --as-index --output sitemap.xml
```
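
For illustration, with `--output sitemap.xml` the index keeps that name and the individual sitemaps are written as `sitemap-0.xml`, `sitemap-1.xml`, and so on. The index itself is a small sitemapindex document roughly like this sketch (the domain and the number of parts are just an example):
```
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://blog.lesite.us/sitemap-0.xml</loc></sitemap>
<sitemap><loc>https://blog.lesite.us/sitemap-1.xml</loc></sitemap>
</sitemapindex>
```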

## Docker usage

#### Build the Docker image:
config.py: 5 additions, 0 deletions
@@ -8,6 +8,11 @@
"""
xml_footer = "</urlset>"

sitemapindex_header = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
sitemapindex_footer = "</sitemapindex>"

crawler_user_agent = 'Sitemap crawler'

# if used with --auth you have to provide username and password here for basic auth
crawler.py: 82 additions, 9 deletions
@@ -2,6 +2,7 @@
import concurrent.futures
import base64
from copy import copy
import math

import config
import logging
@@ -21,6 +22,8 @@ class IllegalArgumentError(ValueError):

class Crawler:

MAX_URLS_PER_SITEMAP = 50000

# Variables
parserobots = False
output = None
@@ -37,6 +40,7 @@ class Crawler:
auth = False

urls_to_crawl = set([])
url_strings_to_output = []
crawled_or_crawling = set([])
excluded = set([])

@@ -61,7 +65,7 @@

def __init__(self, num_workers=1, parserobots=False, output=None,
report=False ,domain="", exclude=[], skipext=[], drop=[],
debug=False, verbose=False, images=False, auth=False):
debug=False, verbose=False, images=False, auth=False, as_index=False):
self.num_workers = num_workers
self.parserobots = parserobots
self.output = output
@@ -73,7 +77,8 @@ def __init__(self, num_workers=1, parserobots=False, output=None,
self.debug = debug
self.verbose = verbose
self.images = images
self.auth = auth
self.as_index = as_index

if self.debug:
log_level = logging.DEBUG
@@ -85,6 +90,7 @@ def __init__(self, num_workers=1, parserobots=False, output=None,
logging.basicConfig(level=log_level)

self.urls_to_crawl = {self.clean_link(domain)}
self.url_strings_to_output = []
self.num_crawled = 0

if num_workers <= 0:
@@ -104,10 +110,11 @@
except:
logging.error ("Output file not available.")
exit(255)
elif self.as_index:
logging.error("When specifying an index file as an output option, you must include an output file name")
exit(255)

def run(self):
print(config.xml_header, file=self.output_file)

if self.parserobots:
self.check_robots()

Expand All @@ -129,7 +136,8 @@ def run(self):

logging.info("Crawling has reached end of all found links")

print (config.xml_footer, file=self.output_file)
self.write_sitemap_output()



async def crawl_all_pending_urls(self, executor):
@@ -260,10 +268,8 @@ def __crawl(self, current_url):
lastmod = ""
if date:
lastmod = "<lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod>"

print ("<url><loc>"+self.htmlspecialchars(url.geturl())+"</loc>" + lastmod + image_list + "</url>", file=self.output_file)
if self.output_file:
self.output_file.flush()
url_string = "<url><loc>"+self.htmlspecialchars(url.geturl())+"</loc>" + lastmod + image_list + "</url>"
self.url_strings_to_output.append(url_string)

# Found links
links = self.linkregex.findall(msg)
@@ -333,6 +339,73 @@ def __crawl(self, current_url):

self.urls_to_crawl.add(link)

def write_sitemap_output(self):
are_multiple_sitemap_files_required = \
len(self.url_strings_to_output) > self.MAX_URLS_PER_SITEMAP

# When there are more than 50,000 URLs, the sitemap specification says we have
# to split the sitemap into multiple files using an index file that points to the
# location of each sitemap file. For now, we require the caller to explicitly
# specify they want to create an index, even if there are more than 50,000 URLs,
# to maintain backward compatibility.
#
# See specification here:
# https://support.google.com/webmasters/answer/183668?hl=en
if are_multiple_sitemap_files_required and self.as_index:
self.write_index_and_sitemap_files()
else:
self.write_single_sitemap()

def write_single_sitemap(self):
self.write_sitemap_file(self.output_file, self.url_strings_to_output)

def write_index_and_sitemap_files(self):
sitemap_index_filename, sitemap_index_extension = os.path.splitext(self.output)

num_sitemap_files = math.ceil(len(self.url_strings_to_output) / self.MAX_URLS_PER_SITEMAP)
sitemap_filenames = []
for i in range(0, num_sitemap_files):
Review comment (Contributor): It probably doesn't matter, but I notice that in the Google documentation they 1-index the sitemaps, whereas they're 0-indexed here. Curious to hear your thoughts.

Reply (Owner): Does it really matter?

Reply (Contributor, @Garrett-R, Sep 17, 2020): Yeah, I'm pretty certain it doesn't matter, so either way is fine, I reckon.

# name the individual sitemap files based on the name of the index file
sitemap_filename = sitemap_index_filename + '-' + str(i) + sitemap_index_extension
sitemap_filenames.append(sitemap_filename)

self.write_sitemap_index(sitemap_filenames)

for i, sitemap_filename in enumerate(sitemap_filenames):
self.write_subset_of_urls_to_sitemap(sitemap_filename, i * self.MAX_URLS_PER_SITEMAP)
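
As a quick standalone illustration of that naming rule (the output name here is just an example):
```python
import os

# '--output sitemap.xml' splits into ('sitemap', '.xml'), so the parts become
# sitemap-0.xml, sitemap-1.xml, ... while the index keeps the original name.
name, ext = os.path.splitext("sitemap.xml")
print([name + '-' + str(i) + ext for i in range(3)])
# -> ['sitemap-0.xml', 'sitemap-1.xml', 'sitemap-2.xml']
```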

def write_sitemap_index(self, sitemap_filenames):
sitemap_index_file = self.output_file
print(config.sitemapindex_header, file=sitemap_index_file)
for sitemap_filename in sitemap_filenames:
sitemap_url = urlunsplit([self.scheme, self.target_domain, sitemap_filename, '', ''])
print("<sitemap><loc>" + sitemap_url + "</loc>""</sitemap>", file=sitemap_index_file)
print(config.sitemapindex_footer, file=sitemap_index_file)

def write_subset_of_urls_to_sitemap(self, filename, index):
# Writes a maximum of self.MAX_URLS_PER_SITEMAP urls to a sitemap file
#
# filename: name of the file to write the sitemap to
# index: zero-based index from which to start writing url strings contained in
# self.url_strings_to_output
try:
with open(filename, 'w') as sitemap_file:
start_index = index
end_index = (index + self.MAX_URLS_PER_SITEMAP)
sitemap_url_strings = self.url_strings_to_output[start_index:end_index]
self.write_sitemap_file(sitemap_file, sitemap_url_strings)
except:
logging.error("Could not open sitemap file that is part of index.")
exit(255)
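
For reference, a self-contained sketch of the slicing arithmetic these methods rely on (the URL count and the helper name are hypothetical, not part of this PR):
```python
import math

MAX_URLS_PER_SITEMAP = 50000

def partition(url_strings, max_per_file=MAX_URLS_PER_SITEMAP):
    # Hypothetical helper mirroring the logic above: math.ceil gives the number
    # of sitemap files, and each file receives one max_per_file-sized slice.
    num_files = math.ceil(len(url_strings) / max_per_file)
    return [url_strings[i * max_per_file:(i + 1) * max_per_file]
            for i in range(num_files)]

chunks = partition(["<url><loc>https://example.com/%d</loc></url>" % n
                    for n in range(120000)])
print([len(c) for c in chunks])  # -> [50000, 50000, 20000]
```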

@staticmethod
def write_sitemap_file(file, url_strings):
print(config.xml_header, file=file)

for url_string in url_strings:
print (url_string, file=file)

print (config.xml_footer, file=file)

def clean_link(self, link):
parts = list(urlsplit(link))
main.py: 1 addition, 0 deletions
@@ -15,6 +15,7 @@
parser.add_argument('--auth', action="store_true", default=False, help="Enable basic authorisation while crawling")
parser.add_argument('-v', '--verbose', action="store_true", help="Enable verbose output")
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--as-index', action="store_true", default=False, required=False, help="Outputs sitemap as index and multiple sitemap files if crawl results in more than 50,000 links (uses filename in --output as name of index file)")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
parser.add_argument('--report', action="store_true", default=False, required=False, help="Display a report")
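
A rough sketch of how the new flag could be forwarded to the crawler; the actual construction code in main.py is not shown in this diff, so the names below are assumptions:
```python
# Hypothetical wiring, not the actual main.py code: the parsed --as-index flag
# is forwarded to the Crawler constructor, which only honours it when an
# --output file name is also given (see the __init__ check above).
arg = parser.parse_args()
crawler = Crawler(num_workers=arg.num_workers, domain=arg.domain,
                  output=arg.output, as_index=arg.as_index)
crawler.run()
```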