find_links.py
import requests
from bs4 import BeautifulSoup as Soup
import os
import csv

# Constants for the attributes to be extracted from the sitemap.
ATTRS = ["loc", "lastmod", "priority"]

def parse_sitemap(url, csv_filename="urls.csv"):
    """Parse the sitemap at the given URL and append the data to a CSV file."""
    # Return False if the URL is not provided.
    if not url:
        return False

    # Attempt to get the content from the URL (a timeout keeps a dead host
    # from hanging the crawl indefinitely; 30 seconds is an arbitrary choice).
    response = requests.get(url, timeout=30)

    # Return False if the response status code is not 200 (OK).
    if response.status_code != 200:
        return False

    # Parse the XML content of the response.
    soup = Soup(response.content, "xml")

    # Recursively parse nested sitemaps (<sitemap> entries in a sitemap index).
    for sitemap in soup.find_all("sitemap"):
        loc = sitemap.find("loc").text
        parse_sitemap(loc, csv_filename)

    # Define the root directory for saving the CSV file.
    root = os.path.dirname(os.path.abspath(__file__))

    # Collect one row per <url> entry ("entry" rather than "url", so the
    # loop variable does not shadow the function parameter).
    rows = []
    for entry in soup.find_all("url"):
        row = []
        for attr in ATTRS:
            found_attr = entry.find(attr)
            # Use "n/a" if the attribute is not found, otherwise get its text.
            row.append(found_attr.text if found_attr else "n/a")
        rows.append(row)

    # Check if the file already exists.
    file_exists = os.path.isfile(os.path.join(root, csv_filename))

    # Append the data to the CSV file.
    with open(os.path.join(root, csv_filename), "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        # Write the header only if the file doesn't exist.
        if not file_exists:
            writer.writerow(ATTRS)
        writer.writerows(rows)
    return True
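
# A quick offline sketch of the extraction logic above; the sample sitemap
# below is made up for illustration and is not fetched from anywhere.
SAMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/</loc>
    <lastmod>2024-01-01</lastmod>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/about</loc>
  </url>
</urlset>"""

def check_extraction():
    """Print one row per <url> entry, mirroring parse_sitemap's "n/a" fallback."""
    soup = Soup(SAMPLE_SITEMAP, "xml")
    for entry in soup.find_all("url"):
        found = [entry.find(attr) for attr in ATTRS]
        print([tag.text if tag else "n/a" for tag in found])
    # Expected output:
    # ['https://example.com/', '2024-01-01', '1.0']
    # ['https://example.com/about', 'n/a', 'n/a']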
# Example usage
parse_sitemap("https://yoursite/sitemap.xml")