# getsoup.py
from bs4 import BeautifulSoup
from bs4.element import Comment
from random import randint
from time import sleep
from requests.exceptions import HTTPError, RequestException
import requests
import difflib
import re
import os

# global list of pages crawled
urls = []

# function to recursively find pages from the homepage
# and scrape each page
def scrape(site):
    # sleeping for random amounts of time to prevent getting banned
    #sleep(randint(1, 100))
    print(site)
    # getting the request from the url
    try:
        response = requests.get(site)
        response.raise_for_status()
    except HTTPError as hpe:
        # raised by raise_for_status() for 4xx/5xx responses
        print("Error: Website not present.", hpe)
        return
    except RequestException as req_err:
        # connection failures, timeouts, invalid URLs, etc.
        print("Error: Cannot find server.", req_err)
        return
    # converting the text
    # (the 'lxml' parser requires the lxml package; 'html.parser' also works)
    soup = BeautifulSoup(response.text, 'lxml')
    if soup is None:
        return
    try:
        getinfo(soup)
    except AttributeError as ae:
        print("Error: Attribute/s not found.", ae)
    anchors = soup.find_all('a')
    for ele in anchors:
        if not ele.has_attr('href'):
            continue
        link = ele.attrs['href']
        if link.startswith('//'):
            site = 'https:' + link
        elif link.startswith('http'):
            site = link
        else:
            # relative and other non-http links are skipped
            continue
        if "animalcrossing." in site and site not in urls:
            urls.append(site)
            # recursive call to scrape (each newly discovered page adds one level of recursion)
            try:
                scrape(site)
            except KeyboardInterrupt:
                # user wants to stop
                print("Keyboard Interrupt: Exiting program.")
                os._exit(0)
            except Exception as exception:
                # skip this link and keep crawling if anything else goes wrong
                print("Error:", exception)
                continue
    return

# function to get text content from a page and update the corresponding file
# if the file has changed
def getinfo(soup):
    if soup.title is None:
        actitle = 'Untitled' + str(randint(1, 500))
    else:
        actitle = soup.title.contents[0]
    actitle = re.sub('[^a-zA-Z]+', '-', actitle)
    # get summary from description in the meta tag for a Wiki page
    summary = 'Summary not available.'
    for s in soup.find_all('meta', {'name': 'description', 'content': True}):
        summary = s.attrs['content']
        if not summary.startswith('#REDIRECT'):
            break
        else:
            summary = 'Summary not available.'
    # the file is saved to a folder, crawled-pages
    os.makedirs("crawled-pages", exist_ok=True)
    acfilepath = "crawled-pages/" + actitle + ".txt"
    # get content of the Wiki article
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    content = u" ".join(t.strip() for t in visible_texts)
    newcontent = actitle + '\n\n' + summary + '\n\n' + content
    # read whatever was written on a previous crawl, if anything
    oldcontent = ''
    if os.path.exists(acfilepath):
        with open(acfilepath, 'r', encoding='utf-8') as acfile:
            oldcontent = acfile.read()
    # rewrite the file only if there are changes
    difflist = [li for li in difflib.ndiff(oldcontent, newcontent) if li[0] != ' ']
    if len(difflist) > 0:
        with open(acfilepath, 'w', encoding='utf-8') as acfile:
            acfile.write(newcontent)

# filter function to get visible text only
def tag_visible(element):
    blacklist = ['style', 'script', 'head', 'title', 'meta', '[document]', 'noscript', 'header', 'html', 'input']
    if element.parent.name in blacklist:
        return False
    if isinstance(element, Comment):
        return False
    return True
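
# a minimal sketch of how the two helpers above combine (toy HTML, not taken from the wiki):
#   soup = BeautifulSoup("<p>Tom Nook</p><script>var x;</script>", 'lxml')
#   [t for t in soup.find_all(string=True) if tag_visible(t)]   # -> ['Tom Nook']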

# main function
if __name__ == "__main__":
    # get the website to be scraped
    homepage = "https://animalcrossing.fandom.com/wiki/Animal_Crossing_Wiki"
    urls.append(homepage)
    # calling scraper function
    scrape(homepage)
    print("Finished scraping Animal Crossing Wiki!!")