letssee.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re


class PoemsSpider(CrawlSpider):
    name = "poems"
    allowed_domains = ['www.poets.org']
    start_urls = ['https://www.poets.org/poetsorg/poems']

    # Follow any link containing '/poem' and hand each fetched page to parse_item.
    rules = (
        Rule(LinkExtractor(allow='/poem', canonicalize=True, unique=True),
             callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # Individual poem pages live under /poetsorg/poem/; skip everything else.
        if re.match('.*/poetsorg/poem/.*', response.url) is not None:
            poem = response.xpath('//*[@id="poem-content"]').get()
            filename = response.url.split("/")[-1] + '.html'
            # Write as UTF-8 up front so non-ASCII characters in the poem
            # don't raise UnicodeEncodeError on platforms with a narrower
            # default encoding (the original retried in a bare except for this).
            with open(filename, "w", encoding="utf-8") as f:
                f.write(response.url + '\n' + (poem or ''))
            print("all good")
        else:
            print("its:" + response.url)