-
Notifications
You must be signed in to change notification settings - Fork 0
/
laod.py
41 lines (32 loc) · 1.2 KB
/
laod.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
from bs4 import BeautifulSoup
import re
def laod_news(first_page=1, last_page=22, timeout=30):
    """Crawl the laod.cn news listing pages and parse every linked article.

    For each listing page ``https://laod.cn/news/page/<n>`` the function
    collects the ``<article>`` elements carrying the CSS class ``wow``,
    takes the first ``<a>`` inside each one, and passes its ``href`` to
    ``contentparse``.

    Args:
        first_page: Number of the first listing page to fetch.
        last_page: Number of the last listing page to fetch (inclusive).
            The defaults reproduce the original fixed range of pages 1-22.
        timeout: Seconds ``requests.get`` may wait on a listing page before
            raising, so a stalled connection cannot hang the crawl forever.

    Returns:
        list: The dicts returned by ``contentparse``, in the order the
        articles were encountered.
    """
    content = []
    for page in range(first_page, last_page + 1):
        listing_url = 'https://laod.cn/news/page/' + str(page)
        res = requests.get(listing_url, timeout=timeout)
        soup = BeautifulSoup(res.text, "lxml")
        for article in soup.find_all('article', class_='wow'):
            link = article.find('a')
            # find() yields None when the teaser has no anchor; such an entry
            # (or an anchor without href) has nothing to follow, so skip it
            # instead of crashing the whole crawl.
            if link is None or not link.get('href'):
                continue
            content.append(contentparse(link['href']))
    return content
def contentparse(url, timeout=30):
    """Download one article page and extract its title, date and body HTML.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds ``requests.get`` may wait before raising, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        dict: ``title`` is the text of ``<h1 class="entry-title">``;
        ``date`` is the first ``YYYY-MM-DD`` string found in the text of
        ``<ul class="spostinfo">``; ``content`` is the markup of
        ``<div class="single-content">`` with anchor tags removed and the
        text ``single-content`` replaced by ``entry``; ``src`` is *url*;
        ``mid`` is the constant string ``'5'`` (its meaning is not visible
        in this file).

    Raises:
        AttributeError: If the title or post-info element is missing, or
            if no date is found in the post-info text.
    """
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.text, 'lxml')

    title = soup.find('h1', attrs={'class': 'entry-title'}).getText()
    post_info = soup.find('ul', attrs={'class': 'spostinfo'}).getText()
    date = re.search(r'([0-9]{4})-([0-9]{2})-([0-9]{2})', post_info).group()

    # Progress output: one line per article fetched.
    print(url)

    body_html = str(soup.find('div', attrs={'class': 'single-content'}))
    body_html = (
        body_html.replace('>\n', '>')
        .replace('single-content', 'entry')
        .replace('</a>', '')
    )
    # \b restricts the match to real <a ...> tags; a bare '<a(.*?)>' would
    # also eat the opening tags of <abbr>, <article>, <aside>, <audio>, ...
    # and [^>]* (unlike '.') also covers attributes that span a line break.
    content = re.sub(r'<a\b[^>]*>', '', body_html)

    return {
        'title': title,
        'date': date,
        'content': content,
        'src': url,
        'mid': '5',
    }
if __name__ == '__main__':
    # Script entry point: run the full crawl. The list returned by
    # laod_news() is not used here.
    laod_news()