forked from wesleyclzns/resumidor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
resumidor.py
31 lines (28 loc) · 1.3 KB
/
resumidor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#Código Limpo 17-04-2019
import nltk
nltk.download()
import urllib as urllib
from urllib.request import Request,urlopen
link = Request('http://ultimosegundo.ig.com.br/politica/2017-04-25/reforma-da-previdencia.html',headers={'User-Agent' :'Google Chrome/67'})
pagina = urlopen(link) .read() .decode('utf-8', 'ignore')
from bs4 import BeautifulSoup
soup = BeautifulSoup(pagina, "html.parser")
texto = soup.find(id="noticia").text
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
sentencas = sent_tokenize(texto)
palavras = word_tokenize(texto.lower())
from nltk.corpus import stopwords
from string import punctuation
stopwords = set(stopwords.words('portuguese') + list(punctuation))
palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in stopwords]
from nltk.probability import FreqDist
frequencia = FreqDist(palavras_sem_stopwords)
from collections import defaultdict
sentencas_importantes = defaultdict(int)
for i, sentenca in enumerate(sentencas):
for palavra in word_tokenize(sentenca.lower()):
if palavra in frequencia: sentencas_importantes[i] += frequencia[palavra]
from heapq import nlargest
idx_sentencas_importantes = nlargest(4, sentencas_importantes,sentencas_importantes.get)
for i in sorted(idx_sentencas_importantes):print(sentencas[i])