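'''
Article: drive one article through the FAIDK word-learning flow.

Read an article from disk, split it into normalized words, diff them
against the known-word list, optionally run an interactive learning
session, and write out a marked copy of the article, its important
sentences, and a vocabulary list.
'''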
import logging
import re

from nltk import tokenize
from nltk.stem import WordNetLemmatizer

import safe_IO
from dictionary import *
from safe_IO import *

logger = logging.getLogger('FAIDK.Article')

class Article(object):
    def __init__(self, config, file, FLAG):
        self.name = file
        self.config = config
        self.mark = self.config['MARK']
        self.file_path = config['MAIN_PATH'] + config['ARTICLES_PATH'] + file
        self.fix_dic = load_lemmatization_list_to_dic(
            self.config['LEMMATIZATION_MODE'])
        self.known_words = self.read_known_words()
        self.article = read_article_from_file(self.file_path)
        self.words = self.split_the_article(self.article, self.file_path)
        self.new_words = self.read_new_words()
        self.num = len(self.new_words)  # mutated during learning; internal use only, do not reuse
        self.keys = self.load_keys()
        if FLAG == '1':
            self.learn()
        if FLAG == '2':
            self.finish()
        if self.config['OUT_PUT_MARKED_ARTICLE']:
            self.out_put_markded_article()
            # The next two calls depend on self.pattern and self.marked_article,
            # which out_put_markded_article creates, so they belong in this branch.
            self.out_put_important_sentences()
            self.out_put_vocabulary()
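
    # Wrap every occurrence of a new word in the article with the configured
    # mark string, matching on word boundaries, case-insensitively.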
    def out_put_markded_article(self):
        self.pattern = re.compile(
            r"(\b" + r"\b|\b".join(self.new_words) + r"\b)", flags=re.IGNORECASE)
        self.marked_article = re.sub(
            self.pattern, self.mark + r"\1" + self.mark, self.article)
        self.marked_article = re.sub(r"\n", r"\n\n", self.marked_article)
        write_marked_article_to_file("./others/", self.name, self.marked_article)
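
    # Keep only the sentences that contain at least one new word. A period is
    # appended before each newline so the NLTK sentence tokenizer treats line
    # breaks as sentence ends.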
    def out_put_important_sentences(self):
        pp_m_article = re.sub(r"\n", r".\n", self.marked_article)
        sentences = tokenize.sent_tokenize(pp_m_article)
        i_sentences = [s for s in sentences if self.pattern.search(s)]
        write_important_sentances_to_file(
            "./others/", self.name, "\n\n".join(i_sentences))
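
    # Look up every new word with the google() and eudic() helpers from
    # dictionary and write the results out as a Markdown vocabulary list.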
    def out_put_vocabulary(self):
        vocabulary = []
        num = len(self.new_words)
        for i, word in enumerate(self.new_words):
            logger.info('looking up ' + word + ' (' + str(num - i) + ' left)')
            vocabulary.append('#### ' + word + '\n\n' + google(word) + '\n\n' + eudic(word))
        vocabulary = '\n\n'.join(vocabulary)
        write_vocabulary_to_file("./others/", self.name, vocabulary)

    def load_keys(self):
        cfg = self.config
        keys = [cfg['KEY_FOR_KNOW'], cfg['KEY_FOR_NOT'], cfg['KEY_FOR_QUIT']]
        logger.debug(keys)
        return keys
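
    # Read a word file if it exists; a missing file is treated as empty
    # rather than fatal.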
    def read_old_words(self, path):
        try:
            return read_article_from_file(path)
        except (IOError, OSError):
            logger.info('missing ' + path)
            return ''
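
    # Normalization strips everything but letters and apostrophes, then
    # lemmatizes via the fix-list, NLTK's WordNetLemmatizer, or both,
    # depending on LEMMATIZATION_MODE.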
    def real_word(self, word, LEMMATIZATION_flag=True):
        '''
        Reduce a raw token to its base word.
        '''
        p_forword = re.compile("[a-zA-Z'‘]")
        word_s = p_forword.findall(word)
        real_word = ''.join(word_s)  # .lower()
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['list', 'both']:
            try:
                real_word = self.fix_dic[real_word]
            except KeyError as e:
                logger.debug(e)
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['NLTK', 'both']:
            wordnet_lemmatizer = WordNetLemmatizer()
            real_word = wordnet_lemmatizer.lemmatize(real_word)
        logger.debug(word + '-->' + real_word)
        return real_word
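
    # Known words come from OLD_WORDS_PATH. Lemmatization is skipped because
    # that file already stores normalized words (see finish()).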
    def read_known_words(self):
        '''
        Load the set of words already known.
        '''
        try:
            with open(self.config['MAIN_PATH'] + self.config['OLD_WORDS_PATH']) as f:
                all_the_words = f.read()
        except (IOError, OSError):
            logger.info("'" + self.config['MAIN_PATH'] +
                        self.config['OLD_WORDS_PATH'] + "' missing......")
            all_the_words = ""
        known_words = self.split_the_article(all_the_words, LEMMATIZATION_flag=False)
        logger.debug(known_words)
        num = len(known_words)
        logger.info('There are ' + str(num) + ' words I already know')
        return known_words
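
    # Tokenize on whitespace, basic punctuation, and the user-configured
    # SPECIAL_PUNCTUATION characters, then normalize each token.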
    def split_the_article(self, article, name=None, LEMMATIZATION_flag=True):
        '''
        Split the article into a set of normalized words.
        '''
        sep = re.compile('[ \r\n.,' + self.config['SPECIAL_PUNCTUATION'] + '　]')
        logger.debug(sep)
        words = re.split(sep, article)
        normalized = (self.real_word(word, LEMMATIZATION_flag=LEMMATIZATION_flag)
                      for word in words)
        set_of_words = set(normalized)
        if name is not None:
            logger.info('there are {} words in {}'.format(
                len(set_of_words), name))
        logger.debug(set_of_words)
        return set_of_words
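
    # New words are whatever appears in the article but not in the known set.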
    def read_new_words(self):
        '''
        Read the new words from the article.
        '''
        new_words = self.words - self.known_words
        num = len(new_words)
        if num == 0:
            logger.info('No new word')
        elif num == 1:
            logger.info('only 1 new word')
        else:
            logger.info(str(num) + ' new words')
        logger.info(new_words)
        return sorted(new_words)
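
    # Interactive loop: for each new word the user answers know / don't know /
    # quit; known words are promoted and the rest are written out by finish().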
    def learn(self):
        '''
        Learn new words and build the word lists.
        '''
        logger.info('press {} if you know the word, {} if you do not'.format(
            self.config['KEY_FOR_KNOW'], self.config['KEY_FOR_NOT']))
        for word in self.new_words:
            judge = my_input(word + ' (' + str(self.num) + ' left)', self.keys)
            if judge == self.config['KEY_FOR_QUIT']:
                self.user_exit()
                return
            if judge == self.config['KEY_FOR_KNOW']:
                self.known_words.add(word)
            self.num -= 1
        self.new_words = sorted(set(self.new_words) - self.known_words)
        if self.new_words:
            logger.info('new words ({}):'.format(len(self.new_words)))
            logger.info(self.new_words)
        self.finish()
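
    # On quit, self.num words are still unreviewed; they are saved as a new
    # 'l_'-prefixed article so a later run can pick up where this one stopped,
    # and only the words already reviewed are passed on to finish().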
    def user_exit(self):
        write_each_words(
            self.config['ARTICLES_PATH'], 'l_' + self.name,
            list(self.new_words)[-self.num:])
        logger.debug('writing left words')
        logger.debug(self.new_words[-self.num:])
        logger.debug('get new words')
        self.new_words = set(self.new_words[:-self.num]) - self.known_words
        logger.debug(self.new_words)
        self.finish()
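
    # Archive the article, then persist the word lists: one file of new words
    # per article, the full known-word list (overwritten), and a global
    # new-word list (appended).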
    def finish(self):
        CONFIG = self.config
        NEW_WORDS_EACH_ARTICLE_PATH = CONFIG['MAIN_PATH'] + \
            CONFIG['NEW_WORDS_EACH_ARTICLE_PATH']
        safe_IO.mv_file(self.file_path, CONFIG['MAIN_PATH'] +
                        CONFIG['OLD_ARTICLES_PATH'])
        safe_IO.write_each_words(
            NEW_WORDS_EACH_ARTICLE_PATH, self.name, self.new_words)
        with open(CONFIG['MAIN_PATH'] + CONFIG['OLD_WORDS_PATH'], 'w') as old:
            old.write('\n'.join(self.known_words))
        logger.debug('write new words to ' + CONFIG['MAIN_PATH'] + CONFIG['NEW_WORDS_PATH'])
        with open(CONFIG['MAIN_PATH'] + CONFIG['NEW_WORDS_PATH'], 'a') as new:
            # Trailing newline so appends from later runs do not glue the last
            # word of this run onto the first word of the next.
            new.write('\n'.join(self.new_words) + '\n')
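

# A minimal usage sketch, not part of the original file: the config keys below
# mirror the ones this class reads, but the values and this entry point are
# assumptions; the real project presumably builds `config` elsewhere.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    example_config = {
        'MAIN_PATH': './',
        'ARTICLES_PATH': 'articles/',
        'OLD_ARTICLES_PATH': 'old_articles/',
        'OLD_WORDS_PATH': 'old_words.txt',
        'NEW_WORDS_PATH': 'new_words.txt',
        'NEW_WORDS_EACH_ARTICLE_PATH': 'new_words_each_article/',
        'MARK': '**',
        'SPECIAL_PUNCTUATION': '!?;:"()',
        'LEMMATIZATION_MODE': 'both',
        'OUT_PUT_MARKED_ARTICLE': True,
        'KEY_FOR_KNOW': '1',
        'KEY_FOR_NOT': '2',
        'KEY_FOR_QUIT': 'q',
    }
    # FLAG '1' runs the interactive learning session on the given article.
    Article(example_config, 'example.txt', '1')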