-
Notifications
You must be signed in to change notification settings - Fork 10
/
7. predict.py
67 lines (58 loc) · 1.9 KB
/
7. predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import nltk, re, random
from nltk.tokenize import word_tokenize
from collections import defaultdict, deque
from document1 import training_doc1
from document2 import training_doc2
from document3 import training_doc3
class MarkovChain:
    """First-order Markov chain text generator.

    Builds a mapping ``word -> list of observed successor words`` from one
    or more documents, then generates text by starting at the first word
    seen and repeatedly sampling a random successor.
    """

    def __init__(self):
        # word -> successors observed immediately after it; duplicates are
        # kept so frequent successors are sampled proportionally more often.
        self.lookup_dict = defaultdict(list)
        self._seeded = False
        self.__seed_me()

    def __seed_me(self, rand_seed=None):
        """Seed the module-level RNG once; subsequent calls are no-ops.

        Falls back to leaving ``_seeded`` False if the platform cannot
        seed (``random.seed`` raising ``NotImplementedError``).
        """
        if self._seeded:
            return
        try:
            if rand_seed is not None:
                random.seed(rand_seed)
            else:
                random.seed()
            self._seeded = True
        except NotImplementedError:
            self._seeded = False

    def add_document(self, text):
        """Tokenize ``text`` and record every adjacent word pair.

        (Parameter renamed from ``str``, which shadowed the builtin.)
        """
        preprocessed_list = self._preprocess(text)
        for current_word, next_word in self.__generate_tuple_keys(preprocessed_list):
            self.lookup_dict[current_word].append(next_word)

    def _preprocess(self, text):
        """Lowercase ``text``, collapse non-word characters, and tokenize."""
        cleaned = re.sub(r'\W+', ' ', text).lower()
        return word_tokenize(cleaned)

    def __generate_tuple_keys(self, data):
        """Yield ``(word, next_word)`` tuples for each adjacent token pair.

        An empty or single-token list yields nothing (range is empty), so
        no explicit length guard is needed.
        """
        for i in range(len(data) - 1):
            yield data[i], data[i + 1]

    def generate_text(self, max_length=50):
        """Generate up to ``max_length`` words, starting at the first
        word ever added; stops early when a word has no known successor.
        Returns the words joined by single spaces ("" for an empty chain).
        """
        context = deque()
        output = []
        if self.lookup_dict:
            # NOTE(review): __seed_me already ran in __init__, so this call
            # is a no-op unless that initial seeding failed; kept to
            # preserve the original behavior.
            self.__seed_me(rand_seed=len(self.lookup_dict))
            # First key in insertion order, without materializing all keys.
            context.append(next(iter(self.lookup_dict)))
            while len(output) < (max_length - 1):
                next_choices = self.lookup_dict[context[-1]]
                if not next_choices:
                    break
                context.append(random.choice(next_choices))
                output.append(context.popleft())
            output.extend(context)
        return " ".join(output)
# Build the chain from the three training corpora, then emit a sample.
my_markov = MarkovChain()
for document in (training_doc1, training_doc2, training_doc3):
    my_markov.add_document(document)

generated_text = my_markov.generate_text()
print(generated_text)