-
Notifications
You must be signed in to change notification settings - Fork 15
/
load_data.py
28 lines (23 loc) · 880 Bytes
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import numpy as np
import random
import math
# Alphabet understood by the loader: separator (space), padding ("_"), "$",
# digits, lowercase letters and a few punctuation marks.
chars = " _$0123456789abcdefghijklmnopqrstuvwxyz'&."
# Lookup table from each character to its position in ``chars``.
char_map = dict(zip(chars, range(len(chars))))
def load_data(filename, batch_size=100):
    """Load a word list and encode it as flat character-index arrays.

    Every line of *filename* is stripped and treated as one word.  The words
    are shuffled with ``random.shuffle`` and each word ``w`` is expanded into
    two equally long strings: the input ``w + "_" * len(w) + " "`` and the
    target ``w + w + " "``.  Both are mapped through the module-level
    ``char_map`` and concatenated into 1-D ``int8`` arrays.

    Args:
        filename: Path of a text file containing one word per line.
        batch_size: Maximum number of words per batch; must be positive.

    Returns:
        A tuple ``(X_wordnum, Y_wordnum, start_ends)``.  The first two are
        1-D ``numpy.int8`` arrays of identical length.  ``start_ends`` is a
        list with one ``(start, end)`` pair per batch of words, giving the
        half-open span that batch occupies in those arrays.

    Raises:
        ValueError: If *batch_size* is not positive.
        KeyError: If a word contains a character that is not in ``char_map``.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))

    # Use a context manager so the file handle is closed deterministically.
    with open(filename) as handle:
        words = [line.strip() for line in handle]
    random.shuffle(words)

    X_words = ["%s%s " % (w, "_" * len(w)) for w in words]
    Y_words = ["%s%s " % (w, w) for w in words]
    X_wordnum = np.array([char_map[c] for w in X_words for c in w],
                         dtype=np.int8)
    Y_wordnum = np.array([char_map[c] for w in Y_words for c in w],
                         dtype=np.int8)

    # Spans are measured on the *encoded* strings (2 * len(w) + 1 characters
    # per word), because that is what the returned arrays contain.  Summing
    # the raw word lengths would produce offsets that do not line up with
    # the arrays.  X_words and Y_words have identical per-word lengths, so
    # one set of spans serves both.
    start_ends = []
    start = 0
    for first in range(0, len(X_words), batch_size):
        end = start + sum(len(w) for w in X_words[first:first + batch_size])
        start_ends.append((start, end))
        start = end
    return X_wordnum, Y_wordnum, start_ends
if __name__ == '__main__':
    # Smoke test: load the file named 'words' from the current directory and
    # dump the result.  The parenthesised single-argument form prints the
    # same text under the Python 2 print statement and the Python 3 print
    # function.
    print(load_data('words'))