-
Notifications
You must be signed in to change notification settings - Fork 6
/
get_topword_from_cluster.py
executable file
·84 lines (65 loc) · 2.91 KB
/
get_topword_from_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from __future__ import absolute_import
import timeit
import argparse
from os import path
import numpy as np
import io
from autoencoder.preprocessing.preprocessing import load_corpus
from autoencoder.utils.io_utils import dump_json, load_json
def get_words(args):
    """Write the most frequent words of every cluster label to disk.

    The corpus loaded from ``args.input_corpus`` supplies ``corpus['docs']``
    (filename -> {vocabulary id: count}) and ``corpus['vocab']``
    (word -> vocabulary index).  The JSON file ``args.input_label`` maps each
    filename to its label.  Counts are summed per label, the ``args.topn``
    vocabulary ids with the highest totals are kept for each label, and the
    result is written twice: as one JSON mapping (``args.output_json``) and
    as one ``label-<label>.txt`` file per label inside ``args.output_dir``
    (one word per line, UTF-8 encoded).

    Args:
        args: Object exposing ``input_corpus``, ``input_label``, ``topn``,
            ``output_json`` and ``output_dir`` attributes (the argparse
            namespace built by ``main``).

    Raises:
        KeyError: If a document has no entry in the label mapping, or a
            selected vocabulary id is missing from the vocabulary.
    """
    corpus = load_corpus(args.input_corpus)
    filename_corpus_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    # The vocabulary maps word -> index; invert it so that a word can be
    # recovered from its index when the text files are written.
    dictionary = dict((v, k) for k, v in vocab_dict.items())
    filename_label_dict = load_json(args.input_label)
    print('Finish loading data')

    # Sum, for every label, the count of each vocabulary id over all of the
    # documents that carry that label.
    label_vocab_dict = {}
    for filename, vocab_num_dict in filename_corpus_dict.items():
        label = filename_label_dict[filename]
        label_counts = label_vocab_dict.setdefault(label, {})
        for vocab, num in vocab_num_dict.items():
            label_counts[vocab] = label_counts.get(vocab, 0) + num
    print('Finish counting word frequence')

    # Count the labels that were actually seen (the previous code measured
    # the still-empty result dict and therefore always reported 0).
    label_num = len(label_vocab_dict)
    print('Label num is  %d' % label_num)

    topn = args.topn
    label_topword_dict = {}
    for label, vocab_num_dict in label_vocab_dict.items():
        # Vocabulary ids ordered by descending total count, cut to topn.
        label_topword_dict[label] = sorted(
            vocab_num_dict, key=vocab_num_dict.__getitem__, reverse=True)[:topn]
    print('Finish sorting the top n word')

    dump_json(label_topword_dict, args.output_json)
    print('Finish write the json file')

    for label, top_words in label_topword_dict.items():
        # path.join keeps the result inside output_dir whether or not the
        # directory was given with a trailing separator.
        filename_o = path.join(args.output_dir, 'label-' + str(label) + '.txt')
        print('filename = %s' % filename_o)
        # Binary mode because the lines are encoded explicitly; the context
        # manager closes the file even if a lookup or write fails.
        with open(filename_o, 'wb') as file_o:
            for word_index in top_words:
                # Ids may be stored as strings, the inverted vocabulary is
                # keyed by int.
                text = dictionary[int(word_index)] + '\n'
                file_o.write(text.encode('utf-8'))
    print('Finish writing files!')
def main():
    """Parse the command-line options and hand them to ``get_words``."""
    cli = argparse.ArgumentParser()
    # (short flag, long flag, value type, help text); every option is required.
    option_specs = (
        ('-ic', '--input_corpus', str, 'path of the input filename corpus dict'),
        ('-il', '--input_label', str, 'path of the input filename label dict'),
        ('-tn', '--topn', int, 'number of top words of a cluster'),
        ('-oj', '--output_json', str, 'path of the outpue json file'),
        ('-o', '--output_dir', str, 'dir of the output top n words file'),
    )
    for short_flag, long_flag, value_type, description in option_specs:
        cli.add_argument(short_flag, long_flag, type=value_type,
                         required=True, help=description)
    get_words(cli.parse_args())
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()