-
Notifications
You must be signed in to change notification settings - Fork 1
/
aggregate_mentions.py
72 lines (54 loc) · 2.19 KB
/
aggregate_mentions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import json
import sys
from collections import defaultdict, Counter
from scipy.stats import chisquare
# Aggregation modes accepted as the first command-line argument.
_MODES = ('yearly', 'yearly-test', 'monthly')

_USAGE = 'usage: aggregate_mentions.py {yearly|yearly-test|monthly}  (JSON lines on stdin)'


def _paper_key(doc_id, mode):
    """Return the bucket a paper is counted under.

    The last two '/'-separated components of *doc_id* are read as year and
    month.  In 'monthly' mode the bucket is 'YYYY/MM' (month zero-padded to
    two digits); in the other modes it is the year string.
    """
    year, month = doc_id.split('/')[-2:]
    if mode == 'monthly':
        return f'{year}/{int(month):02d}'
    return year


def _aggregate(lines, mode):
    """Read JSON records from *lines* and tally mentions and papers.

    Each non-blank line must be a JSON object with a 'doc_id' string and a
    'mentions' mapping of mention type -> mentions (anything accepted by
    ``Counter.update``).

    Returns ``(mention_counts, num_papers)``: ``mention_counts`` maps a key
    to a Counter of mentions, where the key is ``(year, mention_type)`` in
    'yearly' mode and the paper bucket otherwise; ``num_papers`` counts
    input records per paper bucket.
    """
    mention_counts = defaultdict(Counter)
    num_papers = Counter()
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)
        data = json.loads(line)
        bucket = _paper_key(data['doc_id'], mode)
        # Count the paper before looking at its mentions so that a record
        # with an empty 'mentions' mapping is still attributed to its own
        # bucket instead of depending on a key left over from the inner loop.
        num_papers[bucket] += 1
        for mention_type, mentions in data['mentions'].items():
            key = (bucket, mention_type) if mode == 'yearly' else bucket
            mention_counts[key].update(mentions)
    return mention_counts, num_papers


def _report_yearly(mention_counts):
    """Print each (year, type) key followed by its 100 most common mentions."""
    for key, counts in mention_counts.items():
        print(key)
        for mention, count in counts.most_common(100):
            print('\t', mention, count)


def _report_yearly_test(mention_counts, num_papers):
    """Print a 2019-vs-2020 chi-square comparison for the top 2020 mentions.

    Expected counts split each mention's combined 2019+2020 count in
    proportion to the number of papers per year.  Rows with p < .01 are
    tagged 'up**' or 'down**' according to whether the 2020 count is above
    or below its expectation.
    """
    total_2019 = num_papers['2019']
    total_2020 = num_papers['2020']
    total = total_2019 + total_2020
    print('total', total_2019, total_2020)
    # The loop body only runs when some 2020 record contributed mentions,
    # so total_2020 (and therefore total) is non-zero inside it.
    for mention, obs2 in mention_counts['2020'].most_common(100):
        obs1 = mention_counts['2019'][mention]
        combined = obs1 + obs2
        exp1 = combined * (total_2019 / total)
        exp2 = combined * (total_2020 / total)
        _, p = chisquare([obs1, obs2], [exp1, exp2])
        if p < .01 and obs2 > exp2:
            sign = 'up**'
        elif p < .01 and obs2 < exp2:
            sign = 'down**'
        else:
            sign = ''
        print(f'{mention}\t{obs1}\t{obs2}\t{exp1:.0f}\t{exp2:.0f}\t{p:4.3f}\t{sign}')


def _report_monthly(mention_counts, num_papers):
    """Print per-month mention rates for the 100 most common 2020 mentions.

    The first row lists the number of papers per month (sorted by 'YYYY/MM'
    key); every following row gives, for one mention, 100 * count / papers
    for each of those months.
    """
    months = sorted(num_papers)
    total_2020_counter = Counter()
    for key, counts in mention_counts.items():
        if key.startswith('2020'):
            total_2020_counter.update(counts)
    print('total', ','.join(str(num_papers[month]) for month in months))
    for mention, _ in total_2020_counter.most_common(100):
        rates = (
            '{:3.2f}'.format(100. * mention_counts[month][mention] / num_papers[month])
            for month in months
        )
        print(mention, ','.join(rates))


def main():
    """Aggregate mention counts from JSON lines on stdin and print a report.

    The first command-line argument selects the mode:

    * 'yearly'      -- top mentions per (year, mention type);
    * 'yearly-test' -- chi-square test of 2019 vs 2020 mention frequencies;
    * 'monthly'     -- per-month rates of the top 2020 mentions.

    Exits with a usage message when the mode is missing or unknown.
    """
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if len(sys.argv) < 2 or sys.argv[1] not in _MODES:
        sys.exit(_USAGE)
    mode = sys.argv[1]

    mention_counts, num_papers = _aggregate(sys.stdin, mode)

    if mode == 'yearly':
        _report_yearly(mention_counts)
    elif mode == 'yearly-test':
        _report_yearly_test(mention_counts, num_papers)
    else:
        _report_monthly(mention_counts, num_papers)


if __name__ == '__main__':
    main()