-
Notifications
You must be signed in to change notification settings - Fork 1
/
convert_to_scirex_json.py
47 lines (37 loc) · 1.22 KB
/
convert_to_scirex_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import argparse
import json
import spacy
nlp = spacy.load('en_core_web_sm')
import sys
import re
def _tokenize(text):
    """Return the token strings spaCy's tokenizer produces for *text*.

    Only the tokenizer is run (``nlp.make_doc``): nothing but the token
    texts is used, so running the tagger/NER components would be wasted work.
    """
    return [token.text for token in nlp.make_doc(text)]


def _convert_record(data):
    """Build the output record for one decoded input entry.

    Args:
        data: Decoded JSON object. The keys read are ``id``, ``title``,
            ``summary`` and ``published_parsed`` (indices 0 and 1 of the
            latter are used; presumably year and month -- confirm against
            the producer of the input).

    Returns:
        A dict with the keys ``doc_id``, ``words``, ``sentences``,
        ``sections`` and ``n_ary_relations``. ``words`` is the flat token
        list (title tokens first, then the summary tokens), ``sentences``
        holds half-open ``(start, end)`` offsets into ``words`` with the
        title as the first span, ``sections`` is one span covering every
        word, and ``n_ary_relations`` is always empty.
    """
    # Titles may be wrapped across lines with indentation; fold each
    # newline plus its following spaces into a single space.
    title = re.sub('\n +', ' ', data['title'])
    summary = data['summary'].replace('\n', ' ')

    words = []
    sentences = []

    def add_span(tokens):
        # Record where this span starts/ends in the flat word list.
        start = len(words)
        words.extend(tokens)
        sentences.append((start, len(words)))

    # The title is treated as the first "sentence" of the document.
    add_span(_tokenize(title))

    # Sentence boundaries need the full pipeline; each sentence's text is
    # then tokenized on its own so offsets are built span by span.
    for sent in nlp(summary).sents:
        add_span(_tokenize(sent.text))

    published = data['published_parsed']
    doc_id = f"{data['id']}/{published[0]}/{published[1]}"
    return {
        'doc_id': doc_id,
        'words': words,
        'sentences': sentences,
        'sections': [[0, len(words)]],
        'n_ary_relations': []
    }


def main():
    """Convert JSON-lines records from stdin into one JSON document per line.

    Each non-blank stdin line is decoded as a JSON object, optionally
    filtered on ``published_parsed[0]``, converted with
    ``_convert_record`` and printed to stdout as a single JSON line.

    Command-line options:
        --year: Keep only records whose ``published_parsed[0]`` equals this
            integer. When omitted, every record is converted (previously an
            omitted year compared against ``None`` and silently dropped all
            input).

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--year',
        type=int,
        help='only convert records whose published_parsed[0] equals this '
             'value; if omitted, all records are converted',
    )
    args = parser.parse_args()

    for line in sys.stdin:
        # Tolerate blank lines (e.g. a trailing newline at end of input)
        # instead of crashing inside json.loads.
        if not line.strip():
            continue
        data = json.loads(line)
        if args.year is not None and data['published_parsed'][0] != args.year:
            continue
        print(json.dumps(_convert_record(data)))
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()