-
Notifications
You must be signed in to change notification settings - Fork 293
/
demo_japanese.py
38 lines (31 loc) · 1.58 KB
/
demo_japanese.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pandas as pd
from urllib.request import urlopen
import scattertext as st
def main():
shisei = _parse_geutenberg('http://www.gutenberg.org/files/31617/31617-0.txt')
horadanshaku = _parse_geutenberg('http://www.gutenberg.org/files/34084/34084-0.txt')
df = pd.DataFrame({'text': [shisei, horadanshaku],
'title': ['Shisei', 'Horadanshaku tabimiyage'],
'author': ['Akutagawa Ryunosuke', 'Kuni Sasaki']})
df['text'] = df['text'].apply(st.japanese_nlp)
corpus = st.CorpusFromParsedDocuments(df,
category_col='title',
parsed_col='text').build()
html = st.produce_scattertext_explorer(corpus,
category='Shisei',
category_name='Shisei',
not_category_name='Horadanshaku tabimiyage',
minimum_term_frequency=5,
width_in_pixels=1000,
metadata=df['title'] + ' by ' + df['author'],
asian_mode=True)
open('./demo_japanese.html', 'w').write(html)
print('Open ./demo_japanese.html in Chrome or Firefox.')
def _parse_geutenberg(url):
return (urlopen(url)
.read()
.decode('utf-8')
.split("Transcriber's Notes")[0]
.split('-------------------------------------------------------')[-1])
if __name__ == '__main__':
main()