preprocessing.py
# If spacy.load(lang) doesn't work, run spacy_ext.sh first.
import pickle
import re

import spacy
from pymystem3 import Mystem


def clean_twitter_data(texts):
    # Strip @mentions and URLs from each tweet.
    return [re.sub(r"(?:@|https?://)\S+", "", text) for text in texts]
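# Illustrative example (not part of the original source):
#   clean_twitter_data(["@user check https://t.co/x now"]) -> [" check  now"]
# Mentions and links are removed; the surrounding whitespace is left as-is.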


def get_data(what):
    # Note: 'dicts' passes the assert but is not handled below,
    # so the function implicitly returns None for it.
    assert what in ['ted', 'twitter', 'dicts']
    if what == 'ted':
        with open("./data/ted/ted_parallel_en.pkl", "rb") as f:
            data_en = pickle.load(f)
        data_en = [text.replace("\n", " ") for text in data_en]
        with open("./data/ted/ted_parallel_fr.pkl", "rb") as f:
            data_fr = pickle.load(f)
        data_fr = [text.replace("\n", " ") for text in data_fr]
        with open("./data/ted/ted_parallel_ru.pkl", "rb") as f:
            data_ru = pickle.load(f)
        data_ru = [text.replace("\n", " ") for text in data_ru]
        return data_en, data_fr, data_ru
    if what == 'twitter':
        with open("./data/twitter/greenpeaceusa.pkl", "rb") as f:
            data_en = pickle.load(f)
        data_en = [pair[1].replace("\n", " ") for pair in data_en]
        data_en = clean_twitter_data(data_en)
        with open("./data/twitter/greenpeacefr.pkl", "rb") as f:
            data_fr = pickle.load(f)
        data_fr = [pair[1].replace("\n", " ") for pair in data_fr]
        data_fr = clean_twitter_data(data_fr)
        with open("./data/twitter/greenpeaceru.pkl", "rb") as f:
            data_ru = pickle.load(f)
        data_ru = [pair[1].replace("\n", " ") for pair in data_ru]
        data_ru = clean_twitter_data(data_ru)
        return data_en, data_fr, data_ru
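# Assumed pickle layout (inferred from the indexing above, not verified):
#   ted_parallel_*.pkl -> list of strings, one transcript per element
#   greenpeace*.pkl    -> list of pairs whose second element is the tweet text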


def lemmatize(texts, lang):
    assert lang in ['fr', 'en', 'ru']
    if lang == 'ru':
        # pymystem3's Mystem handles Russian; spaCy covers English and French.
        m = Mystem()
        texts_lem = ["".join(m.lemmatize(text)) for text in texts]
    else:
        nlp = spacy.load(lang)
        texts_lem = [" ".join([w.lemma_ for w in nlp(text)]) for text in texts]
    texts_lem = [text.replace("\n", " ") for text in texts_lem]
    return texts_lem
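# Rough sketch of the expected output (model-dependent, shown only as an
# assumption, not taken from the original source):
#   lemmatize(["The cats were running"], lang="en") -> ["the cat be run"]
# Exact lemmas vary with the installed spaCy model and version.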


def preprocess_texts():
    ted_en, ted_fr, ted_ru = get_data(what="ted")
    twitter_en, twitter_fr, twitter_ru = get_data(what="twitter")
    print("Loaded files")
    ted_en = lemmatize(ted_en, lang="en")
    ted_fr = lemmatize(ted_fr, lang="fr")
    ted_ru = lemmatize(ted_ru, lang="ru")
    print("Lemmatized ted data")
    twitter_en = lemmatize(twitter_en, lang="en")
    twitter_fr = lemmatize(twitter_fr, lang="fr")
    twitter_ru = lemmatize(twitter_ru, lang="ru")
    print("Lemmatized twitter data")
    with open("./data/preprocessed/ted/ted_en.pkl", 'wb') as f:
        pickle.dump(ted_en, f)
    with open("./data/preprocessed/ted/ted_fr.pkl", 'wb') as f:
        pickle.dump(ted_fr, f)
    with open("./data/preprocessed/ted/ted_ru.pkl", 'wb') as f:
        pickle.dump(ted_ru, f)
    with open("./data/preprocessed/twitter/twitter_en.pkl", 'wb') as f:
        pickle.dump(twitter_en, f)
    with open("./data/preprocessed/twitter/twitter_fr.pkl", 'wb') as f:
        pickle.dump(twitter_fr, f)
    with open("./data/preprocessed/twitter/twitter_ru.pkl", 'wb') as f:
        pickle.dump(twitter_ru, f)
    print("Saved all the preprocessed files to the ./data/preprocessed dir")


if __name__ == "__main__":
    preprocess_texts()