# Implement linguistic analyses using spacy
# Run them on data/preprocessed/train/sentences.txt
import spacy
import string
import pandas as pd
from collections import Counter
nlp = spacy.load("en_core_web_sm")
with open("data/preprocessed/train/sentences.txt", encoding="utf-8") as f:
    text = f.read()
doc = nlp(text)
num_sentences = len(list(doc.sents))
print(num_sentences)
# 1. Tokenization (1 point)
# Process the dataset using the spaCy package and extract the following information:
token_list = []
for token in doc:
    if token.text not in ["\n"]:
        token_list.append(token.text)
# Number of tokens:
token_length = len(token_list)
print(token_length)
# Number of types:
types_length = len(set(token_list))
print(types_length)
# Number of words:
# we cross-checked this against spaCy's own punctuation removal (see the sketch below)
punctuations = list(string.punctuation)
words = [x for x in token_list if x not in punctuations]
words_length = len(words)
print(words_length)
# Average number of words per sentence:
av_no_words_sentence = words_length / num_sentences
print(av_no_words_sentence)
# Average word length:
average_word_length = sum(len(word) for word in words) / words_length
print(average_word_length)
# Provide the definition that you used to determine words:
"""
A word is any token that is not a punctuation symbol from string.punctuation and is
not a newline. We do not lowercase the text, so capitalized and lowercase forms of
the same word count as separate types.
"""
# 2. Word Classes (1.5 points)
# Run the default part-of-speech tagger on the dataset and identify the ten most frequent
# POS tags. Complete the table below for these ten tags (the tagger in the model
# en_core_web_sm is trained on the Penn Treebank tagset).
# first part: frequency and relative frequency of each POS tag
tmp_list = []
for token in doc:
    if token.text not in ["\n"] and token.text not in punctuations:
        tmp_list.append({"token": token.text, "pos": token.pos_})
pos_df = pd.DataFrame(tmp_list)
freq_df = pos_df[['pos']].value_counts().reset_index()
freq_df.columns = ["pos", "freq"]
token_freq_df = pos_df[['pos', 'token']].value_counts().reset_index()
total_num = freq_df['freq'].sum()
freq_df['relative'] = (freq_df['freq'] / total_num * 100).round(2)
# second part: three most frequent and one infrequent token per tag
tags = ['NOUN', 'PROPN', 'VERB', 'ADP', 'DET', 'ADJ', 'PRON', 'AUX', 'ADV', 'NUM']
tags_data = []
for tag in tags:
    tmp_data = token_freq_df[token_freq_df['pos'] == tag]
    three_frequent = list(tmp_data['token'].iloc[0:3])
    one_infrequent = tmp_data['token'].iloc[-1]
    tags_data.append({"pos": tag, "freq": three_frequent, "infreq": one_infrequent})
final_tokens = pd.DataFrame(tags_data)
print(final_tokens)
print("DONE")
# 3. N-Grams (1.5 points)
# Calculate the distribution of n-grams and provide the 3 most frequent n-grams.
def get_n_grams(sentences, n=2):
    total_list = []
    for sent in sentences:
        sent = str(sent).split()
        words_zip = zip(*[sent[i:] for i in range(n)])
        n_grams_list = list(words_zip)
        total_list.append(n_grams_list)
    final = [item for sublist in total_list for item in sublist]
    return final
def get_most_freq(grams_list):
    count_freq = {}
    for item in grams_list:
        if item in count_freq:
            count_freq[item] += 1
        else:
            count_freq[item] = 1
    sorted_grams = sorted(count_freq.items(), key=lambda item: item[1], reverse=True)
    return sorted_grams
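# Alternative sketch: the Counter imported at the top of the file does the same
# counting in one call; get_most_freq_counter is an illustrative helper added
# here, not part of the original code.
def get_most_freq_counter(grams_list, k=3):
    return Counter(grams_list).most_common(k)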
input_sentences = list(doc.sents)
# Token bigrams:
tokens_bigrams = get_n_grams(input_sentences, 2)
final_bigrams = get_most_freq(tokens_bigrams)
print("token bigrams", final_bigrams[0], final_bigrams[1], final_bigrams[2])
# Token trigrams:
tokens_trigrams = get_n_grams(input_sentences, 3)
final_trigrams = get_most_freq(tokens_trigrams)
print("token trigrams", final_trigrams[0], final_trigrams[1], final_trigrams[2])
# 5. Named Entity Recognition (1 point)
# Number of named entities:
# Number of different entity labels:
# Analyze the named entities in the first five sentences. Are they identified correctly? If not,
# explain your answer and propose a better decision.
ner_labels = []
for ent in doc.ents:
    ner_labels.append(ent.label_)
print("DONE")
# number of named entities
print(len(ner_labels))
# number of different entity labels
print(len(set(ner_labels)))
# first 5 sentences
print(input_sentences[:5])
# collect the entity texts and labels for each of the first five sentences
tmp_list = []
for sent in input_sentences[:5]:
    tmp_sent = nlp(str(sent)).ents
    tmp_ner = []
    tmp_tokens = []
    for k in tmp_sent:
        tmp_ner.append(k.label_)
        tmp_tokens.append(k.text)
    tmp_list.append({"sent": sent, "tokens": tmp_tokens, "ner": tmp_ner})
final_ner = pd.DataFrame(tmp_list)
print(final_ner)
print("DONE")