# my_feature_ex.py
"""
Feature extraction routines:
- extract_tokens() -- simple set of tokens, ignoring stop words and non-word tokens
- extract_lemmatize_tokens() -- extends the above, using lemmatized tokens in place of the raw tokens
- extract_group_tokens() -- extends the above, using CMU tweet word-cluster classes in place of the raw tokens
- extract_bigrams() -- bigrams of simple tokens, using the same filtering rules
- extract_lemmatize_bigrams() -- extends the above, using lemmatized tokens in place of the raw tokens
- extract_group_bigrams() -- extends the above, using CMU tweet word-cluster classes in place of the raw tokens
- reset_features() -- clears the accumulated feature tokens on every tweet
"""
import cmu_tweet_word_clusters
import nltk
from nltk.stem import WordNetLemmatizer
stop_words = nltk.corpus.stopwords.words('english')

# Definitions of the POS tag categories can be found at:
# http://www.ark.cs.cmu.edu/TweetNLP/annot_guidelines.pdf
pos_nominal = ['N', 'O', '^', 'S', 'Z']
pos_noun = ['N', '^', 'Z']
pos_other_open_class = ['V', 'A', 'R', '!']
pos_other_closed_class = ['D', 'P', '&', 'Y', 'X']
pos_twitter = ['#', '@', '~', 'U', 'E']
pos_misc = ['$', ',', 'G']
pos_compound = ['L', 'M', 'Y']
pos_all = (pos_nominal + pos_other_open_class + pos_other_closed_class +
           pos_twitter + pos_misc + pos_compound)
# Default filter: every tag except punctuation (','), URLs ('U'), the 'G'
# catch-all, emoticons ('E') and numerals ('$').
pos_default_filter = set(pos_all).difference((',', 'U', 'G', 'E', '$'))


def filter_token(t, p, f, filter='default', exclude=[]):
    """Return True if token t (with POS tag p) passes the selected POS filter,
    is not a stop word and is not in exclude.  The third element f of the
    (token, tag, ...) triple is accepted for convenience but not used."""
    if filter == 'default':
        pos_filter = pos_default_filter
    elif filter == 'all':
        pos_filter = pos_all
    elif filter == 'nominal':
        pos_filter = pos_nominal
    elif filter == 'noun':
        pos_filter = pos_noun
    elif filter == 'twitter':
        pos_filter = pos_twitter
    elif filter == 'hashtag':
        pos_filter = ['#']
    else:
        raise ValueError(filter)
    return t.lower() not in stop_words and t.lower() not in exclude and p in pos_filter


def reset_features(tweets):
    for tweet in tweets:
        tweet['tokens'] = []


def extract_tokens(tweets, filter='default', exclude=[]):
    for tweet in tweets:
        tweet['tokens'] = tweet['tokens'] + [t.lower() for t, p, f in tweet['pos']
                                             if filter_token(t, p, f, filter=filter, exclude=exclude)]


def extract_lemmatize_tokens(tweets, filter='default', exclude=[]):
    wnl = WordNetLemmatizer()
    for tweet in tweets:
        tweet['tokens'] = tweet['tokens'] + [wnl.lemmatize(t.lower()) for t, p, f in tweet['pos']
                                             if filter_token(t, p, f, filter=filter, exclude=exclude)]


def extract_group_tokens(tweets, filter='default', exclude=[]):
    for tweet in tweets:
        tweet['tokens'] = tweet['tokens'] + [cmu_tweet_word_clusters.cmu_classify(t.lower())
                                             for t, p, f in tweet['pos']
                                             if filter_token(t, p, f, filter=filter, exclude=exclude)]


def extract_bigrams(tweets, filter='default', exclude=[]):
    for tweet in tweets:
        bigrams = zip(tweet['pos'][0:-1], tweet['pos'][1:])
        tweet['tokens'] = tweet['tokens'] + ['{}_{}'.format(pos1[0].lower(), pos2[0].lower())
                                             for pos1, pos2 in bigrams
                                             if filter_token(*pos1, filter=filter, exclude=exclude)
                                             and filter_token(*pos2, filter=filter, exclude=exclude)]


def extract_lemmatize_bigrams(tweets, filter='default', exclude=[]):
    wnl = WordNetLemmatizer()
    for tweet in tweets:
        bigrams = zip(tweet['pos'][0:-1], tweet['pos'][1:])
        tweet['tokens'] = tweet['tokens'] + ['{}_{}'.format(wnl.lemmatize(pos1[0].lower()),
                                                            wnl.lemmatize(pos2[0].lower()))
                                             for pos1, pos2 in bigrams
                                             if filter_token(*pos1, filter=filter, exclude=exclude)
                                             and filter_token(*pos2, filter=filter, exclude=exclude)]


def extract_group_bigrams(tweets, filter='default', exclude=[]):
    for tweet in tweets:
        bigrams = zip(tweet['pos'][0:-1], tweet['pos'][1:])
        tweet['tokens'] = tweet['tokens'] + ['{}_{}'.format(cmu_tweet_word_clusters.cmu_classify(pos1[0].lower()),
                                                            cmu_tweet_word_clusters.cmu_classify(pos2[0].lower()))
                                             for pos1, pos2 in bigrams
                                             if filter_token(*pos1, filter=filter, exclude=exclude)
                                             and filter_token(*pos2, filter=filter, exclude=exclude)]
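

# ---------------------------------------------------------------------------
# Minimal usage sketch, illustrative only and not part of the extraction API.
# It assumes each tweet is a dict whose 'pos' entry holds (token, tag,
# confidence) triples in the style of the CMU ARK Twitter tagger output, and
# that the NLTK 'stopwords' and 'wordnet' corpora are already downloaded.
# Running it also requires the cmu_tweet_word_clusters helper imported above
# to be on the path, even though the group extractors are not exercised here.
# The tweets below are made-up examples.
if __name__ == '__main__':
    tweets = [
        {'pos': [('I', 'O', 0.99), ('love', 'V', 0.98), ('#python', '#', 0.99)]},
        {'pos': [('the', 'D', 0.99), ('weather', 'N', 0.98), ('is', 'V', 0.99),
                 ('nice', 'A', 0.97), (':)', 'E', 0.99)]},
    ]
    reset_features(tweets)                 # start every tweet with an empty feature list
    extract_tokens(tweets)                 # unigram features under the default POS filter
    extract_bigrams(tweets, filter='all')  # append bigram features to the same 'tokens' list
    for tweet in tweets:
        print(tweet['tokens'])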