-
Notifications
You must be signed in to change notification settings - Fork 0
/
FrequencyCounterGoogle.java
145 lines (120 loc) · 4.58 KB
/
FrequencyCounterGoogle.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*
* Class for reading frequency counts from the Google NGram Corpus.
*
* Copyright (C) 2013 Lisa Vitolo <lisavitolo90@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the Creative Commons
* Attribution-NonCommercial-ShareAlike 3.0 license.
* You should have received a copy of the license with this product.
* Otherwise, visit http://creativecommons.org/licenses/by-nc-sa/3.0/
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
 * Frequency counter backed by the Google NGram Corpus included in the release.
 *
 * <p>Used when we don't know the words we are dealing with. See the report for a
 * discussion on how the corpus was used and preprocessed.
 *
 * <p>Corpus files are resolved relative to {@code Constants.getGoogleCorpusFolder()}:
 * one-letter words are looked up in the {@code oneLetter} file, every other word in
 * {@code initial/initials}, where {@code initial} is the first letter of the token and
 * {@code initials} its first letter plus the following character. Each line is read as
 * {@code token_TAG<TAB>count}.
 *
 * <p>Results of completed file scans (including "not found in the file") are cached
 * per instance in a plain {@link HashMap}.
 */
public class FrequencyCounterGoogle implements FrequencyCounter
{
    /* Special corpus file for one-letter words */
    private static final String ONE_LETTER_FILE = "oneLetter";

    /* Taken from the corpus "totals" file */
    private static final BigInteger TOTAL_COUNT = new BigInteger("468491999592");

    /* Maps "token_TAG" keys to the count returned by a completed file scan */
    private final Map<String, BigInteger> cache;

    /** Creates a counter with an empty lookup cache. */
    public FrequencyCounterGoogle()
    {
        this.cache = new HashMap<>();
    }

    /**
     * Returns how many times {@code token}, tagged as {@code tag}, occurs in the corpus.
     *
     * @param token the word to look up; it is lower-cased (locale-independently) before the search
     * @param tag   the caller's part-of-speech tag; it is translated to the corpus tagset first
     * @return the count read from the corpus, or zero when the token contains no letter,
     *         the corpus file cannot be read, or no matching entry exists
     * @throws NumberFormatException if the matching corpus entry has a non-numeric count
     */
    @Override
    public BigInteger getFrequencyCount(String token, String tag)
    {
        /* Locale.ROOT keeps the lower-casing stable regardless of the default locale */
        token = token.toLowerCase(Locale.ROOT);
        String translatedTag = translateTag(tag);
        String word = token + "_" + translatedTag;

        BigInteger cached = cache.get(word);
        if (cached != null) {
            return cached;
        }

        String filename = corpusFileFor(token);
        if (filename == null) {
            /* Empty token, or nothing but punctuation/digits: no file to look into */
            return BigInteger.ZERO;
        }

        BigInteger frequencyCount;
        try {
            frequencyCount = searchFile(Constants.getGoogleCorpusFolder() + filename, token, translatedTag);
        } catch (IOException e) {
            /*
             * NOTE: it's expected that we get here for "file not found" errors. This is because the
             * corpus was reduced in order to contain only words present in an online English dictionary.
             * So initials with non-English letters or numbers in them don't correspond to any file.
             * The result is deliberately not cached, as in the original implementation.
             */
            return BigInteger.ZERO;
        }

        cache.put(word, frequencyCount);
        return frequencyCount;
    }

    /**
     * Returns the total count for the whole corpus.
     *
     * @return the constant total taken from the corpus "totals" file
     */
    @Override
    public BigInteger getTotalCount()
    {
        return TOTAL_COUNT;
    }

    /*
     * Picks the corpus file (relative to the corpus folder) that may contain the token.
     * When considering initials, leading non-letter characters (e.g. punctuation) are skipped.
     * Returns null when the token contains no letter at all, which includes the empty token.
     */
    private static String corpusFileFor(String token)
    {
        int initialIndex = 0;
        /* Bounds are checked before charAt() so that an empty token cannot overrun */
        while (initialIndex < token.length() && !Character.isLetter(token.charAt(initialIndex))) {
            initialIndex++;
        }
        if (initialIndex == token.length()) {
            return null;
        }

        if (token.length() - initialIndex == 1) {
            return ONE_LETTER_FILE;
        }

        /* Take the initial to know the directory, and two initials to know the file */
        String initial = token.substring(initialIndex, initialIndex + 1);
        String initials = token.substring(initialIndex, initialIndex + 2);
        return initial + "/" + initials;
    }

    /*
     * Scans one corpus file for the "token_TAG" entry and returns its count, or zero if absent.
     * The reader is closed on every exit path, including exceptions.
     */
    private static BigInteger searchFile(String filePath, String token, String translatedTag)
            throws IOException
    {
        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            String line;
            /* Here we need linear search, since the files are not sorted lexicographically */
            while ((line = br.readLine()) != null) {
                String[] fields = line.split("\t");
                if (fields.length < 2) {
                    /* No count column on this line: it cannot provide a result */
                    continue;
                }

                /* Split entry into token and tag */
                String[] w = fields[0].split("_");
                if (w.length == 2 && w[0].equals(token) && w[1].equals(translatedTag)) {
                    return new BigInteger(fields[1]);
                }
            }
        }

        /* Nothing was found */
        return BigInteger.ZERO;
    }

    /*
     * Translates the caller's tag into the corpus tagset.
     * The corpus uses a quite particular tagset, listed at http://books.google.com/ngrams/info
     * Any tag not listed below is mapped to "PRT".
     */
    private String translateTag(String tag)
    {
        switch (tag) {
            case "V":
                return "VERB";
            case "D":
                return "DET";
            case "&":
                return "CONJ";
            case "P":
                return "ADP";
            case "A":
                return "ADJ";
            case "R":
            case "X":
                return "ADV";
            case "O":
                return "PRON";
            case "N":
            case "S":
            case "Z":
            case "M":
                return "NOUN";
            default:
                return "PRT";
        }
    }
}