-
Notifications
You must be signed in to change notification settings - Fork 0
/
TextHelper.cs
177 lines (141 loc) · 5.88 KB
/
TextHelper.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
namespace GameOCRTTS
{
internal static class TextHelper
{
/// <summary>
/// Normalises punctuation spacing in <paramref name="text"/>, splits it on single spaces and
/// keeps only the tokens that pass the garbage heuristics below and <c>IsValidWord</c>.
/// </summary>
/// <param name="text">Text to clean; may be null or empty.</param>
/// <returns>
/// The surviving tokens (after <c>FixBrokenWord</c>), each followed by one space, so a non-empty
/// result ends with a space. An empty string when <paramref name="text"/> is null or empty.
/// </returns>
internal static string RemoveGarbageText(string text)
{
    if (string.IsNullOrEmpty(text))
        return "";

    Regex nonWordChars = new Regex("[^a-zA-Z0-9']");
    Regex nonDigits = new Regex("[^0-9]");

    // Make sure sentence punctuation is followed by a space so it ends a token, then glue
    // ellipses back together. The ". ." -> ".." replacement runs twice because one pass
    // only re-joins the first two dots of a three-dot ellipsis.
    string parsedtext = text
        .Replace("..", " ..")
        .Replace("?", "? ")
        .Replace("!", "! ")
        .Replace(".", ". ")
        .Replace(". .", "..")
        .Replace(". .", "..")
        .Replace("' Il", "'ll")
        .Replace("'Il", "'ll")
        .Replace(" ", " ");

    string[] words = parsedtext.Split(' ');
    int lastPosition = words.Length;
    var result = new StringBuilder();

    for (int i = 0; i < words.Length; i++)
    {
        string word = words[i];
        if (string.IsNullOrEmpty(word))
            continue;

        // 1-based position among ALL tokens. Empty tokens are counted too, so a leading or
        // trailing empty token means no real word is treated as first/last.
        int position = i + 1;
        bool isFirst = position == 1;
        bool isLast = position == lastPosition;

        // Invariant lower-casing: with a culture-sensitive ToLower() a Turkish UI culture maps
        // "I" to dotless "ı", which then fails the == "i" checks and drops the pronoun.
        string strip = nonWordChars.Replace(word, "").ToLowerInvariant();
        string digits = nonDigits.Replace(word, "");
        bool onlyNumbers = digits.Length > 0 && digits.Length == strip.Length;
        bool isSingleLetterWord = strip == "i" || strip == "a";

        // Unlikely that a text block starts or ends with a number.
        if (onlyNumbers && (isFirst || isLast))
            continue;

        // A bare "I" or "a" is probably never the last word.
        if (isSingleLetterWord && word.Length == 1 && isLast)
            continue;

        // Any other single character that is not a digit is treated as noise.
        if (strip.Length == 1 && !onlyNumbers && !isSingleLetterWord)
            continue;

        if (!IsValidWord(word))
            continue;

        result.Append(FixBrokenWord(word)).Append(' ');
    }

    return result.ToString();
}
/// <summary>
/// Rewrites a few specific tokens: exactly <c>"Il"</c> becomes <c>"I'll"</c>, and a token that
/// begins with <c>"c'mon"</c> (in any letter case) is returned lower-cased with every
/// <c>"c'mon"</c> replaced by <c>"come on"</c>.
/// </summary>
/// <param name="word">Token to inspect; null or empty is returned unchanged.</param>
/// <returns>The rewritten token, or <paramref name="word"/> itself when no rule applies.</returns>
private static string FixBrokenWord(string word)
{
    if (string.IsNullOrEmpty(word))
        return word;

    if (word == "Il")
        return "I'll";

    // Lower-case once, independent of the current culture, and match the prefix ordinally so
    // the result does not vary with the machine's regional settings.
    string lowered = word.ToLowerInvariant();
    if (lowered.StartsWith("c'mon", StringComparison.Ordinal))
        return lowered.Replace("c'mon", "come on");

    return word;
}
/// <summary>
/// Filters <paramref name="block"/> in place: removes words that are empty once special
/// characters are stripped or that <c>IsValidWord</c> rejects, then removes lines that are
/// left with no words or with a single word whose text is at most two characters long.
/// </summary>
/// <param name="block">The block whose lines and words are filtered; it is modified directly.</param>
/// <returns><paramref name="block"/>, after filtering.</returns>
internal static TextBlock ProcessTextBlock(TextBlock block)
{
    Regex specialChars = new Regex("[^a-zA-Z0-9.,'?!: -]");

    foreach (var line in block.Lines)
    {
        // One pass instead of two. The null/empty test comes first so a word without content is
        // simply removed instead of making Regex.Replace throw on a null input.
        line.Words.RemoveAll(x =>
            string.IsNullOrEmpty(x.Content)
            || specialChars.Replace(x.Content, "").Trim().Length == 0
            || !IsValidWord(x.Content));
    }

    // NOTE: a word-height outlier filter was sketched here but was disabled; the list of
    // surviving words that only it needed is no longer built.

    // Same order as before: the short single-word lines go first, then the empty lines.
    block.Lines.RemoveAll(x => x.WordsInLine == 1 && x.Text.Length <= 2);
    block.Lines.RemoveAll(x => x.WordsInLine == 0);

    return block;
}
// Built once and shared: IsValidWord runs for every token, so constructing four Regex
// objects on each call was avoidable overhead.
private static readonly Regex s_nonWordChars = new Regex("[^a-zA-Z0-9']");
private static readonly Regex s_nonCapitalsOrDigits = new Regex("[^A-Z0-9]");
private static readonly Regex s_nonDigits = new Regex("[^0-9]");
private static readonly Regex s_nonVowels = new Regex("[^aeiouy]");

/// <summary>
/// Heuristically decides whether a token looks like a real word or number rather than garbage.
/// </summary>
/// <param name="word">Token to check; null or empty is never valid.</param>
/// <returns>
/// <c>true</c> when the token starts with <c>".."</c> or passes every rule below;
/// otherwise <c>false</c>.
/// </returns>
private static bool IsValidWord(string word)
{
    if (string.IsNullOrEmpty(word))
        return false;

    // Anything that starts with ".." is accepted without further checks.
    if (word.StartsWith("..", StringComparison.Ordinal))
        return true;

    // Letters, digits and apostrophes only, lower-cased independently of the current culture
    // (a Turkish culture would turn "I" into dotless "ı" and break the checks below).
    string strip = s_nonWordChars.Replace(word, "").ToLowerInvariant();
    if (strip.Length == 0)
        return false;

    if (strip.Length == 1)
    {
        // The only accepted one-character words are "i" and "a".
        // NOTE(review): this also rejects single-digit numbers such as "5", although the
        // number checks elsewhere exempt digits - confirm whether that is intended.
        if (strip != "i" && strip != "a")
            return false;

        // One usable character buried in three or more characters of noise.
        if (word.Length >= 3)
            return false;
    }

    string digits = s_nonDigits.Replace(word, "");
    bool onlyNumbers = digits.Length > 0 && digits.Length == strip.Length;

    // Digits mixed with letters or apostrophes.
    if (digits.Length > 0 && !onlyNumbers)
        return false;

    // Several capitals mixed with non-capitals is probably not a valid word. Apostrophes are
    // left out of the count so all-caps contractions such as "DON'T" are not mistaken for
    // mixed case.
    string capitals = s_nonCapitalsOrDigits.Replace(word, "");
    int lettersAndDigits = strip.Replace("'", "").Length;
    if (capitals.Length > 1 && capitals.Length < lettersAndDigits)
        return false;

    // Short words (2-5 characters) made of only vowels or only non-vowels are rejected unless
    // they are on one of the two allow-lists.
    bool isShort = strip.Length > 1 && strip.Length <= 5;
    if (isShort && !onlyNumbers && !ValidConsonantWord(strip) && !ValidVowelWord(strip))
    {
        int vowels = s_nonVowels.Replace(strip, "").Length;
        if (vowels == 0 || vowels == strip.Length)
            return false;
    }

    return true;
}
/// <summary>
/// Tells whether <paramref name="word"/> is, ignoring case, one of the allow-listed words
/// "you", "eye", "i", "yay" or "a".
/// </summary>
/// <param name="word">Word to look up; null yields <c>false</c>.</param>
/// <returns><c>true</c> when the word is on the list.</returns>
private static bool ValidVowelWord(string word)
{
    // Invariant lower-casing keeps the lookup independent of the current culture.
    switch (word?.ToLowerInvariant())
    {
        case "you":
        case "eye":
        case "i":
        case "yay":
        case "a":
            return true;
        default:
            return false;
    }
}
/// <summary>
/// Tells whether <paramref name="word"/> begins, ignoring case, with "hm" or "zz".
/// </summary>
/// <param name="word">Word to check; null or empty yields <c>false</c>.</param>
/// <returns><c>true</c> when the word starts with one of the two prefixes.</returns>
private static bool ValidConsonantWord(string word)
{
    if (string.IsNullOrEmpty(word))
        return false;

    // Ordinal, case-insensitive prefix tests: no temporary lower-cased copy and no dependence
    // on the current culture.
    return word.StartsWith("hm", StringComparison.OrdinalIgnoreCase)
        || word.StartsWith("zz", StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Replaces every vertical bar with a capital "I" and then deletes each character that is not
/// an ASCII letter, a digit, a space or one of <c>. , ' ? ! : -</c>.
/// </summary>
/// <param name="text">Text to filter.</param>
/// <returns>
/// The filtered text; a null or empty <paramref name="text"/> is handed back unchanged.
/// </returns>
internal static string StripSpecialCharacters(string text)
{
    if (!string.IsNullOrEmpty(text))
    {
        // Bars are converted before filtering, so they survive as letters instead of being removed.
        string barsAsLetters = text.Replace("|", "I");
        return Regex.Replace(barsAsLetters, "[^a-zA-Z0-9.,'?!: -]", "");
    }

    return text;
}
}
}