Skip to content

Commit

Permalink
OCR optimization.
Browse files: browse the repository at this point in the history
  • Loading branch information
MrFlapstaart committed Aug 12, 2022
1 parent a57175d commit 44d5639
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
7 changes: 6 additions & 1 deletion OCRResult.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,13 @@ public class TextString : OCRDimBase
[XmlAttribute(AttributeName = "WC")]
public string WC { get; set; }

// Backing field for Content; stores the value after the setter's character substitutions.
private string _content;
[XmlAttribute(AttributeName = "CONTENT")]
// NOTE(review): the auto-property on the next line looks like the pre-change line of this diff
// (the hunk reports 1 deletion); only the full property below it should exist in the file — confirm.
public string Content { get; set; }
// Text of the word, mapped to the CONTENT XML attribute.
public string Content
{
get { return _content; }
// Null-safe: a null value is stored as null. Otherwise every "|" and "[" is replaced with "I"
// (presumably common OCR misreads of a capital I — confirm against real recognizer output).
set { _content = value?.Replace("|", "I")?.Replace("[","I"); }
}
}

[XmlRoot(ElementName = "TextLine")]
Expand Down
15 changes: 13 additions & 2 deletions TextHelper.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using System;
using System.Linq;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

Expand Down Expand Up @@ -58,11 +60,20 @@ internal static string RemoveGarbageText(string text)
internal static TextBlock ProcessTextBlock(TextBlock block)
{
Regex rgxspec = new Regex("[^a-zA-Z0-9.,'?!: -]");
List<TextString> words = new List<TextString>();
foreach (var line in block.Lines)
{
line.Words.RemoveAll(x => string.IsNullOrEmpty(rgxspec.Replace(x.Content.Replace("|", "I"), "").Trim()));
line.Words.RemoveAll(x => !IsValidWord(x.Content));
line.Words.RemoveAll(x => !IsValidWord(x.Content));
words.AddRange(line.Words);
}
double avgheight = words.Average(x => x.Height);
foreach (var line in block.Lines)
{
line.Words.RemoveAll(x => x.Height > avgheight * 1.5 || x.Height < avgheight * 0.5);
}


block.Lines.RemoveAll(x => x.WordsInLine == 1 && x.Text.Length <= 2);
block.Lines.RemoveAll(x => x.WordsInLine == 0);

Expand Down Expand Up @@ -90,7 +101,7 @@ private static bool IsValidWord(string word)
return false;


if (!word.StartsWith("..") && strip.Length <= 5 && strip.Length > 1 && !onlynumbers && !strip.StartsWith("hm"))
if (!word.StartsWith("..") && strip.Length <= 5 && strip.Length > 1 && !onlynumbers && !strip.StartsWith("hm") && strip != "you")
{
if (vowels == 0 || vowels == strip.Length)
return false;
Expand Down

0 comments on commit 44d5639

Please sign in to comment.