Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add flag for Page Segmentation Modes control #1601

Merged
merged 10 commits into from
Sep 3, 2024
1 change: 1 addition & 0 deletions docs/CHANGES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
- Fix: infinite loop in MP4 file type detector.
- Improvement: Use Corrosion to build Rust code
- Improvement: Ignore MXF Caption Essence Container version byte to enhance SRT subtitle extraction compatibility
- New: Add tesseract page segmentation modes control with `--psm` flag

0.94 (2021-12-14)
-----------------
Expand Down
1 change: 1 addition & 0 deletions src/lib_ccx/ccx_common_option.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ void init_options(struct ccx_s_options *options)
options->dvblang = NULL; // By default, autodetect DVB language
options->ocrlang = NULL; // By default, autodetect .traineddata file
options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
options->psm = 3; // Default PSM mode (3 is also Tesseract's own default)
options->ocr_quantmode = 1; // CCExtractor's internal
options->mkvlang = NULL; // By default, all the languages are extracted
options->ignore_pts_jumps = 1;
Expand Down
1 change: 1 addition & 0 deletions src/lib_ccx/ccx_common_option.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ struct ccx_s_options // Options from user parameters
char *dvblang; // The name of the language stream for DVB
const char *ocrlang; // The name of the .traineddata file to be loaded with tesseract
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
int psm; // The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
int ocr_quantmode; // How to quantize the bitmap before passing it to tesseract (0=no quantization at all, 1=CCExtractor's internal)
char *mkvlang; // The name of the language stream for MKV
int analyze_video_stream; // If 1, the video stream will be processed even if we're using a different one for subtitles.
Expand Down
3 changes: 3 additions & 0 deletions src/lib_ccx/ocr.c
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,9 @@ void *init_ocr(int lang_index)
&pars_values, 1, false);
}

// set PSM mode
TessBaseAPISetPageSegMode(ctx->api, ccx_options.psm);

free(pars_vec);
free(pars_values);

Expand Down
38 changes: 38 additions & 0 deletions src/lib_ccx/params.c
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,23 @@ void print_usage(void)
mprint(" Default value depends on the tesseract version linked :\n");
mprint(" Tesseract v3 : default mode is 0,\n");
mprint(" Tesseract v4 : default mode is 1.\n");
mprint(" --psm: Select the PSM mode for Tesseract.\n");
mprint(" Available Page segmentation modes:\n");
mprint(" 0 Orientation and script detection (OSD) only.\n");
mprint(" 1 Automatic page segmentation with OSD.\n");
mprint(" 2 Automatic page segmentation, but no OSD, or OCR.\n");
mprint(" 3 Fully automatic page segmentation, but no OSD. (Default)\n");
mprint(" 4 Assume a single column of text of variable sizes.\n");
mprint(" 5 Assume a single uniform block of vertically aligned text.\n");
mprint(" 6 Assume a single uniform block of text.\n");
mprint(" 7 Treat the image as a single text line.\n");
mprint(" 8 Treat the image as a single word.\n");
mprint(" 9 Treat the image as a single word in a circle.\n");
mprint(" 10 Treat the image as a single character.\n");
mprint(" 11 Sparse text. Find as much text as possible in no particular order.\n");
mprint(" 12 Sparse text with OSD.\n");
mprint(" 13 Raw line. Treat the image as a single text line,\n");
mprint(" bypassing hacks that are Tesseract-specific.\n");
mprint(" --mkvlang: For MKV subtitles, select which language's caption\n");
mprint(" stream will be processed. e.g. 'eng' for English.\n");
mprint(" Language codes can be either the 3 letters bibliographic\n");
Expand Down Expand Up @@ -1696,6 +1713,27 @@ int parse_parameters(struct ccx_s_options *opt, int argc, char *argv[])
fatal(EXIT_MALFORMED_PARAMETER, "--oem has no argument.\n");
}
}
// --psm <mode>: select the Tesseract page segmentation mode (0 to 13).
if (strcmp(argv[i], "--psm") == 0)
{
	if (i < argc - 1)
	{
		i++;

		// Parse argv[i] in place. Copying it into a malloc(sizeof(argv[i]))
		// buffer only reserves pointer-size bytes, so longer arguments would
		// overflow the heap, and that buffer was never freed.
		char *end = NULL;
		long psm = strtol(argv[i], &end, 10);

		// Reject empty or non-numeric input and trailing garbage, not just
		// out-of-range numbers. A value too large for long saturates to
		// LONG_MIN/LONG_MAX and therefore also fails the range check.
		if (end == argv[i] || *end != '\0' || psm < 0 || psm > 13)
		{
			fatal(EXIT_MALFORMED_PARAMETER, "--psm must be between 0 and 13\n");
		}
		opt->psm = (int)psm;

		continue;
	}
	else
	{
		fatal(EXIT_MALFORMED_PARAMETER, "--psm has no argument.\n");
	}
}
if (strcmp(argv[i], "--mkvlang") == 0)
{
if (i < argc - 1)
Expand Down
2 changes: 2 additions & 0 deletions src/lib_ccx/params_dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ void params_dump(struct lib_ccx_ctx *ctx)
mprint("Reduced color palette]\n");
break;
}

mprint("[Tesseract PSM: %d]\n", ccx_options.psm);
}

#define Y_N(cond) ((cond) ? "Yes" : "No")
Expand Down
3 changes: 3 additions & 0 deletions src/rust/lib_ccxr/src/common/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,8 @@ pub struct Options {
pub ocrlang: PathBuf,
/// The Tesseract OEM mode, could be 0 (default), 1 or 2
pub ocr_oem: i8,
/// The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
pub psm: i32,
/// How to quantize the bitmap before passing it to tesseract
/// (0 = no quantization at all, 1 = CCExtractor's internal,
/// 2 = reduce distinct color count in image for faster results.)
Expand Down Expand Up @@ -589,6 +591,7 @@ impl Default for Options {
dvblang: Default::default(),
ocrlang: Default::default(),
ocr_oem: -1,
psm: 3,
ocr_quantmode: 1,
mkvlang: Default::default(),
analyze_video_stream: Default::default(),
Expand Down
19 changes: 19 additions & 0 deletions src/rust/src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,25 @@ pub struct Args {
/// Tesseract v4 : default mode is 1.
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub oem: Option<u8>,
/// Select the PSM mode for Tesseract.
/// Available Page segmentation modes:
/// 0 Orientation and script detection (OSD) only.
/// 1 Automatic page segmentation with OSD.
/// 2 Automatic page segmentation, but no OSD, or OCR.
/// 3 Fully automatic page segmentation, but no OSD. (Default)
/// 4 Assume a single column of text of variable sizes.
/// 5 Assume a single uniform block of vertically aligned text.
/// 6 Assume a single uniform block of text.
/// 7 Treat the image as a single text line.
/// 8 Treat the image as a single word.
/// 9 Treat the image as a single word in a circle.
/// 10 Treat the image as a single character.
/// 11 Sparse text. Find as much text as possible in no particular order.
/// 12 Sparse text with OSD.
/// 13 Raw line. Treat the image as a single text line,
/// bypassing hacks that are Tesseract-specific.
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub psm: Option<u8>,
/// For MKV subtitles, select which language's caption
/// stream will be processed. e.g. 'eng' for English.
/// Language codes can be either the 3 letters bibliographic
Expand Down
10 changes: 10 additions & 0 deletions src/rust/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,16 @@ impl OptionsExt for Options {
self.ocr_oem = *oem as _;
}

if let Some(ref psm) = args.psm {
if !(0..=13).contains(psm) {
fatal!(
cause = ExitCause::MalformedParameter;
"--psm must be between 0 and 13"
);
}
self.psm = *psm as _;
}

if let Some(ref lang) = args.mkvlang {
self.mkvlang = Some(Language::from_str(lang.as_str()).unwrap());
let str = lang.as_str();
Expand Down
Loading