Skip to content

Commit 349020e

Browse files
Add flag for Page Segmentation Modes control (#1601)
* Add flag for Page Segmentation Modes control. I added a flag, --psm, for controlling the PSM (Page Segmentation Mode) in Tesseract. The default option (3) gives me quite bad results. When I use 6, 11, or 12 for Bulgarian, it gives me much better OCR results. I haven't tested other languages yet, but I expect improvements there as well if another mode is used. * feat: add psm for rust parser * fix: add psm to options * fix: add default value of psm to 3 * fix: correct type of ocr oem * fix(rust): use fatal! instead of exit --------- Co-authored-by: Prateek Sunal <prtksunal@gmail.com>
1 parent 1a13bbb commit 349020e

File tree

9 files changed

+78
-0
lines changed

9 files changed

+78
-0
lines changed

docs/CHANGES.TXT

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
- Fix: infinite loop in MP4 file type detector.
3636
- Improvement: Use Corrosion to build Rust code
3737
- Improvement: Ignore MXF Caption Essence Container version byte to enhance SRT subtitle extraction compatibility
38+
- New: Add tesseract page segmentation modes control with `--psm` flag
3839

3940
0.94 (2021-12-14)
4041
-----------------

src/lib_ccx/ccx_common_option.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ void init_options(struct ccx_s_options *options)
7272
options->dvblang = NULL; // By default, autodetect DVB language
7373
options->ocrlang = NULL; // By default, autodetect .traineddata file
7474
options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
75+
options->psm = 3; // Default PSM mode (3 is the default tesseract as well)
7576
options->ocr_quantmode = 1; // CCExtractor's internal
7677
options->mkvlang = NULL; // By default, all the languages are extracted
7778
options->ignore_pts_jumps = 1;

src/lib_ccx/ccx_common_option.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ struct ccx_s_options // Options from user parameters
147147
char *dvblang; // The name of the language stream for DVB
148148
const char *ocrlang; // The name of the .traineddata file to be loaded with tesseract
149149
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
150+
int psm; // The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
150151
int ocr_quantmode; // How to quantize the bitmap before passing to to tesseract (0=no quantization at all, 1=CCExtractor's internal)
151152
char *mkvlang; // The name of the language stream for MKV
152153
int analyze_video_stream; // If 1, the video stream will be processed even if we're using a different one for subtitles.

src/lib_ccx/ocr.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,9 @@ void *init_ocr(int lang_index)
177177
&pars_values, 1, false);
178178
}
179179

180+
// set PSM mode
181+
TessBaseAPISetPageSegMode(ctx->api, ccx_options.psm);
182+
180183
free(pars_vec);
181184
free(pars_values);
182185

src/lib_ccx/params.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,23 @@ void print_usage(void)
679679
mprint(" Default value depends on the tesseract version linked :\n");
680680
mprint(" Tesseract v3 : default mode is 0,\n");
681681
mprint(" Tesseract v4 : default mode is 1.\n");
682+
mprint(" --psm: Select the PSM mode for Tesseract.\n");
683+
mprint(" Available Page segmentation modes:\n");
684+
mprint(" 0 Orientation and script detection (OSD) only.\n");
685+
mprint(" 1 Automatic page segmentation with OSD.\n");
686+
mprint(" 2 Automatic page segmentation, but no OSD, or OCR.\n");
687+
mprint(" 3 Fully automatic page segmentation, but no OSD. (Default)\n");
688+
mprint(" 4 Assume a single column of text of variable sizes.\n");
689+
mprint(" 5 Assume a single uniform block of vertically aligned text.\n");
690+
mprint(" 6 Assume a single uniform block of text.\n");
691+
mprint(" 7 Treat the image as a single text line.\n");
692+
mprint(" 8 Treat the image as a single word.\n");
693+
mprint(" 9 Treat the image as a single word in a circle.\n");
694+
mprint(" 10 Treat the image as a single character.\n");
695+
mprint(" 11 Sparse text. Find as much text as possible in no particular order.\n");
696+
mprint(" 12 Sparse text with OSD.\n");
697+
mprint(" 13 Raw line. Treat the image as a single text line,\n");
698+
mprint(" bypassing hacks that are Tesseract-specific.\n");
682699
mprint(" --mkvlang: For MKV subtitles, select which language's caption\n");
683700
mprint(" stream will be processed. e.g. 'eng' for English.\n");
684701
mprint(" Language codes can be either the 3 letters bibliographic\n");
@@ -1696,6 +1713,27 @@ int parse_parameters(struct ccx_s_options *opt, int argc, char *argv[])
16961713
fatal(EXIT_MALFORMED_PARAMETER, "--oem has no argument.\n");
16971714
}
16981715
}
1716+
if (strcmp(argv[i], "--psm") == 0)
1717+
{
1718+
if (i < argc - 1)
1719+
{
1720+
i++;
1721+
1722+
char *str = (char *)malloc(sizeof(argv[i]));
1723+
sprintf(str, "%s", argv[i]);
1724+
opt->psm = atoi(str);
1725+
if (opt->psm < 0 || opt->psm > 13)
1726+
{
1727+
fatal(EXIT_MALFORMED_PARAMETER, "--psm must be between 0 and 13\n");
1728+
}
1729+
1730+
continue;
1731+
}
1732+
else
1733+
{
1734+
fatal(EXIT_MALFORMED_PARAMETER, "--psm has no argument.\n");
1735+
}
1736+
}
16991737
if (strcmp(argv[i], "--mkvlang") == 0)
17001738
{
17011739
if (i < argc - 1)

src/lib_ccx/params_dump.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,8 @@ void params_dump(struct lib_ccx_ctx *ctx)
216216
mprint("Reduced color palette]\n");
217217
break;
218218
}
219+
220+
mprint("[Tesseract PSM: %d]\n", ccx_options.psm);
219221
}
220222

221223
#define Y_N(cond) ((cond) ? "Yes" : "No")

src/rust/lib_ccxr/src/common/options.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,8 @@ pub struct Options {
455455
pub ocrlang: PathBuf,
456456
/// The Tesseract OEM mode, could be 0 (default), 1 or 2
457457
pub ocr_oem: i8,
458+
/// The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
459+
pub psm: i32,
458460
/// How to quantize the bitmap before passing it to tesseract
459461
/// (0 = no quantization at all, 1 = CCExtractor's internal,
460462
/// 2 = reduce distinct color count in image for faster results.)
@@ -589,6 +591,7 @@ impl Default for Options {
589591
dvblang: Default::default(),
590592
ocrlang: Default::default(),
591593
ocr_oem: -1,
594+
psm: 3,
592595
ocr_quantmode: 1,
593596
mkvlang: Default::default(),
594597
analyze_video_stream: Default::default(),

src/rust/src/args.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,25 @@ pub struct Args {
600600
/// Tesseract v4 : default mode is 1.
601601
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
602602
pub oem: Option<u8>,
603+
/// Select the PSM mode for Tesseract.
604+
/// Available Page segmentation modes:
605+
/// 0 Orientation and script detection (OSD) only.
606+
/// 1 Automatic page segmentation with OSD.
607+
/// 2 Automatic page segmentation, but no OSD, or OCR.
608+
/// 3 Fully automatic page segmentation, but no OSD. (Default)
609+
/// 4 Assume a single column of text of variable sizes.
610+
/// 5 Assume a single uniform block of vertically aligned text.
611+
/// 6 Assume a single uniform block of text.
612+
/// 7 Treat the image as a single text line.
613+
/// 8 Treat the image as a single word.
614+
/// 9 Treat the image as a single word in a circle.
615+
/// 10 Treat the image as a single character.
616+
/// 11 Sparse text. Find as much text as possible in no particular order.
617+
/// 12 Sparse text with OSD.
618+
/// 13 Raw line. Treat the image as a single text line,
619+
/// bypassing hacks that are Tesseract-specific.
620+
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
621+
pub psm: Option<u8>,
603622
/// For MKV subtitles, select which language's caption
604623
/// stream will be processed. e.g. 'eng' for English.
605624
/// Language codes can be either the 3 letters bibliographic

src/rust/src/parser.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,16 @@ impl OptionsExt for Options {
801801
self.ocr_oem = *oem as _;
802802
}
803803

804+
if let Some(ref psm) = args.psm {
805+
if !(0..=13).contains(psm) {
806+
fatal!(
807+
cause = ExitCause::MalformedParameter;
808+
"--psm must be between 0 and 13"
809+
);
810+
}
811+
self.psm = *psm as _;
812+
}
813+
804814
if let Some(ref lang) = args.mkvlang {
805815
self.mkvlang = Some(Language::from_str(lang.as_str()).unwrap());
806816
let str = lang.as_str();

0 commit comments

Comments
 (0)