|
44 | 44 | # Now using the '--oem 1' option to tesseract, as there is an issue with the default OS trained data
|
45 | 45 | # Fix: detection of files with unicode filenames
|
46 | 46 | # Fix: now uses the current shell to spawn processes; plain open3 was not using the default shell env, which is needed for java progs
|
47 |
| -# 2.1.1 Fixed docker on Ubuntu 20.04 |
| 47 | +# 2.1.1 Fixed docker on Ubuntu 21.04 |
48 | 48 | # Color conversion to gray is off by default
|
49 | 49 | # Fix: new images format from INPE samples
|
| 50 | +# Fix: now uses default tesseract algorithms |
50 | 51 | #
|
51 | 52 | # TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it
|
52 | 53 | # would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them
|
@@ -99,8 +100,8 @@ my $COLOR_THRES = .03; # Min color spread, below this value, will convert image
|
99 | 100 | # Command dependencies
|
100 | 101 |
|
101 | 102 | # depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for PDF/A, Tesseract 4.0 is recommended
|
102 |
| -my $TESSERACT = 'tesseract --oem 1'; # if Tesseract => 4.0 |
103 |
| -#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0 |
| 103 | +#my $TESSERACT = 'tesseract --oem 2'; # if Tesseract => 4.0 |
| 104 | +my $TESSERACT = 'tesseract'; # if Tesseract < 4.0 |
104 | 105 |
|
105 | 106 | # Depends on pdftk 2.02 or higher
|
106 | 107 | my $PDFTK = 'pdftk';
|
@@ -142,6 +143,7 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados
|
142 | 143 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs
|
143 | 144 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
|
144 | 145 |
|
| 146 | +$ENV{'SHELL'} = exists $ENV{'SHELL'} ? $ENV{'SHELL'} : '/bin/bash'; |
145 | 147 | $ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin';
|
146 | 148 | $ENV{'IFS'} = '\t\n';
|
147 | 149 |
|
|
0 commit comments