Skip to content

Commit 348976a

Browse files
authored
Merge pull request #1 from gchehab/master
Versão 2.1.1
2 parents 11eab5e + 087c50d commit 348976a

File tree

7 files changed

+322
-157
lines changed

7 files changed

+322
-157
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
data/*

Dockerfile

Lines changed: 18 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,31 @@
11

2-
FROM ubuntu:14.04
3-
4-
# Cópia de arquivos do projeto OCR-SERVER
5-
COPY usr/local/bin/ocr /usr/local/bin/ocr
6-
COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
7-
COPY entrypoint.sh /entrypoint.sh
2+
FROM ubuntu:21.04
83

94
WORKDIR /tmp
105

6+
ENV TZ=Etc/UTC
7+
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
8+
119
# Instalação dos pacotes pré-requisitos do ocr-server 2
1210
RUN apt-get -y update && \
13-
apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \
14-
curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \
15-
zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \
16-
wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip libgcj14 \
17-
libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \
18-
libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 pkg-config gcc gcj-jdk \
19-
rsyslog libsys-syslog-perl && \
20-
apt-get -y clean all
21-
22-
RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \
23-
dpkg -i mscorefonts.deb && \
24-
rm mscorefonts.deb
25-
26-
# Instalação do Perl 5.1 e demais módulos
27-
RUN perl -MCPAN -e 'install File::Touch'
28-
RUN perl -MCPAN -e 'install File::Find::Rule;'
29-
RUN perl -MCPAN -e 'install File::Touch;'
30-
RUN perl -MCPAN -e 'install Sys::Syslog;'
31-
RUN perl -MCPAN -e 'install IPC::Open3;'
32-
RUN perl -MCPAN -e 'install IO::Select;'
33-
34-
# Tesseract-ocr 3.05, com dicionários inglês e português
35-
# Bibliotecas para o Tesseract: Leptonica
36-
RUN git clone https://github.com/DanBloomberg/leptonica.git && \
37-
cd leptonica && ./autobuild && ./configure && make all install && \
38-
rm -rf ../leptonica
39-
40-
# Bibliotecas para o Tesseract: Libav
41-
RUN git clone https://github.com/libav/libav.git && \
42-
export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \
43-
cd libav && ./configure --enable-sram && make all install && \
44-
rm -rf ../libav
11+
apt-get install -y tesseract-ocr tesseract-ocr-por tesseract-ocr-eng tesseract-ocr-spa leptonica-progs \
12+
poppler-utils pdftk unpaper ocaml ghostscript imagemagick libcamlpdf-ocaml rsyslog \
13+
wget perl libfile-find-rule-perl libfile-touch-perl libunix-syslog-perl
4514

46-
# Tesseract 3.05.01
47-
RUN git clone https://github.com/tesseract-ocr/tesseract.git && \
48-
cd tesseract && ./autogen.sh && ./configure && make all install && \
49-
rm -rf ../tesseract
15+
RUN wget \
16+
https://raw.githubusercontent.com/coherentgraphics/cpdf-binaries/master/Linux-Intel-64bit/cpdf \
17+
-O /usr/local/bin/cpdf && \
18+
chmod 755 /usr/local/bin/cpdf
5019

51-
RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \
52-
wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \
53-
wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata
54-
55-
# Poppler 0.56
56-
RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \
57-
cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \
58-
rm -rf ../poppler
59-
60-
# pdftk, versão 2.02 ou superior
61-
RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \
62-
unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \
63-
cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \
64-
rm -rf ../pdftk-2.02-dist
65-
66-
# Ghostscript 9.18 ou superior
67-
RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \
68-
tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \
69-
cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \
70-
rm -rf ../ghostscript-9.18
71-
72-
# CPDF Intel OS X v 2.2
73-
RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \
74-
cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin
75-
76-
# Atualização das configurações do ld
77-
RUN ldconfig
20+
RUN update-rc.d rsyslog defaults
7821

7922
RUN useradd -m ocr
8023

24+
# Cópia de arquivos do projeto OCR-SERVER
25+
COPY usr/local/bin/ocr /usr/local/bin/ocr
26+
COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
27+
COPY entrypoint.sh /entrypoint.sh
28+
8129
RUN chmod +x /usr/local/bin/ocr && \
8230
chmod +x /etc/init.d/ocr && \
8331
update-rc.d ocr defaults
@@ -100,4 +48,4 @@ WORKDIR /
10048

10149
VOLUME /var/ocr-server/
10250

103-
CMD ["bash", "/entrypoint.sh"]
51+
CMD ["bash", "/entrypoint.sh"]

Dockerfile.old

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
2+
FROM ubuntu:14.04
3+
4+
# Cópia de arquivos do projeto OCR-SERVER
5+
COPY usr/local/bin/ocr /usr/local/bin/ocr
6+
COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
7+
COPY entrypoint.sh /entrypoint.sh
8+
9+
WORKDIR /tmp
10+
11+
# Instalação dos pacotes pré-requisitos do ocr-server 2
12+
RUN apt-get -y update && \
13+
apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \
14+
curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \
15+
zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \
16+
wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip libgcj14 \
17+
libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \
18+
libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 pkg-config gcc gcj-jdk \
19+
rsyslog libsys-syslog-perl && \
20+
apt-get -y clean all
21+
22+
RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \
23+
dpkg -i mscorefonts.deb && \
24+
rm mscorefonts.deb
25+
26+
# Instalação do Perl 5.1 e demais módulos
27+
RUN perl -MCPAN -e 'install File::Touch'
28+
RUN perl -MCPAN -e 'install File::Find::Rule;'
29+
RUN perl -MCPAN -e 'install File::Touch;'
30+
RUN perl -MCPAN -e 'install Sys::Syslog;'
31+
RUN perl -MCPAN -e 'install IPC::Open3;'
32+
RUN perl -MCPAN -e 'install IO::Select;'
33+
34+
# Tesseract-ocr 3.05, com dicionários inglês e português
35+
# Bibliotecas para o Tesseract: Leptonica
36+
RUN git clone https://github.com/DanBloomberg/leptonica.git && \
37+
cd leptonica && ./autobuild && ./configure && make all install && \
38+
rm -rf ../leptonica
39+
40+
# Bibliotecas para o Tesseract: Libav
41+
RUN git clone https://github.com/libav/libav.git && \
42+
export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \
43+
cd libav && ./configure --enable-sram && make all install && \
44+
rm -rf ../libav
45+
46+
# Tesseract 3.05.01
47+
RUN git clone https://github.com/tesseract-ocr/tesseract.git && \
48+
cd tesseract && ./autogen.sh && ./configure && make all install && \
49+
rm -rf ../tesseract
50+
51+
RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \
52+
wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \
53+
wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata
54+
55+
# Poppler 0.56
56+
RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \
57+
cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \
58+
rm -rf ../poppler
59+
60+
# pdftk, versão 2.02 ou superior
61+
RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \
62+
unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \
63+
cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \
64+
rm -rf ../pdftk-2.02-dist
65+
66+
# Ghostscript 9.18 ou superior
67+
RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \
68+
tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \
69+
cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \
70+
rm -rf ../ghostscript-9.18
71+
72+
# CPDF Intel OS X v 2.2
73+
RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \
74+
cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin
75+
76+
# Atualização das configurações do ld
77+
RUN ldconfig
78+
79+
RUN useradd -m ocr
80+
81+
RUN chmod +x /usr/local/bin/ocr && \
82+
chmod +x /etc/init.d/ocr && \
83+
update-rc.d ocr defaults
84+
85+
RUN mkdir /var/ocr-server/ && \
86+
mkdir -p /var/ocr-server/Entrada && \
87+
mkdir -p /var/ocr-server/Saida && \
88+
mkdir -p /var/ocr-server/Originais_Processados && \
89+
mkdir -p /var/ocr-server/Erro && \
90+
chmod +x /entrypoint.sh
91+
92+
RUN mkdir -p /tmp/ocr_dev/ && \
93+
mkdir -p /tmp/ocr_dev/Entrada && \
94+
mkdir -p /tmp/ocr_dev/Saida && \
95+
mkdir -p /tmp/ocr_dev/Originais_Processados && \
96+
mkdir -p /tmp/ocr_dev/Erro && \
97+
chmod -R 777 /tmp/ocr_dev
98+
99+
WORKDIR /
100+
101+
VOLUME /var/ocr-server/
102+
103+
CMD ["bash", "/entrypoint.sh"]

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ Author: Guilherme Chehab
3939
- Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
4040
- Fix: create subpaths on error folder
4141
- Fix: trying to reduce overhead on temporary folder
42+
- 2.1/2.1.1
43+
- Filter pipelines: disabled by default
44+
- Color reduction: disabled by default
45+
- Stock Ubuntu 21.04 docker
4246
4347
## TODO:
4448
- Changes get_imgs and OCR processing to enable pages with more than one image -- it would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them differently but does not treat it adequately -- shall require better handling of the PDF's internal structure
@@ -102,15 +106,15 @@ O OCR-Server também está disponível como um container Docker, permitindo o r
102106

103107
Para execução do serviço, basta ter o docker instalado no servidor e executar o seguinte comando:
104108

105-
docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server guilhermeadc/ocr-server
109+
docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server gchehab/ocr-server
106110

107111
Onde:
108112
--name : Nome atribuído à instância do container. Ex: ocr-server
109113
-d : Indicação para executar o container em background
110114
-v : Diretório de compartilhamento entre o servidor host e o container.
111115
O parâmetro <DIRETORIO_BASE> deve ser substituído pelo diretório base para busca de arquivos.
112116

113-
Para vistualizar os logs de processamento do serviço, basta executar o seguinte comando:
117+
Para visualizar os logs de processamento do serviço, basta executar o seguinte comando:
114118

115119
docker logs <NOME_CONTAINER>
116120

entrypoint.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env bash
22

33
# Inicializa serviço de log
4-
/etc/init.d/rsyslog start
4+
service rsyslog start
55

66
# Cria estrutura de pastas para monitoramento de arquivos
77
mkdir -p /var/ocr-server/
@@ -14,4 +14,7 @@ chmod -R 777 /var/ocr-server
1414
# Iniciar serviço do OCR-Server
1515
service ocr start
1616

17-
tail -f /var/log/syslog
17+
while [ 1 ]; do
18+
tail -f /var/log/syslog
19+
sleep 1;
20+
done

etc/init.d/ocr-ubuntu

100644100755
File mode changed.

0 commit comments

Comments
 (0)