Skip to content

Commit fd96274

Browse files
authored
Add initial OCR support in document_writer (#141)
1 parent 9ef9144 commit fd96274

File tree

8 files changed

+104
-19
lines changed

8 files changed

+104
-19
lines changed

.github/workflows/CI.yml

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ jobs:
4242
- run: sudo apt-get -y install libfontconfig1-dev
4343
if: matrix.os == 'ubuntu-latest'
4444

45+
- name: Download tesseract training data
46+
run: curl -LO https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
47+
4548
- run: cargo test --features serde
4649

4750
- name: Test package mupdf-sys
@@ -90,9 +93,14 @@ jobs:
9093
with:
9194
submodules: "recursive"
9295
fetch-depth: 500
93-
- run: |
94-
rustc --version --verbose
95-
cargo test --features serde
96+
97+
- name: Download tesseract training data
98+
run: curl -LO https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
99+
100+
- run: rustc --version --verbose
101+
- run: cargo test --features serde
102+
env:
103+
CARGO_TERM_COLOR: always
96104

97105
asan:
98106
name: Address Sanitizer
@@ -105,10 +113,14 @@ jobs:
105113
- uses: dtolnay/rust-toolchain@nightly
106114
with:
107115
components: rust-src
116+
108117
- run: sudo apt-get -y install libfontconfig1-dev llvm
118+
119+
- name: Download tesseract training data
120+
run: curl -LO https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
121+
109122
- name: cargo test --features serde
110-
run: |
111-
cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --features serde
123+
run: cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --features serde
112124
env:
113125
RUSTFLAGS: -Zsanitizer=address
114126
LSAN_OPTIONS: report_objects=1:suppressions=lsan_suppressions.txt
@@ -123,7 +135,12 @@ jobs:
123135
fetch-depth: 500
124136
- uses: dtolnay/rust-toolchain@stable
125137
- uses: taiki-e/install-action@valgrind
138+
126139
- run: sudo apt-get -y install libfontconfig1-dev
140+
141+
- name: Download tesseract training data
142+
run: curl -LO https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
143+
127144
- run: cargo test --features serde
128145
env:
129146
CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER: "valgrind --error-exitcode=1 --track-origins=yes"

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@ target
22
Cargo.lock
33
**/*.rs.bk
44
/build
5-
tests/output/*.png
5+
tests/output/*

mupdf-sys/wrapper.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3227,6 +3227,20 @@ fz_document_writer *mupdf_new_document_writer(fz_context *ctx, const char *filen
32273227
return writer;
32283228
}
32293229

3230+
/*
 * Wrapper around fz_new_pdfocr_writer() that does not let a MuPDF exception
 * propagate to the caller.
 *
 * ctx, path and options are forwarded unchanged to fz_new_pdfocr_writer().
 * If that call throws, the fz_catch block passes ctx and errptr to
 * mupdf_save_error() and the function returns NULL, because writer is only
 * assigned when the call succeeds.
 */
fz_document_writer *mupdf_new_pdfocr_writer(fz_context *ctx, const char *path, const char *options, mupdf_error_t **errptr)
3231+
{
3232+
fz_document_writer *writer = NULL;
3233+
fz_try(ctx)
3234+
{
3235+
writer = fz_new_pdfocr_writer(ctx, path, options);
3236+
}
3237+
fz_catch(ctx)
3238+
{
3239+
mupdf_save_error(ctx, errptr);
3240+
}
3241+
return writer;
3242+
}
3243+
32303244
fz_device *mupdf_document_writer_begin_page(fz_context *ctx, fz_document_writer *writer, fz_rect mediabox, mupdf_error_t **errptr)
32313245
{
32323246
fz_device *device = NULL;

src/document_writer.rs

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,20 @@ use std::ptr;
33

44
use mupdf_sys::*;
55

6-
use crate::{context, Device, Error, Rect};
6+
use crate::{context, Device, Error, FilePath, Rect};
77

88
/// Wrapper around a raw `fz_document_writer` pointer.
///
/// Values are built by the constructors in the `impl` block, which store the
/// pointer returned by the FFI layer in `inner`.
#[derive(Debug)]
99
pub struct DocumentWriter {
1010
// Raw writer pointer obtained from the MuPDF FFI constructors.
inner: *mut fz_document_writer,
1111
}
1212

1313
impl DocumentWriter {
14-
pub fn new(filename: &str, format: &str, options: &str) -> Result<Self, Error> {
15-
let c_filename = CString::new(filename)?;
14+
pub fn new<P: AsRef<FilePath> + ?Sized>(
15+
filename: &P,
16+
format: &str,
17+
options: &str,
18+
) -> Result<Self, Error> {
19+
let c_filename = CString::new(filename.as_ref().as_bytes())?;
1620
let c_format = CString::new(format)?;
1721
let c_options = CString::new(options)?;
1822
unsafe {
@@ -26,6 +30,21 @@ impl DocumentWriter {
2630
.map(|inner| Self { inner })
2731
}
2832

33+
/// Creates a writer through the `mupdf_new_pdfocr_writer` FFI wrapper.
///
/// Only compiled when the `tesseract` cargo feature is enabled.
///
/// * `path` - output location; its bytes are copied into a `CString`.
/// * `options` - handed to the writer unchanged as a C string (the accepted
///   syntax is defined by MuPDF and is not visible here).
///
/// # Errors
///
/// Returns an error if `path` or `options` contains an interior NUL byte
/// (`CString::new` fails), or if the `ffi_try!` call yields an `Err`.
#[cfg(feature = "tesseract")]
34+
pub fn with_ocr<P: AsRef<FilePath> + ?Sized>(path: &P, options: &str) -> Result<Self, Error> {
35+
let c_path = CString::new(path.as_ref().as_bytes())?;
36+
let c_options = CString::new(options)?;
37+
38+
// SAFETY: `c_path` and `c_options` are locals that are still alive here, so
// the pointers passed to the FFI call stay valid for its duration.
// NOTE(review): `ffi_try!` is defined elsewhere; the C wrapper takes a
// trailing `errptr` argument that is not passed here, so the macro presumably
// supplies it and turns a reported error into `Err` — confirm.
unsafe {
39+
ffi_try!(mupdf_new_pdfocr_writer(
40+
context(),
41+
c_path.as_ptr(),
42+
c_options.as_ptr()
43+
))
44+
}
45+
.map(|inner| Self { inner })
46+
}
47+
2948
pub fn begin_page(&mut self, media_box: Rect) -> Result<Device, Error> {
3049
unsafe {
3150
ffi_try!(mupdf_document_writer_begin_page(
@@ -57,3 +76,47 @@ impl Drop for DocumentWriter {
5776
}
5877
}
5978
}
79+
80+
// Tests for `DocumentWriter`.
//
// The module is gated on the `tesseract` feature as well as on `test`:
// `DocumentWriter::with_ocr` only exists when that feature is enabled, so
// without the extra gate `cargo test` fails to compile whenever the feature
// is turned off.
#[cfg(not(target_arch = "wasm32"))]
81+
#[cfg(all(test, feature = "tesseract"))]
82+
mod test {
83+
use crate::{pdf::PdfDocument, ColorParams, Image, Matrix, Rect};
84+
85+
use super::DocumentWriter;
86+
87+
/// Writes `tests/files/ocr.png` onto a single page through the OCR writer,
/// reopens the resulting PDF and checks that the expected phrase is found
/// exactly once on the first page.
///
/// NOTE(review): OCR presumably needs `eng.traineddata` to be discoverable at
/// run time (the CI workflow downloads it before `cargo test`), and the
/// `tests/output` directory presumably has to exist — confirm.
#[test]
88+
fn test_writer_ocr() {
89+
let output = "tests/output/ocr.pdf";
90+
91+
// Inner scope: `writer` is dropped at the closing brace, before the PDF is
// reopened below. NOTE(review): presumably the drop is what finalizes the
// output file — confirm against the `Drop` impl.
{
92+
let mut writer = DocumentWriter::with_ocr(output, "").unwrap();
93+
94+
let image = Image::from_file("tests/files/ocr.png").unwrap();
95+
let width = image.width() as f32;
96+
let height = image.height() as f32;
97+
98+
// The page's media box spans (0, 0) to (width, height) of the image.
let device = writer
99+
.begin_page(Rect {
100+
x0: 0.0,
101+
y0: 0.0,
102+
x1: width,
103+
y1: height,
104+
})
105+
.unwrap();
106+
// The image is drawn with a scale matrix built from its own width and height.
device
107+
.fill_image(
108+
&image,
109+
&Matrix::new_scale(width, height),
110+
1.0,
111+
ColorParams::default(),
112+
)
113+
.unwrap();
114+
writer.end_page(device).unwrap();
115+
}
116+
117+
let doc = PdfDocument::open(output).unwrap();
118+
let page = doc.load_page(0).unwrap();
119+
let res = page.search("A short OCR test", 0).unwrap();
120+
assert_eq!(res.len(), 1);
121+
}
122+
}

tests/files/i32-box.pdf

100755 → 100644
File mode changed.

tests/files/no-json.pdf

100755 → 100644
File mode changed.

tests/files/ocr.png

18.7 KB
Loading

tests/test_issues.rs

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,6 @@ fn test_issue_27_flatten() {
3636
#[cfg(not(target_arch = "wasm32"))]
3737
#[test]
3838
fn test_issue_43_malloc() {
39-
const IDENTITY: mupdf::Matrix = mupdf::Matrix {
40-
a: 1.0,
41-
b: 0.0,
42-
c: 0.0,
43-
d: 1.0,
44-
e: 0.0,
45-
f: 0.0,
46-
};
47-
4839
let density = 300;
4940
let height = 1500;
5041
let options = format!("resolution={},height={}", density, height);
@@ -57,7 +48,7 @@ fn test_issue_43_malloc() {
5748
let page0 = doc.load_page(0).unwrap();
5849
let mediabox = page0.bounds().unwrap();
5950
let device = writer.begin_page(mediabox).unwrap();
60-
page0.run(&device, &IDENTITY).unwrap();
51+
page0.run(&device, &mupdf::Matrix::IDENTITY).unwrap();
6152
writer.end_page(device).unwrap();
6253
}
6354
}

0 commit comments

Comments
 (0)