Skip to content

Commit 2e76d2f

Browse files
authored
Move to_{html,xhtml,xml,text,json} methods from Page to TextPage (#143)
* Move `to_` methods from `Page` to `TextPage`
* Fix test failures
* Allow specifying whether to add header and trailer
1 parent 1d9add0 commit 2e76d2f

11 files changed

+367
-310
lines changed

examples/extract_images.rs

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,23 +1,21 @@
11
use std::io::Write;
22

3+
use mupdf::{Document, ImageFormat, TextPageFlags};
4+
35
fn main() -> Result<(), Box<dyn std::error::Error>> {
4-
let filename: String = std::env::args()
5-
.collect::<Vec<_>>()
6-
.get(1)
7-
.expect("missing filename")
8-
.to_owned();
9-
let document = mupdf::document::Document::open(&filename)?;
6+
let filename: String = std::env::args().nth(1).expect("missing filename");
7+
let document = Document::open(&filename)?;
108

119
let mut image_num: u32 = 0;
1210

1311
for page in document.pages()? {
14-
let text_page = page?.to_text_page(mupdf::text_page::TextPageOptions::PRESERVE_IMAGES)?;
12+
let text_page = page?.to_text_page(TextPageFlags::PRESERVE_IMAGES)?;
1513

1614
for block in text_page.blocks() {
1715
if let Some(image) = block.image() {
1816
let pixmap = image.to_pixmap()?;
1917
let mut bytes: Vec<u8> = vec![];
20-
pixmap.write_to(&mut bytes, mupdf::pixmap::ImageFormat::PNG)?;
18+
pixmap.write_to(&mut bytes, ImageFormat::PNG)?;
2119

2220
let mut output_file = std::fs::File::create(format!("output_{}.png", image_num))?;
2321
output_file.write_all(&bytes)?;

examples/extract_stext.rs

Lines changed: 17 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,23 @@
1-
use std::io;
1+
use mupdf::{page::StextPage, Document, TextPageFlags};
22

3-
fn main() {
4-
// cargo run --example extract_stext
5-
let mut path_to_doc = String::new();
6-
println!("Enter a path to document: ");
7-
io::stdin()
8-
.read_line(&mut path_to_doc)
9-
.expect("Failed to read line");
10-
let doc = mupdf::document::Document::open(path_to_doc.trim()).unwrap();
11-
let page = doc.load_page(0).unwrap();
12-
match page.stext_page_as_json_from_page(1.0) {
13-
Ok(stext_json) => {
14-
let stext_page: serde_json::Result<mupdf::page::StextPage> =
15-
serde_json::from_str(stext_json.as_str());
16-
match stext_page {
17-
Ok(res) => {
18-
for block in res.blocks {
19-
if block.r#type.eq("text") {
20-
for line in block.lines {
21-
println!("{:?}", &line.text);
22-
}
23-
}
24-
}
25-
}
26-
Err(err) => {
27-
println!("stext_page parsing error: {:?}", &err);
3+
fn main() -> Result<(), Box<dyn std::error::Error>> {
4+
let filename: String = std::env::args().nth(1).expect("missing filename");
5+
let document = Document::open(&filename)?;
6+
7+
for page in document.pages()? {
8+
let text_page = page?.to_text_page(TextPageFlags::empty())?;
9+
10+
let json = text_page.to_json(1.0)?;
11+
let stext_page: StextPage = serde_json::from_str(json.as_str())?;
12+
13+
for block in stext_page.blocks {
14+
if block.r#type == "text" {
15+
for line in block.lines {
16+
println!("{:?}", &line.text);
2817
}
2918
}
3019
}
31-
Err(_err) => {}
3220
}
21+
22+
Ok(())
3323
}

examples/extract_text.rs

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,11 @@
1+
use mupdf::{Document, TextPageFlags};
2+
13
fn main() -> Result<(), Box<dyn std::error::Error>> {
2-
let filename: String = std::env::args()
3-
.collect::<Vec<_>>()
4-
.get(1)
5-
.expect("missing filename")
6-
.to_owned();
7-
let document = mupdf::document::Document::open(&filename)?;
4+
let filename: String = std::env::args().nth(1).expect("missing filename");
5+
let document = Document::open(&filename)?;
86

97
for page in document.pages()? {
10-
let text_page = page?.to_text_page(mupdf::text_page::TextPageOptions::empty())?;
8+
let text_page = page?.to_text_page(TextPageFlags::empty())?;
119

1210
for block in text_page.blocks() {
1311
for line in block.lines() {

mupdf-sys/wrapper.c

Lines changed: 60 additions & 109 deletions
Original file line number | Diff line number | Diff line change
@@ -774,14 +774,12 @@ fz_buffer *mupdf_page_to_svg(fz_context *ctx, fz_page *page, fz_matrix ctm, fz_c
774774
return buf;
775775
}
776776

777-
fz_stext_page *mupdf_page_to_text_page(fz_context *ctx, fz_page *page, int flags, mupdf_error_t **errptr)
777+
fz_stext_page *mupdf_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options, mupdf_error_t **errptr)
778778
{
779779
fz_stext_page *text_page = NULL;
780-
fz_stext_options opts = {0};
781-
opts.flags = flags;
782780
fz_try(ctx)
783781
{
784-
text_page = fz_new_stext_page_from_page(ctx, page, &opts);
782+
text_page = fz_new_stext_page_from_page(ctx, page, options);
785783
}
786784
fz_catch(ctx)
787785
{
@@ -859,186 +857,139 @@ void mupdf_run_page_widgets(fz_context *ctx, fz_page *page, fz_device *device, f
859857
}
860858
}
861859

862-
fz_buffer *mupdf_page_to_html(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)
863-
{
864-
fz_buffer *buf = NULL;
865-
fz_output *out = NULL;
866-
fz_stext_page *text = NULL;
867-
fz_var(text);
868-
fz_var(buf);
869-
fz_var(out);
860+
fz_output *mupdf_new_output_with_buffer(fz_context *ctx, fz_buffer *buf, mupdf_error_t **errptr) {
861+
fz_output* output;
870862
fz_try(ctx)
871863
{
872-
text = fz_new_stext_page_from_page(ctx, page, NULL);
873-
buf = fz_new_buffer(ctx, 8192);
874-
out = fz_new_output_with_buffer(ctx, buf);
875-
fz_print_stext_header_as_html(ctx, out);
876-
fz_print_stext_page_as_html(ctx, out, text, page->number);
877-
fz_print_stext_trailer_as_html(ctx, out);
878-
fz_close_output(ctx, out);
879-
}
880-
fz_always(ctx)
881-
{
882-
fz_drop_output(ctx, out);
883-
fz_drop_stext_page(ctx, text);
864+
output = fz_new_output_with_buffer(ctx, buf);
884865
}
885866
fz_catch(ctx)
886867
{
887868
mupdf_save_error(ctx, errptr);
888869
}
889-
return buf;
870+
return output;
890871
}
891872

892-
fz_buffer *mupdf_stext_page_as_json_from_page(fz_context *ctx, fz_page *page, float scale, mupdf_error_t **errptr)
873+
void mupdf_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr)
893874
{
894-
fz_buffer *buf = NULL;
895-
fz_output *out = NULL;
896-
fz_stext_page *stext_page = NULL;
897-
fz_var(stext_page);
898-
fz_var(buf);
899-
fz_var(out);
900875
fz_try(ctx)
901876
{
902-
stext_page = fz_new_stext_page_from_page(ctx, page, NULL);
903-
buf = fz_new_buffer(ctx, 8192);
904-
out = fz_new_output_with_buffer(ctx, buf);
905-
fz_print_stext_page_as_json(ctx, out, stext_page, scale);
906-
fz_close_output(ctx, out);
907-
}
908-
fz_always(ctx)
909-
{
910-
fz_drop_output(ctx, out);
911-
fz_drop_stext_page(ctx, stext_page);
877+
fz_print_stext_page_as_html(ctx, out, page, id);
912878
}
913879
fz_catch(ctx)
914880
{
915881
mupdf_save_error(ctx, errptr);
916882
}
917-
return buf;
918883
}
919884

920-
fz_buffer *mupdf_page_to_xhtml(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)
885+
void mupdf_print_stext_header_as_html(fz_context *ctx, fz_output *out, mupdf_error_t **errptr)
921886
{
922-
fz_buffer *buf = NULL;
923-
fz_output *out = NULL;
924-
fz_stext_page *text = NULL;
925-
fz_var(text);
926-
fz_var(buf);
927-
fz_var(out);
928887
fz_try(ctx)
929888
{
930-
text = fz_new_stext_page_from_page(ctx, page, NULL);
931-
buf = fz_new_buffer(ctx, 8192);
932-
out = fz_new_output_with_buffer(ctx, buf);
933-
fz_print_stext_header_as_xhtml(ctx, out);
934-
fz_print_stext_page_as_xhtml(ctx, out, text, page->number);
935-
fz_print_stext_trailer_as_xhtml(ctx, out);
936-
fz_close_output(ctx, out);
889+
fz_print_stext_header_as_html(ctx, out);
937890
}
938-
fz_always(ctx)
891+
fz_catch(ctx)
939892
{
940-
fz_drop_output(ctx, out);
941-
fz_drop_stext_page(ctx, text);
893+
mupdf_save_error(ctx, errptr);
894+
}
895+
}
896+
897+
void mupdf_print_stext_trailer_as_html(fz_context *ctx, fz_output *out, mupdf_error_t **errptr)
898+
{
899+
fz_try(ctx)
900+
{
901+
fz_print_stext_trailer_as_html(ctx, out);
942902
}
943903
fz_catch(ctx)
944904
{
945905
mupdf_save_error(ctx, errptr);
946906
}
947-
return buf;
948907
}
949908

950-
fz_buffer *mupdf_page_to_xml(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)
909+
void mupdf_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr)
951910
{
952-
fz_buffer *buf = NULL;
953-
fz_output *out = NULL;
954-
fz_stext_page *text = NULL;
955-
fz_var(text);
956-
fz_var(buf);
957-
fz_var(out);
958911
fz_try(ctx)
959912
{
960-
text = fz_new_stext_page_from_page(ctx, page, NULL);
961-
buf = fz_new_buffer(ctx, 8192);
962-
out = fz_new_output_with_buffer(ctx, buf);
963-
fz_print_stext_page_as_xml(ctx, out, text, page->number);
964-
fz_close_output(ctx, out);
913+
fz_print_stext_page_as_xhtml(ctx, out, page, id);
965914
}
966-
fz_always(ctx)
915+
fz_catch(ctx)
967916
{
968-
fz_drop_output(ctx, out);
969-
fz_drop_stext_page(ctx, text);
917+
mupdf_save_error(ctx, errptr);
918+
}
919+
}
920+
921+
void mupdf_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out, mupdf_error_t **errptr)
922+
{
923+
fz_try(ctx)
924+
{
925+
fz_print_stext_header_as_xhtml(ctx, out);
970926
}
971927
fz_catch(ctx)
972928
{
973929
mupdf_save_error(ctx, errptr);
974930
}
975-
return buf;
976931
}
977932

978-
fz_buffer *mupdf_page_to_text(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)
933+
void mupdf_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out, mupdf_error_t **errptr)
979934
{
980-
fz_buffer *buf = NULL;
981-
fz_output *out = NULL;
982-
fz_stext_page *text = NULL;
983-
fz_var(text);
984-
fz_var(buf);
985-
fz_var(out);
986935
fz_try(ctx)
987936
{
988-
text = fz_new_stext_page_from_page(ctx, page, NULL);
989-
buf = fz_new_buffer(ctx, 8192);
990-
out = fz_new_output_with_buffer(ctx, buf);
991-
fz_print_stext_page_as_text(ctx, out, text);
992-
fz_close_output(ctx, out);
937+
fz_print_stext_trailer_as_xhtml(ctx, out);
993938
}
994-
fz_always(ctx)
939+
fz_catch(ctx)
995940
{
996-
fz_drop_output(ctx, out);
997-
fz_drop_stext_page(ctx, text);
941+
mupdf_save_error(ctx, errptr);
942+
}
943+
}
944+
945+
void mupdf_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr)
946+
{
947+
fz_try(ctx)
948+
{
949+
fz_print_stext_page_as_xml(ctx, out, page, id);
998950
}
999951
fz_catch(ctx)
1000952
{
1001953
mupdf_save_error(ctx, errptr);
1002954
}
1003-
return buf;
1004955
}
1005956

1006-
fz_link *mupdf_load_links(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)
957+
void mupdf_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page, mupdf_error_t **errptr)
1007958
{
1008-
fz_link *link = NULL;
1009959
fz_try(ctx)
1010960
{
1011-
link = fz_load_links(ctx, page);
961+
fz_print_stext_page_as_text(ctx, out, page);
1012962
}
1013963
fz_catch(ctx)
1014964
{
1015965
mupdf_save_error(ctx, errptr);
1016966
}
1017-
return link;
1018967
}
1019968

1020-
fz_buffer *mupdf_stext_page_to_text(fz_context *ctx, fz_stext_page *page, mupdf_error_t **errptr)
969+
void mupdf_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale, mupdf_error_t **errptr)
1021970
{
1022-
fz_buffer *buf = NULL;
1023-
fz_output *out = NULL;
1024-
fz_var(buf);
1025-
fz_var(out);
1026971
fz_try(ctx)
1027972
{
1028-
buf = fz_new_buffer(ctx, 8192);
1029-
out = fz_new_output_with_buffer(ctx, buf);
1030-
fz_print_stext_page_as_text(ctx, out, page);
1031-
fz_close_output(ctx, out);
973+
fz_print_stext_page_as_json(ctx, out, page, scale);
1032974
}
1033-
fz_always(ctx)
975+
fz_catch(ctx)
1034976
{
1035-
fz_drop_output(ctx, out);
977+
mupdf_save_error(ctx, errptr);
978+
}
979+
}
980+
981+
fz_link *mupdf_load_links(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)
982+
{
983+
fz_link *link = NULL;
984+
fz_try(ctx)
985+
{
986+
link = fz_load_links(ctx, page);
1036987
}
1037988
fz_catch(ctx)
1038989
{
1039990
mupdf_save_error(ctx, errptr);
1040991
}
1041-
return buf;
992+
return link;
1042993
}
1043994

1044995
fz_separations *mupdf_page_separations(fz_context *ctx, fz_page *page, mupdf_error_t **errptr)

src/device.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@ use mupdf_sys::*;
66
use num_enum::TryFromPrimitive;
77

88
use crate::{
9-
context, ColorParams, Colorspace, DisplayList, Error, IRect, Image, Matrix, Path, Pixmap, Rect,
10-
Shade, StrokeState, Text, TextPage, TextPageOptions,
9+
context, ColorParams, Colorspace, DisplayList, Error, FFIWrapper, IRect, Image, Matrix, Path,
10+
Pixmap, Rect, Shade, StrokeState, Text, TextPage, TextPageFlags,
1111
};
1212

1313
mod native;
@@ -206,11 +206,11 @@ impl Device {
206206
})
207207
}
208208

209-
pub fn from_text_page(page: &TextPage, opts: TextPageOptions) -> Result<Self, Error> {
209+
pub fn from_text_page(page: &TextPage, opts: TextPageFlags) -> Result<Self, Error> {
210210
unsafe {
211211
ffi_try!(mupdf_new_stext_device(
212212
context(),
213-
page.inner,
213+
page.as_ptr().cast_mut(),
214214
opts.bits() as _
215215
))
216216
}

0 commit comments

Comments
 (0)