From 40e55a4c2eccf8f5e79b43919f5ea441bc1d49f2 Mon Sep 17 00:00:00 2001 From: ginnyTheCat Date: Mon, 26 May 2025 05:28:33 +0200 Subject: [PATCH 1/3] Move `to_` methods from `Page` to `TextPage` --- examples/extract_images.rs | 14 ++- examples/extract_stext.rs | 44 ++++----- examples/extract_text.rs | 12 +-- mupdf-sys/wrapper.c | 155 ++++++------------------------- src/device.rs | 8 +- src/display_list.rs | 17 ++-- src/lib.rs | 4 +- src/output.rs | 29 ++++++ src/page.rs | 116 +++++------------------ src/text_page.rs | 186 ++++++++++++++++++++++++++++++------- tests/test_issues.rs | 19 ++-- 11 files changed, 296 insertions(+), 308 deletions(-) create mode 100644 src/output.rs diff --git a/examples/extract_images.rs b/examples/extract_images.rs index 51afc23..70aa713 100644 --- a/examples/extract_images.rs +++ b/examples/extract_images.rs @@ -1,23 +1,21 @@ use std::io::Write; +use mupdf::{Document, ImageFormat, TextPageFlags}; + fn main() -> Result<(), Box> { - let filename: String = std::env::args() - .collect::>() - .get(1) - .expect("missing filename") - .to_owned(); - let document = mupdf::document::Document::open(&filename)?; + let filename: String = std::env::args().nth(1).expect("missing filename"); + let document = Document::open(&filename)?; let mut image_num: u32 = 0; for page in document.pages()? 
{ - let text_page = page?.to_text_page(mupdf::text_page::TextPageOptions::PRESERVE_IMAGES)?; + let text_page = page?.to_text_page(TextPageFlags::PRESERVE_IMAGES)?; for block in text_page.blocks() { if let Some(image) = block.image() { let pixmap = image.to_pixmap()?; let mut bytes: Vec = vec![]; - pixmap.write_to(&mut bytes, mupdf::pixmap::ImageFormat::PNG)?; + pixmap.write_to(&mut bytes, ImageFormat::PNG)?; let mut output_file = std::fs::File::create(format!("output_{}.png", image_num))?; output_file.write_all(&bytes)?; diff --git a/examples/extract_stext.rs b/examples/extract_stext.rs index 52508d8..f6f577c 100644 --- a/examples/extract_stext.rs +++ b/examples/extract_stext.rs @@ -1,33 +1,25 @@ use std::io; -fn main() { - // cargo run --example extract_stext - let mut path_to_doc = String::new(); - println!("Enter a path to document: "); - io::stdin() - .read_line(&mut path_to_doc) - .expect("Failed to read line"); - let doc = mupdf::document::Document::open(path_to_doc.trim()).unwrap(); - let page = doc.load_page(0).unwrap(); - match page.stext_page_as_json_from_page(1.0) { - Ok(stext_json) => { - let stext_page: serde_json::Result = - serde_json::from_str(stext_json.as_str()); - match stext_page { - Ok(res) => { - for block in res.blocks { - if block.r#type.eq("text") { - for line in block.lines { - println!("{:?}", &line.text); - } - } - } - } - Err(err) => { - println!("stext_page parsing error: {:?}", &err); +use mupdf::{page::StextPage, Document, TextPageFlags}; + +fn main() -> Result<(), Box> { + let filename: String = std::env::args().nth(1).expect("missing filename"); + let document = Document::open(&filename)?; + + for page in document.pages()? 
{ + let text_page = page?.to_text_page(TextPageFlags::empty())?; + + let json = text_page.to_json(1.0)?; + let stext_page: StextPage = serde_json::from_str(json.as_str())?; + + for block in stext_page.blocks { + if block.r#type == "text" { + for line in block.lines { + println!("{:?}", &line.text); } } } - Err(_err) => {} } + + Ok(()) } diff --git a/examples/extract_text.rs b/examples/extract_text.rs index 5db52d5..c37dbf3 100644 --- a/examples/extract_text.rs +++ b/examples/extract_text.rs @@ -1,13 +1,11 @@ +use mupdf::{Document, TextPageFlags}; + fn main() -> Result<(), Box> { - let filename: String = std::env::args() - .collect::>() - .get(1) - .expect("missing filename") - .to_owned(); - let document = mupdf::document::Document::open(&filename)?; + let filename: String = std::env::args().nth(1).expect("missing filename"); + let document = Document::open(&filename)?; for page in document.pages()? { - let text_page = page?.to_text_page(mupdf::text_page::TextPageOptions::empty())?; + let text_page = page?.to_text_page(TextPageFlags::empty())?; for block in text_page.blocks() { for line in block.lines() { diff --git a/mupdf-sys/wrapper.c b/mupdf-sys/wrapper.c index e7f9c70..3c696fd 100644 --- a/mupdf-sys/wrapper.c +++ b/mupdf-sys/wrapper.c @@ -774,14 +774,12 @@ fz_buffer *mupdf_page_to_svg(fz_context *ctx, fz_page *page, fz_matrix ctm, fz_c return buf; } -fz_stext_page *mupdf_page_to_text_page(fz_context *ctx, fz_page *page, int flags, mupdf_error_t **errptr) +fz_stext_page *mupdf_new_stext_page_from_page(fz_context *ctx, fz_page *page, const fz_stext_options *options, mupdf_error_t **errptr) { fz_stext_page *text_page = NULL; - fz_stext_options opts = {0}; - opts.flags = flags; fz_try(ctx) { - text_page = fz_new_stext_page_from_page(ctx, page, &opts); + text_page = fz_new_stext_page_from_page(ctx, page, options); } fz_catch(ctx) { @@ -859,148 +857,77 @@ void mupdf_run_page_widgets(fz_context *ctx, fz_page *page, fz_device *device, f } } -fz_buffer 
*mupdf_page_to_html(fz_context *ctx, fz_page *page, mupdf_error_t **errptr) +fz_output *mupdf_new_output_with_buffer(fz_context *ctx, fz_buffer *buf, mupdf_error_t **errptr) { + fz_output* output; + fz_try(ctx) + { + output = fz_new_output_with_buffer(ctx, buf); + } + fz_catch(ctx) + { + mupdf_save_error(ctx, errptr); + } + return output; +} + +void mupdf_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr) { - fz_buffer *buf = NULL; - fz_output *out = NULL; - fz_stext_page *text = NULL; - fz_var(text); - fz_var(buf); - fz_var(out); - fz_try(ctx) - { - text = fz_new_stext_page_from_page(ctx, page, NULL); - buf = fz_new_buffer(ctx, 8192); - out = fz_new_output_with_buffer(ctx, buf); - fz_print_stext_header_as_html(ctx, out); - fz_print_stext_page_as_html(ctx, out, text, page->number); - fz_print_stext_trailer_as_html(ctx, out); - fz_close_output(ctx, out); - } - fz_always(ctx) + fz_try(ctx) { - fz_drop_output(ctx, out); - fz_drop_stext_page(ctx, text); + fz_print_stext_page_as_html(ctx, out, page, id); } fz_catch(ctx) { mupdf_save_error(ctx, errptr); } - return buf; } -fz_buffer *mupdf_stext_page_as_json_from_page(fz_context *ctx, fz_page *page, float scale, mupdf_error_t **errptr) +void mupdf_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr) { - fz_buffer *buf = NULL; - fz_output *out = NULL; - fz_stext_page *stext_page = NULL; - fz_var(stext_page); - fz_var(buf); - fz_var(out); - fz_try(ctx) - { - stext_page = fz_new_stext_page_from_page(ctx, page, NULL); - buf = fz_new_buffer(ctx, 8192); - out = fz_new_output_with_buffer(ctx, buf); - fz_print_stext_page_as_json(ctx, out, stext_page, scale); - fz_close_output(ctx, out); - } - fz_always(ctx) + fz_try(ctx) { - fz_drop_output(ctx, out); - fz_drop_stext_page(ctx, stext_page); + fz_print_stext_page_as_xhtml(ctx, out, page, id); } fz_catch(ctx) { mupdf_save_error(ctx, errptr); } - return buf; } 
-fz_buffer *mupdf_page_to_xhtml(fz_context *ctx, fz_page *page, mupdf_error_t **errptr) +void mupdf_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr) { - fz_buffer *buf = NULL; - fz_output *out = NULL; - fz_stext_page *text = NULL; - fz_var(text); - fz_var(buf); - fz_var(out); - fz_try(ctx) - { - text = fz_new_stext_page_from_page(ctx, page, NULL); - buf = fz_new_buffer(ctx, 8192); - out = fz_new_output_with_buffer(ctx, buf); - fz_print_stext_header_as_xhtml(ctx, out); - fz_print_stext_page_as_xhtml(ctx, out, text, page->number); - fz_print_stext_trailer_as_xhtml(ctx, out); - fz_close_output(ctx, out); - } - fz_always(ctx) + fz_try(ctx) { - fz_drop_output(ctx, out); - fz_drop_stext_page(ctx, text); + fz_print_stext_page_as_xml(ctx, out, page, id); } fz_catch(ctx) { mupdf_save_error(ctx, errptr); } - return buf; } -fz_buffer *mupdf_page_to_xml(fz_context *ctx, fz_page *page, mupdf_error_t **errptr) +void mupdf_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page, mupdf_error_t **errptr) { - fz_buffer *buf = NULL; - fz_output *out = NULL; - fz_stext_page *text = NULL; - fz_var(text); - fz_var(buf); - fz_var(out); - fz_try(ctx) - { - text = fz_new_stext_page_from_page(ctx, page, NULL); - buf = fz_new_buffer(ctx, 8192); - out = fz_new_output_with_buffer(ctx, buf); - fz_print_stext_page_as_xml(ctx, out, text, page->number); - fz_close_output(ctx, out); - } - fz_always(ctx) + fz_try(ctx) { - fz_drop_output(ctx, out); - fz_drop_stext_page(ctx, text); + fz_print_stext_page_as_text(ctx, out, page); } fz_catch(ctx) { mupdf_save_error(ctx, errptr); } - return buf; } -fz_buffer *mupdf_page_to_text(fz_context *ctx, fz_page *page, mupdf_error_t **errptr) +void mupdf_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale, mupdf_error_t **errptr) { - fz_buffer *buf = NULL; - fz_output *out = NULL; - fz_stext_page *text = NULL; - fz_var(text); - fz_var(buf); - 
fz_var(out); fz_try(ctx) { - text = fz_new_stext_page_from_page(ctx, page, NULL); - buf = fz_new_buffer(ctx, 8192); - out = fz_new_output_with_buffer(ctx, buf); - fz_print_stext_page_as_text(ctx, out, text); - fz_close_output(ctx, out); - } - fz_always(ctx) - { - fz_drop_output(ctx, out); - fz_drop_stext_page(ctx, text); + fz_print_stext_page_as_json(ctx, out, page, scale); } fz_catch(ctx) { mupdf_save_error(ctx, errptr); } - return buf; } fz_link *mupdf_load_links(fz_context *ctx, fz_page *page, mupdf_error_t **errptr) @@ -1017,30 +944,6 @@ fz_link *mupdf_load_links(fz_context *ctx, fz_page *page, mupdf_error_t **errptr return link; } -fz_buffer *mupdf_stext_page_to_text(fz_context *ctx, fz_stext_page *page, mupdf_error_t **errptr) -{ - fz_buffer *buf = NULL; - fz_output *out = NULL; - fz_var(buf); - fz_var(out); - fz_try(ctx) - { - buf = fz_new_buffer(ctx, 8192); - out = fz_new_output_with_buffer(ctx, buf); - fz_print_stext_page_as_text(ctx, out, page); - fz_close_output(ctx, out); - } - fz_always(ctx) - { - fz_drop_output(ctx, out); - } - fz_catch(ctx) - { - mupdf_save_error(ctx, errptr); - } - return buf; -} - fz_separations *mupdf_page_separations(fz_context *ctx, fz_page *page, mupdf_error_t **errptr) { fz_separations *seps = NULL; diff --git a/src/device.rs b/src/device.rs index 78340ae..9aa076d 100644 --- a/src/device.rs +++ b/src/device.rs @@ -6,8 +6,8 @@ use mupdf_sys::*; use num_enum::TryFromPrimitive; use crate::{ - context, ColorParams, Colorspace, DisplayList, Error, IRect, Image, Matrix, Path, Pixmap, Rect, - Shade, StrokeState, Text, TextPage, TextPageOptions, + context, ColorParams, Colorspace, DisplayList, Error, FFIWrapper, IRect, Image, Matrix, Path, + Pixmap, Rect, Shade, StrokeState, Text, TextPage, TextPageFlags, }; mod native; @@ -206,11 +206,11 @@ impl Device { }) } - pub fn from_text_page(page: &TextPage, opts: TextPageOptions) -> Result { + pub fn from_text_page(page: &TextPage, opts: TextPageFlags) -> Result { unsafe { 
ffi_try!(mupdf_new_stext_device( context(), - page.inner, + page.as_ptr().cast_mut(), opts.bits() as _ )) } diff --git a/src/display_list.rs b/src/display_list.rs index e58ea97..e8ea83e 100644 --- a/src/display_list.rs +++ b/src/display_list.rs @@ -1,10 +1,10 @@ -use std::ffi::CString; +use std::{ffi::CString, ptr::NonNull}; use mupdf_sys::*; use crate::{ array::FzArray, context, rust_vec_from_ffi_ptr, Colorspace, Cookie, Device, Error, Image, - Matrix, Pixmap, Quad, Rect, TextPage, TextPageOptions, + Matrix, Pixmap, Quad, Rect, TextPage, TextPageFlags, }; #[derive(Debug)] @@ -40,15 +40,18 @@ impl DisplayList { .map(|inner| unsafe { Pixmap::from_raw(inner) }) } - pub fn to_text_page(&self, opts: TextPageOptions) -> Result { - unsafe { + pub fn to_text_page(&self, opts: TextPageFlags) -> Result { + let inner = unsafe { ffi_try!(mupdf_display_list_to_text_page( context(), self.inner, opts.bits() as _ - )) - } - .map(|inner| unsafe { TextPage::from_raw(inner) }) + ))? + }; + + let inner = unsafe { NonNull::new_unchecked(inner) }; + + Ok(TextPage { inner }) } pub fn to_image(&self, width: f32, height: f32) -> Result { diff --git a/src/lib.rs b/src/lib.rs index 94c57d0..068642e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,6 +38,8 @@ pub mod link; pub mod matrix; /// Outline pub mod outline; +/// Output +pub mod output; /// Document page pub mod page; /// Path type @@ -107,7 +109,7 @@ pub use shade::Shade; pub use size::Size; pub use stroke_state::{LineCap, LineJoin, StrokeState}; pub use text::{Text, TextItem, TextSpan}; -pub use text_page::{TextBlock, TextChar, TextLine, TextPage, TextPageOptions}; +pub use text_page::{TextBlock, TextChar, TextLine, TextPage, TextPageFlags}; use core::{marker::PhantomData, ptr::NonNull}; use zerocopy::{FromBytes, IntoBytes}; diff --git a/src/output.rs b/src/output.rs new file mode 100644 index 0000000..f8944f9 --- /dev/null +++ b/src/output.rs @@ -0,0 +1,29 @@ +use std::ptr::NonNull; + +use mupdf_sys::{fz_close_output, 
fz_drop_output, fz_new_output_with_buffer, fz_output}; + +use crate::{context, Buffer}; + +pub struct Output { + pub(crate) inner: NonNull, +} + +impl Drop for Output { + fn drop(&mut self) { + unsafe { + fz_close_output(context(), self.inner.as_ptr()); + fz_drop_output(context(), self.inner.as_ptr()); + } + } +} + +impl Output { + pub fn from_buffer(buf: &Buffer) -> Self { + let inner = unsafe { fz_new_output_with_buffer(context(), buf.inner) }; + + // SAFETY: fz_new_output_with_buffer never returns NULL + let inner = unsafe { NonNull::new_unchecked(inner) }; + + Self { inner } + } +} diff --git a/src/page.rs b/src/page.rs index 77e75d5..b30e492 100644 --- a/src/page.rs +++ b/src/page.rs @@ -8,7 +8,7 @@ use crate::array::FzArray; use crate::{ context, rust_vec_from_ffi_ptr, unsafe_impl_ffi_wrapper, Buffer, Colorspace, Cookie, Device, DisplayList, Error, FFIWrapper, Link, Matrix, Pixmap, Quad, Rect, Separations, TextPage, - TextPageOptions, + TextPageFlags, }; #[derive(Debug)] @@ -96,15 +96,29 @@ impl Page { Ok(svg) } - pub fn to_text_page(&self, opts: TextPageOptions) -> Result { - unsafe { - ffi_try!(mupdf_page_to_text_page( + pub fn to_text_page(&self, flags: TextPageFlags) -> Result { + let opts = fz_stext_options { + flags: flags.bits() as _, + scale: 0.0, + clip: fz_rect { + x0: 0.0, + y0: 0.0, + x1: 0.0, + y1: 0.0, + }, + }; + + let inner = unsafe { + ffi_try!(mupdf_new_stext_page_from_page( context(), - self.as_ptr() as *mut _, - opts.bits() as _ - )) - } - .map(|inner| unsafe { TextPage::from_raw(inner) }) + self.as_ptr().cast_mut(), + &opts + ))? 
+ }; + + let inner = unsafe { NonNull::new_unchecked(inner) }; + + Ok(TextPage { inner }) } pub fn to_display_list(&self, annotations: bool) -> Result { @@ -234,52 +248,6 @@ impl Page { } } - pub fn to_html(&self) -> Result { - let inner = unsafe { ffi_try!(mupdf_page_to_html(context(), self.as_ptr() as *mut _)) }?; - let mut buf = unsafe { Buffer::from_raw(inner) }; - let mut out = String::new(); - buf.read_to_string(&mut out)?; - Ok(out) - } - - pub fn stext_page_as_json_from_page(&self, scale: f32) -> Result { - let inner = unsafe { - ffi_try!(mupdf_stext_page_as_json_from_page( - context(), - self.as_ptr() as *mut _, - scale - )) - }?; - let mut buf = unsafe { Buffer::from_raw(inner) }; - let mut res = String::new(); - buf.read_to_string(&mut res)?; - Ok(res) - } - - pub fn to_xhtml(&self) -> Result { - let inner = unsafe { ffi_try!(mupdf_page_to_xhtml(context(), self.as_ptr() as *mut _)) }?; - let mut buf = unsafe { Buffer::from_raw(inner) }; - let mut out = String::new(); - buf.read_to_string(&mut out)?; - Ok(out) - } - - pub fn to_xml(&self) -> Result { - let inner = unsafe { ffi_try!(mupdf_page_to_xml(context(), self.as_ptr() as *mut _)) }?; - let mut buf = unsafe { Buffer::from_raw(inner) }; - let mut out = String::new(); - buf.read_to_string(&mut out)?; - Ok(out) - } - - pub fn to_text(&self) -> Result { - let inner = unsafe { ffi_try!(mupdf_page_to_text(context(), self.as_ptr() as *mut _)) }?; - let mut buf = unsafe { Buffer::from_raw(inner) }; - let mut out = String::new(); - buf.read_to_string(&mut out)?; - Ok(out) - } - pub fn links(&self) -> Result { unsafe { ffi_try!(mupdf_load_links(context(), self.as_ptr() as *mut _)) }.map(|next| { LinkIter { @@ -446,38 +414,6 @@ mod test { assert!(!svg.is_empty()); } - #[test] - fn test_page_to_html() { - let doc = test_document!("..", "files/dummy.pdf").unwrap(); - let page0 = doc.load_page(0).unwrap(); - let html = page0.to_html().unwrap(); - assert!(!html.is_empty()); - } - - #[test] - fn test_page_to_xhtml() 
{ - let doc = test_document!("..", "files/dummy.pdf").unwrap(); - let page0 = doc.load_page(0).unwrap(); - let xhtml = page0.to_xhtml().unwrap(); - assert!(!xhtml.is_empty()); - } - - #[test] - fn test_page_to_xml() { - let doc = test_document!("..", "files/dummy.pdf").unwrap(); - let page0 = doc.load_page(0).unwrap(); - let xml = page0.to_xml().unwrap(); - assert!(!xml.is_empty()); - } - - #[test] - fn test_page_to_text() { - let doc = test_document!("..", "files/dummy.pdf").unwrap(); - let page0 = doc.load_page(0).unwrap(); - let text = page0.to_text().unwrap(); - assert!(!text.is_empty()); - } - #[test] fn test_page_to_display_list() { let doc = test_document!("..", "files/dummy.pdf").unwrap(); @@ -488,13 +424,11 @@ mod test { #[test] fn test_page_to_text_page() { - use crate::TextPageOptions; + use crate::TextPageFlags; let doc = test_document!("..", "files/dummy.pdf").unwrap(); let page0 = doc.load_page(0).unwrap(); - let _tp = page0 - .to_text_page(TextPageOptions::PRESERVE_IMAGES) - .unwrap(); + let _tp = page0.to_text_page(TextPageFlags::PRESERVE_IMAGES).unwrap(); } #[test] diff --git a/src/text_page.rs b/src/text_page.rs index e11f227..49a4d89 100644 --- a/src/text_page.rs +++ b/src/text_page.rs @@ -11,45 +11,143 @@ use bitflags::bitflags; use mupdf_sys::*; use num_enum::TryFromPrimitive; -use crate::FFIAnalogue; use crate::{ - context, rust_slice_to_ffi_ptr, Buffer, Error, Image, Matrix, Point, Quad, Rect, WriteMode, + context, rust_slice_to_ffi_ptr, unsafe_impl_ffi_wrapper, Buffer, Error, FFIWrapper, Image, + Matrix, Point, Quad, Rect, WriteMode, }; +use crate::{output::Output, FFIAnalogue}; bitflags! { /// Options for creating a pixmap and draw device. 
- pub struct TextPageOptions: u32 { - const BLOCK_IMAGE = FZ_STEXT_BLOCK_IMAGE as _; - const BLOCK_TEXT = FZ_STEXT_BLOCK_TEXT as _; - const INHIBIT_SPACES = FZ_STEXT_INHIBIT_SPACES as _; - const PRESERVE_IMAGES = FZ_STEXT_PRESERVE_IMAGES as _; + pub struct TextPageFlags: u32 { const PRESERVE_LIGATURES = FZ_STEXT_PRESERVE_LIGATURES as _; const PRESERVE_WHITESPACE = FZ_STEXT_PRESERVE_WHITESPACE as _; + const PRESERVE_IMAGES = FZ_STEXT_PRESERVE_IMAGES as _; + const INHIBIT_SPACES = FZ_STEXT_INHIBIT_SPACES as _; + const DEHYPHENATE = FZ_STEXT_DEHYPHENATE as _; + const PRESERVE_SPANS = FZ_STEXT_PRESERVE_SPANS as _; + const CLIP = FZ_STEXT_CLIP as _; + const USE_CID_FOR_UNKNOWN_UNICODE = FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE as _; + const COLLECT_STRUCTURE = FZ_STEXT_COLLECT_STRUCTURE as _; + const ACCURATE_BBOXES = FZ_STEXT_ACCURATE_BBOXES as _; + const COLLECT_VECTORS = FZ_STEXT_COLLECT_VECTORS as _; + const IGNORE_ACTUALTEXT = FZ_STEXT_IGNORE_ACTUALTEXT as _; + const SEGMENT = FZ_STEXT_SEGMENT as _; + const PARAGRAPH_BREAK = FZ_STEXT_PARAGRAPH_BREAK as _; + const TABLE_HUNT = FZ_STEXT_TABLE_HUNT as _; + const COLLECT_STYLES = FZ_STEXT_COLLECT_STYLES as _; + const USE_GID_FOR_UNKNOWN_UNICODE = FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE as _; + const ACCURATE_ASCENDERS = FZ_STEXT_ACCURATE_ASCENDERS as _; + const ACCURATE_SIDE_BEARINGS = FZ_STEXT_ACCURATE_SIDE_BEARINGS as _; } } /// A text page is a list of blocks, together with an overall bounding box #[derive(Debug)] pub struct TextPage { - pub(crate) inner: *mut fz_stext_page, + pub(crate) inner: NonNull, } +unsafe_impl_ffi_wrapper!(TextPage, fz_stext_page, fz_drop_stext_page); + impl TextPage { - pub(crate) unsafe fn from_raw(ptr: *mut fz_stext_page) -> Self { - Self { inner: ptr } + pub fn to_html(&self, id: i32) -> Result { + let mut buf = Buffer::with_capacity(8192); + + let out = Output::from_buffer(&buf); + unsafe { + ffi_try!(mupdf_print_stext_page_as_html( + context(), + out.inner.as_ptr(), + self.inner.as_ptr(), + 
id + ))? + }; + drop(out); + + let mut res = String::new(); + buf.read_to_string(&mut res)?; + Ok(res) + } + + pub fn to_xhtml(&self, id: i32) -> Result { + let mut buf = Buffer::with_capacity(8192); + + let out = Output::from_buffer(&buf); + unsafe { + ffi_try!(mupdf_print_stext_page_as_xhtml( + context(), + out.inner.as_ptr(), + self.inner.as_ptr(), + id + ))? + }; + drop(out); + + let mut res = String::new(); + buf.read_to_string(&mut res)?; + Ok(res) + } + + pub fn to_xml(&self, id: i32) -> Result { + let mut buf = Buffer::with_capacity(8192); + + let out = Output::from_buffer(&buf); + unsafe { + ffi_try!(mupdf_print_stext_page_as_xml( + context(), + out.inner.as_ptr(), + self.inner.as_ptr(), + id + ))? + }; + drop(out); + + let mut res = String::new(); + buf.read_to_string(&mut res)?; + Ok(res) } pub fn to_text(&self) -> Result { - let inner = unsafe { ffi_try!(mupdf_stext_page_to_text(context(), self.inner)) }?; - let mut buf = unsafe { Buffer::from_raw(inner) }; - let mut text = String::new(); - buf.read_to_string(&mut text)?; - Ok(text) + let mut buf = Buffer::with_capacity(8192); + + let out = Output::from_buffer(&buf); + unsafe { + ffi_try!(mupdf_print_stext_page_as_text( + context(), + out.inner.as_ptr(), + self.inner.as_ptr() + ))? + }; + drop(out); + + let mut res = String::new(); + buf.read_to_string(&mut res)?; + Ok(res) + } + + pub fn to_json(&self, scale: f32) -> Result { + let mut buf = Buffer::with_capacity(8192); + + let out = Output::from_buffer(&buf); + unsafe { + ffi_try!(mupdf_print_stext_page_as_json( + context(), + out.inner.as_ptr(), + self.inner.as_ptr(), + scale + ))? 
+ }; + drop(out); + + let mut res = String::new(); + buf.read_to_string(&mut res)?; + Ok(res) } pub fn blocks(&self) -> TextBlockIter { TextBlockIter { - next: unsafe { (*self.inner).first_block }, + next: unsafe { (*self.as_ptr().cast_mut()).first_block }, _marker: PhantomData, } } @@ -152,7 +250,7 @@ impl TextPage { unsafe { ffi_try!(mupdf_search_stext_page_cb( context(), - self.inner, + self.as_ptr().cast_mut(), c_needle.as_ptr(), Some(ffi_cb::), &raw mut opaque as *mut c_void @@ -172,7 +270,7 @@ impl TextPage { unsafe { ffi_try!(mupdf_highlight_selection( context(), - self.inner, + self.as_mut_ptr(), a.into(), b.into(), ptr as *mut fz_quad, @@ -182,16 +280,6 @@ impl TextPage { } } -impl Drop for TextPage { - fn drop(&mut self) { - if !self.inner.is_null() { - unsafe { - fz_drop_stext_page(context(), self.inner); - } - } - } -} - #[repr(i32)] pub enum SearchHitResponse { ContinueSearch = 0, @@ -362,7 +450,43 @@ impl<'a> Iterator for TextCharIter<'a> { #[cfg(test)] mod test { - use crate::{document::test_document, text_page::SearchHitResponse, Document, TextPageOptions}; + use crate::{document::test_document, text_page::SearchHitResponse, Document, TextPageFlags}; + + #[test] + fn test_page_to_html() { + let doc = test_document!("..", "files/dummy.pdf").unwrap(); + let page0 = doc.load_page(0).unwrap(); + let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); + let html = text_page.to_html(0).unwrap(); + assert!(html.contains("Dummy PDF file")); + } + + #[test] + fn test_page_to_xhtml() { + let doc = test_document!("..", "files/dummy.pdf").unwrap(); + let page0 = doc.load_page(0).unwrap(); + let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); + let xhtml = text_page.to_xhtml(0).unwrap(); + assert!(xhtml.contains("Dummy PDF file")); + } + + #[test] + fn test_page_to_xml() { + let doc = test_document!("..", "files/dummy.pdf").unwrap(); + let page0 = doc.load_page(0).unwrap(); + let text_page = 
page0.to_text_page(TextPageFlags::empty()).unwrap(); + let xml = text_page.to_xml(0).unwrap(); + assert!(xml.contains("Dummy PDF file")); + } + + #[test] + fn test_page_to_text() { + let doc = test_document!("..", "files/dummy.pdf").unwrap(); + let page0 = doc.load_page(0).unwrap(); + let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); + let text = text_page.to_text().unwrap(); + assert_eq!(text, "Dummy PDF file"); + } #[test] fn test_text_page_search() { @@ -370,7 +494,7 @@ mod test { let doc = test_document!("..", "files/dummy.pdf").unwrap(); let page0 = doc.load_page(0).unwrap(); - let text_page = page0.to_text_page(TextPageOptions::BLOCK_IMAGE).unwrap(); + let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); let hits = text_page.search("Dummy").unwrap(); assert_eq!(hits.len(), 1); assert_eq!( @@ -403,7 +527,7 @@ mod test { fn test_text_page_cb_search() { let doc = test_document!("..", "files/dummy.pdf").unwrap(); let page0 = doc.load_page(0).unwrap(); - let text_page = page0.to_text_page(TextPageOptions::BLOCK_IMAGE).unwrap(); + let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); let mut sum_x = 0.0; let num_hits = text_page .search_cb("Dummy", &mut sum_x, |acc, hits| { diff --git a/tests/test_issues.rs b/tests/test_issues.rs index 1e6b10b..711cf5c 100644 --- a/tests/test_issues.rs +++ b/tests/test_issues.rs @@ -1,5 +1,5 @@ use mupdf::pdf::PdfDocument; -use mupdf::{Error, TextPageOptions}; +use mupdf::{Error, TextPageFlags}; #[cfg(not(target_arch = "wasm32"))] #[test] @@ -21,7 +21,7 @@ fn test_issue_27_flatten() { let pages = doc .pages() .unwrap() - .map(|page| page?.to_text_page(TextPageOptions::PRESERVE_LIGATURES)) + .map(|page| page?.to_text_page(TextPageFlags::PRESERVE_LIGATURES)) .collect::, Error>>() .unwrap(); // The original code from the issue doesn't compile anymore since `pages` is required to hold @@ -76,11 +76,13 @@ fn test_issue_86_invalid_utf8() { .unwrap(); for (idx, page) in 
doc.pages().unwrap().enumerate() { let page = page.unwrap(); - let text = page.to_text(); + let text_page = page.to_text_page(TextPageFlags::empty()).unwrap(); + + let text = text_page.to_text(); assert!(text.is_ok()); println!("page: {idx}, text: {}", text.unwrap()); - let json = page.stext_page_as_json_from_page(1.0); + let json = text_page.to_json(1.0); assert!(json.is_ok()); // Validate JSON parsing @@ -95,11 +97,13 @@ fn test_issue_i32_box() { let doc = PdfDocument::from_bytes(include_bytes!("../tests/files/i32-box.pdf")).unwrap(); for (idx, page) in doc.pages().unwrap().enumerate() { let page = page.unwrap(); - let text = page.to_text(); + let text_page = page.to_text_page(TextPageFlags::empty()).unwrap(); + + let text = text_page.to_text(); assert!(text.is_ok()); println!("page: {idx}, text: {}", text.unwrap()); - let json = page.stext_page_as_json_from_page(1.0); + let json = text_page.to_json(1.0); assert!(json.is_ok()); let stext_page: Result = @@ -112,6 +116,7 @@ fn test_issue_i32_box() { fn test_issue_no_json() { let doc = PdfDocument::from_bytes(include_bytes!("../tests/files/no-json.pdf")).unwrap(); let page = doc.load_page(0).unwrap(); - let json = page.stext_page_as_json_from_page(1.0); + let text_page = page.to_text_page(TextPageFlags::empty()).unwrap(); + let json = text_page.to_json(1.0); assert!(json.is_err()); } From 02ee9b26189418ff24996f601050bbbf4f035a72 Mon Sep 17 00:00:00 2001 From: ginnyTheCat Date: Mon, 26 May 2025 06:14:38 +0200 Subject: [PATCH 2/3] Fix test failures --- examples/extract_stext.rs | 2 -- src/page.rs | 27 +++++++++------------------ src/text_page.rs | 2 +- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/examples/extract_stext.rs b/examples/extract_stext.rs index f6f577c..9143e4d 100644 --- a/examples/extract_stext.rs +++ b/examples/extract_stext.rs @@ -1,5 +1,3 @@ -use std::io; - use mupdf::{page::StextPage, Document, TextPageFlags}; fn main() -> Result<(), Box> { diff --git a/src/page.rs b/src/page.rs 
index b30e492..a3f8580 100644 --- a/src/page.rs +++ b/src/page.rs @@ -383,26 +383,17 @@ mod test { fn test_get_stext_page_as_json() { let doc = test_document!("..", "files/dummy.pdf").unwrap(); let page = doc.load_page(0).unwrap(); - match page.stext_page_as_json_from_page(1.0) { - Ok(stext_json) => { - let stext_page: serde_json::Result = - serde_json::from_str(stext_json.as_str()); - match stext_page { - Ok(res) => { - for block in res.blocks { - if block.r#type.eq("text") { - for line in block.lines { - assert_eq!(&line.text, &"Dummy PDF file".to_string()); - } - } - } - } - Err(err) => { - println!("stext_page parsing error: {:?}", &err); - } + let text_page = page.to_text_page(crate::TextPageFlags::empty()).unwrap(); + + let json = text_page.to_json(1.0).unwrap(); + let stext_page: crate::page::StextPage = serde_json::from_str(json.as_str()).unwrap(); + + for block in stext_page.blocks { + if block.r#type == "text" { + for line in block.lines { + assert_eq!(&line.text, &"Dummy PDF file".to_string()); } } - Err(_err) => {} } } diff --git a/src/text_page.rs b/src/text_page.rs index 49a4d89..a0bfca9 100644 --- a/src/text_page.rs +++ b/src/text_page.rs @@ -485,7 +485,7 @@ mod test { let page0 = doc.load_page(0).unwrap(); let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); let text = text_page.to_text().unwrap(); - assert_eq!(text, "Dummy PDF file"); + assert_eq!(text, "Dummy PDF file\n\n"); } #[test] From b9a3eefb44579fb96e9086584ffdde279e7e2cef Mon Sep 17 00:00:00 2001 From: ginnyTheCat Date: Mon, 26 May 2025 16:28:42 +0200 Subject: [PATCH 3/3] Allow specifying whether to add header and trailer --- mupdf-sys/wrapper.c | 74 +++++++++++++++++++++++++++++++++++++-------- src/text_page.rs | 40 +++++++++++++++++++++--- 2 files changed, 97 insertions(+), 17 deletions(-) diff --git a/mupdf-sys/wrapper.c b/mupdf-sys/wrapper.c index 3c696fd..1943e03 100644 --- a/mupdf-sys/wrapper.c +++ b/mupdf-sys/wrapper.c @@ -858,21 +858,21 @@ void 
mupdf_run_page_widgets(fz_context *ctx, fz_page *page, fz_device *device, f } fz_output *mupdf_new_output_with_buffer(fz_context *ctx, fz_buffer *buf, mupdf_error_t **errptr) { - fz_output* output; - fz_try(ctx) - { - output = fz_new_output_with_buffer(ctx, buf); - } - fz_catch(ctx) - { + fz_output* output; + fz_try(ctx) + { + output = fz_new_output_with_buffer(ctx, buf); + } + fz_catch(ctx) + { mupdf_save_error(ctx, errptr); - } - return output; + } + return output; } void mupdf_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr) { - fz_try(ctx) + fz_try(ctx) { fz_print_stext_page_as_html(ctx, out, page, id); } @@ -882,9 +882,33 @@ void mupdf_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_pa } } +void mupdf_print_stext_header_as_html(fz_context *ctx, fz_output *out, mupdf_error_t **errptr) +{ + fz_try(ctx) + { + fz_print_stext_header_as_html(ctx, out); + } + fz_catch(ctx) + { + mupdf_save_error(ctx, errptr); + } +} + +void mupdf_print_stext_trailer_as_html(fz_context *ctx, fz_output *out, mupdf_error_t **errptr) +{ + fz_try(ctx) + { + fz_print_stext_trailer_as_html(ctx, out); + } + fz_catch(ctx) + { + mupdf_save_error(ctx, errptr); + } +} + void mupdf_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr) { - fz_try(ctx) + fz_try(ctx) { fz_print_stext_page_as_xhtml(ctx, out, page, id); } @@ -894,9 +918,33 @@ void mupdf_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_p } } +void mupdf_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out, mupdf_error_t **errptr) +{ + fz_try(ctx) + { + fz_print_stext_header_as_xhtml(ctx, out); + } + fz_catch(ctx) + { + mupdf_save_error(ctx, errptr); + } +} + +void mupdf_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out, mupdf_error_t **errptr) +{ + fz_try(ctx) + { + fz_print_stext_trailer_as_xhtml(ctx, out); + } + fz_catch(ctx) + { + mupdf_save_error(ctx, 
errptr); + } +} + void mupdf_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id, mupdf_error_t **errptr) { - fz_try(ctx) + fz_try(ctx) { fz_print_stext_page_as_xml(ctx, out, page, id); } @@ -908,7 +956,7 @@ void mupdf_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_pag void mupdf_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page, mupdf_error_t **errptr) { - fz_try(ctx) + fz_try(ctx) { fz_print_stext_page_as_text(ctx, out, page); } diff --git a/src/text_page.rs b/src/text_page.rs index a0bfca9..9471b83 100644 --- a/src/text_page.rs +++ b/src/text_page.rs @@ -51,10 +51,18 @@ pub struct TextPage { unsafe_impl_ffi_wrapper!(TextPage, fz_stext_page, fz_drop_stext_page); impl TextPage { - pub fn to_html(&self, id: i32) -> Result { + pub fn to_html(&self, id: i32, full: bool) -> Result { let mut buf = Buffer::with_capacity(8192); let out = Output::from_buffer(&buf); + if full { + unsafe { + ffi_try!(mupdf_print_stext_header_as_html( + context(), + out.inner.as_ptr() + ))? + }; + } unsafe { ffi_try!(mupdf_print_stext_page_as_html( context(), @@ -63,6 +71,14 @@ impl TextPage { id ))? }; + if full { + unsafe { + ffi_try!(mupdf_print_stext_trailer_as_html( + context(), + out.inner.as_ptr() + ))? + }; + } drop(out); let mut res = String::new(); @@ -75,13 +91,21 @@ impl TextPage { let out = Output::from_buffer(&buf); unsafe { + ffi_try!(mupdf_print_stext_header_as_xhtml( + context(), + out.inner.as_ptr() + ))?; ffi_try!(mupdf_print_stext_page_as_xhtml( context(), out.inner.as_ptr(), self.inner.as_ptr(), id - ))? 
- }; + ))?; + ffi_try!(mupdf_print_stext_trailer_as_xhtml( + context(), + out.inner.as_ptr() + ))?; + } drop(out); let mut res = String::new(); @@ -457,7 +481,13 @@ mod test { let doc = test_document!("..", "files/dummy.pdf").unwrap(); let page0 = doc.load_page(0).unwrap(); let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); - let html = text_page.to_html(0).unwrap(); + + let html = text_page.to_html(0, false).unwrap(); + assert!(!html.starts_with("")); + assert!(html.contains("Dummy PDF file")); + + let html = text_page.to_html(0, true).unwrap(); + assert!(html.starts_with("")); assert!(html.contains("Dummy PDF file")); } @@ -466,7 +496,9 @@ mod test { let doc = test_document!("..", "files/dummy.pdf").unwrap(); let page0 = doc.load_page(0).unwrap(); let text_page = page0.to_text_page(TextPageFlags::empty()).unwrap(); + let xhtml = text_page.to_xhtml(0).unwrap(); + assert!(xhtml.starts_with("