Skip to content

Commit 633fd20

Browse files
Remove MuPDF dependency and refactor PDF processing logic
- Eliminated the `mupdf` dependency from `Cargo.toml` and `Cargo.lock`.
- Refactored the `PdfProcessor` to use the `pdf_extract` library exclusively for text extraction, simplifying the code and improving maintainability.
- Removed the MuPDF backend implementation from the PDF processing logic, so the non-OCR path now always extracts text via `pdf_extract::extract_text`.
1 parent 48788a9 commit 633fd20

File tree

3 files changed

+1
-230
lines changed

3 files changed

+1
-230
lines changed

Cargo.lock

Lines changed: 0 additions & 193 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

processors/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ htmd = "0.1.6"
2323

2424
# PDF processing
2525
pdf-extract = {workspace = true}
26-
mupdf = "0.5.0"
2726
docx-parser = "0.1.1"
2827
pdf2image = "0.1.3"
2928
image = "0.25.6"

processors/src/pdf/pdf_processor.rs

Lines changed: 1 addition & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -47,42 +47,7 @@ impl FileProcessor for PdfProcessor {
4747
let tesseract_path = self.ocr_config.tesseract_path.as_deref();
4848
extract_text_with_ocr(&path, tesseract_path)?
4949
} else {
50-
match self.backend {
51-
PdfBackend::MuPdf => {
52-
let mut page_texts = Vec::new();
53-
{
54-
let document = mupdf::document::Document::open(path.as_ref())?;
55-
let pages = document.pages()?;
56-
57-
for (page_number, page_result) in pages.enumerate() {
58-
let page = page_result?;
59-
let text_page =
60-
page.to_text_page(mupdf::text_page::TextPageOptions::empty())?;
61-
62-
let mut page_text = String::new();
63-
for block in text_page.blocks() {
64-
for line in block.lines() {
65-
let chars: String =
66-
line.chars().map(|c| c.char().unwrap_or(' ')).collect();
67-
page_text.push_str(&chars);
68-
page_text.push('\n');
69-
}
70-
page_text.push('\n');
71-
}
72-
73-
page_texts.push((page_number, page_text));
74-
}
75-
}
76-
page_texts
77-
.into_iter()
78-
.map(|(_, text)| text)
79-
.collect::<Vec<String>>()
80-
.join("\n")
81-
}
82-
PdfBackend::LoPdf => {
83-
pdf_extract::extract_text(path.as_ref()).map_err(|e| anyhow::anyhow!(e))?
84-
}
85-
}
50+
pdf_extract::extract_text(path.as_ref()).map_err(|e| anyhow::anyhow!(e))?
8651
};
8752

8853
self.markdown_processor.process_document(&content)

0 commit comments

Comments
 (0)