Skip to content

Commit 577f7b1

Browse files
Update version to 0.6.0 and enhance PDF processing capabilities
- Bumped the version of `embed_anything`, `embed_anything_python`, and `processors` to 0.6.0 in `Cargo.toml` and `Cargo.lock`. - Introduced a new `pdf_backend` configuration option in `TextEmbedConfig` for selecting between `mupdf` and `lopdf` backends for PDF text extraction. - Updated example usages to reflect the new `pdf_backend` parameter. - Revised documentation to clarify the new features and improvements.
1 parent 5f60d35 commit 577f7b1

File tree

7 files changed

+31
-15
lines changed

7 files changed

+31
-15
lines changed

Cargo.lock

Lines changed: 4 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ description = "Embed anything at lightning speed"
1515
repository = "https://github.com/StarlightSearch/EmbedAnything"
1616
authors = ["Akshay Ballal <arballal95@gmail.com>"]
1717
exclude = ["test_files/*", "python", "*.py", "pyproject.toml", "examples/images/*", "mkdocs.yml", "docs/*", "tests/*", ".github", "Dockerfile", "docs"]
18-
version = "0.5.6"
18+
version = "0.6.0"
1919

2020
[workspace.dependencies]
2121
pdf-extract = "0.7.7"

docs/blog/posts/ReleaseNotes5-5.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Super Excited to share the latest development in our library, which essentially
1111

1212
<!-- more -->
1313

14-
## Support for late chunking.
14+
## Late Chunking
1515

1616
The new 0.5.6 version adds Late Chunking to EmbedAnything, a technique introduced by Jina AI and Weaviate.
1717
Here's how we've implemented Late Chunking in EA:
@@ -46,7 +46,7 @@ data: list[EmbedData] = model.embed_file("test_files/attention.pdf", config=conf
4646
```
4747

4848

49-
## 𝘊𝘰𝘩𝘦𝘳𝘦 𝘌𝘮𝘣𝘦𝘥 4:
49+
## Cohere Embed 4:
5050

5151
🧊 Single embedding per document, even for multimodal inputs
5252
📚 Handles up to 128K tokens – perfect for long-form business documents
@@ -77,11 +77,7 @@ model = embed_anything.EmbeddingModel.from_pretrained_hf(
7777

7878
## Processor Crate:
7979

80-
This crate contains various "processors" that accept files/folders/bytes and produced a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation!
81-
82-
83-
84-
80+
This crate contains various "processors" that accept files and produce a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation! It also supports PDF text extraction via two different backends at the moment: MuPDF and LoPDF. You can choose between the two by passing `pdf_backend` as either `mupdf` or `lopdf` in `TextEmbedConfig`.
8581

8682
We have also received some additional cool feature requests on GitHub, which we would like to implement. If you want to help out, please check out EmbedAnything on GitHub. We would love to have your contribution. 🚀
8783

examples/text.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
def embed_directory_example():
1414
# Configure the embedding process
1515
config = TextEmbedConfig(
16-
chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence"
16+
chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence", pdf_backend="lopdf"
1717
)
1818

1919
# Start timing
@@ -73,7 +73,7 @@ def embed_file_example():
7373
# Example 4: Embed files in a batch
7474
def embed_files_batch_example():
7575

76-
config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64)
76+
config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64, pdf_backend="mupdf")
7777

7878
data = model.embed_files_batch(["test_files/bank.txt", "test_files/test.pdf"])
7979

python/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ crate-type = ["cdylib"]
99

1010
[dependencies]
1111
embed_anything = {path = "../rust", features = ["ort"]}
12+
processors = {path = "../processors"}
1213
pyo3 = { version = "0.23.2"}
1314
tokio = { version = "1.39.0", features = ["rt-multi-thread"]}
1415
strum = {workspace = true}

python/python/embed_anything/_embed_anything.pyi

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,7 @@ class TextEmbedConfig:
502502
semantic_encoder: The semantic encoder for the Text Embedding model. Default is None.
503503
use_ocr: A flag indicating whether to use OCR for the Text Embedding model. Default is False.
504504
tesseract_path: The path to the Tesseract OCR executable. Default is None and uses the system path.
505+
pdf_backend: The backend to use for PDF text extraction. Options are `mupdf` and `lopdf`. Default is `lopdf`.
505506
"""
506507

507508
def __init__(
@@ -515,6 +516,7 @@ class TextEmbedConfig:
515516
semantic_encoder: EmbeddingModel | None = None,
516517
use_ocr: bool | None = False,
517518
tesseract_path: str | None = None,
519+
pdf_backend: str | None = "lopdf",
518520
):
519521
self.chunk_size = chunk_size
520522
self.overlap_ratio = overlap_ratio
@@ -525,6 +527,7 @@ class TextEmbedConfig:
525527
self.semantic_encoder = semantic_encoder
526528
self.use_ocr = use_ocr
527529
self.tesseract_path = tesseract_path
530+
self.pdf_backend = pdf_backend
528531
chunk_size: int | None
529532
overlap_ratio: float | None
530533
batch_size: int | None
@@ -534,6 +537,7 @@ class TextEmbedConfig:
534537
semantic_encoder: EmbeddingModel | None
535538
use_ocr: bool | None
536539
tesseract_path: str | None
540+
pdf_backend: str | None
537541

538542
class ImageEmbedConfig:
539543
"""

python/src/config.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use crate::EmbeddingModel;
22
use embed_anything::config::SplittingStrategy;
33
use pyo3::prelude::*;
4+
use processors::pdf::pdf_processor::PdfBackend;
45

56
#[pyclass]
67
#[derive(Default)]
@@ -12,7 +13,7 @@ pub struct TextEmbedConfig {
1213
#[pymethods]
1314
impl TextEmbedConfig {
1415
#[new]
15-
#[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None))]
16+
#[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None, pdf_backend=None))]
1617
pub fn new(
1718
chunk_size: Option<usize>,
1819
batch_size: Option<usize>,
@@ -23,7 +24,19 @@ impl TextEmbedConfig {
2324
semantic_encoder: Option<&EmbeddingModel>,
2425
use_ocr: Option<bool>,
2526
tesseract_path: Option<&str>,
27+
pdf_backend: Option<&str>,
2628
) -> Self {
29+
let pdf_backend = match pdf_backend {
30+
Some(backend) => {
31+
match backend {
32+
"mupdf" => PdfBackend::MuPdf,
33+
"lopdf" => PdfBackend::LoPdf,
34+
_ => panic!("Unknown PDF backend provided!"),
35+
}
36+
}
37+
None => PdfBackend::LoPdf,
38+
};
39+
2740
let strategy = match splitting_strategy {
2841
Some(strategy) => {
2942
match strategy {
@@ -49,7 +62,8 @@ impl TextEmbedConfig {
4962
.with_buffer_size(buffer_size.unwrap_or(100))
5063
.with_splitting_strategy(strategy)
5164
.with_late_chunking(late_chunking.unwrap_or(false))
52-
.with_ocr(use_ocr.unwrap_or(false), tesseract_path),
65+
.with_ocr(use_ocr.unwrap_or(false), tesseract_path)
66+
.with_pdf_backend(pdf_backend),
5367
}
5468
}
5569

0 commit comments

Comments
 (0)