Skip to content

Commit 577f7b1

Browse files
Update version to 0.6.0 and enhance PDF processing capabilities
- Bumped the version of `embed_anything`, `embed_anything_python`, and `processors` to 0.6.0 in `Cargo.toml` and `Cargo.lock`. - Introduced a new `pdf_backend` configuration option in `TextEmbedConfig` for selecting between `mupdf` and `lopdf` backends for PDF text extraction. - Updated example usages to reflect the new `pdf_backend` parameter. - Revised documentation to clarify the new features and improvements.
1 parent 5f60d35 commit 577f7b1

File tree

7 files changed

+31
-15
lines changed

7 files changed

+31
-15
lines changed

Cargo.lock

Lines changed: 4 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ description = "Embed anything at lightning speed"
1515
repository = "https://github.com/StarlightSearch/EmbedAnything"
1616
authors = ["Akshay Ballal <arballal95@gmail.com>"]
1717
exclude = ["test_files/*", "python", "*.py", "pyproject.toml", "examples/images/*", "mkdocs.yml", "docs/*", "tests/*", ".github", "Dockerfile", "docs"]
18-
version = "0.5.6"
18+
version = "0.6.0"
1919

2020
[workspace.dependencies]
2121
pdf-extract = "0.7.7"

docs/blog/posts/ReleaseNotes5-5.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Super Excited to share the latest development in our library, which essentially
1111

1212
<!-- more -->
1313

14-
## Support for late chunking.
14+
## Late Chunking
1515

1616
The new 0.5.6 version adds Late Chunking to EmbedAnything, a technique introduced by Jina AI and Weaviate.
1717
Here's how we've implemented Late Chunking in EA:
@@ -46,7 +46,7 @@ data: list[EmbedData] = model.embed_file("test_files/attention.pdf", config=conf
4646
```
4747

4848

49-
## 𝘊𝘰𝘩𝘦𝘳𝘦 𝘌𝘮𝘣𝘦𝘥 4:
49+
## Cohere Embed 4:
5050

5151
🧊 Single embedding per document, even for multimodal inputs
5252
📚 Handles up to 128K tokens – perfect for long-form business documents
@@ -77,11 +77,7 @@ model = embed_anything.EmbeddingModel.from_pretrained_hf(
7777

7878
## Processor Crate:
7979

80-
This crate contains various "processors" that accept files/folders/bytes and produced a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation!
81-
82-
83-
84-
80+
This crate contains various "processors" that accept files and produce a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation! It also supports PDF text extraction via two different backends at the moment: MuPDF and LoPDF. You can choose between the two by passing `pdf_backend` as either `mupdf` or `lopdf` in `TextEmbedConfig`.
8581

8682
We have also received some additional cool feature requests on GitHub, which we would like to implement. If you want to help out, please check out EmbedAnything on GitHub. We would love to have your contribution. 🚀
8783

examples/text.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
def embed_directory_example():
1414
# Configure the embedding process
1515
config = TextEmbedConfig(
16-
chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence"
16+
chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence", pdf_backend="lopdf"
1717
)
1818

1919
# Start timing
@@ -73,7 +73,7 @@ def embed_file_example():
7373
# Example 4: Embed files in a batch
7474
def embed_files_batch_example():
7575

76-
config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64)
76+
config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64, pdf_backend="mupdf")
7777

7878
data = model.embed_files_batch(["test_files/bank.txt", "test_files/test.pdf"])
7979

python/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ crate-type = ["cdylib"]
99

1010
[dependencies]
1111
embed_anything = {path = "../rust", features = ["ort"]}
12+
processors = {path = "../processors"}
1213
pyo3 = { version = "0.23.2"}
1314
tokio = { version = "1.39.0", features = ["rt-multi-thread"]}
1415
strum = {workspace = true}

python/python/embed_anything/_embed_anything.pyi

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,7 @@ class TextEmbedConfig:
502502
semantic_encoder: The semantic encoder for the Text Embedding model. Default is None.
503503
use_ocr: A flag indicating whether to use OCR for the Text Embedding model. Default is False.
504504
tesseract_path: The path to the Tesseract OCR executable. Default is None and uses the system path.
505+
pdf_backend: The backend to use for PDF text extraction. Options are `mupdf` and `lopdf`. Default is `lopdf`.
505506
"""
506507

507508
def __init__(
@@ -515,6 +516,7 @@ class TextEmbedConfig:
515516
semantic_encoder: EmbeddingModel | None = None,
516517
use_ocr: bool | None = False,
517518
tesseract_path: str | None = None,
519+
pdf_backend: str | None = "lopdf",
518520
):
519521
self.chunk_size = chunk_size
520522
self.overlap_ratio = overlap_ratio
@@ -525,6 +527,7 @@ class TextEmbedConfig:
525527
self.semantic_encoder = semantic_encoder
526528
self.use_ocr = use_ocr
527529
self.tesseract_path = tesseract_path
530+
self.pdf_backend = pdf_backend
528531
chunk_size: int | None
529532
overlap_ratio: float | None
530533
batch_size: int | None
@@ -534,6 +537,7 @@ class TextEmbedConfig:
534537
semantic_encoder: EmbeddingModel | None
535538
use_ocr: bool | None
536539
tesseract_path: str | None
540+
pdf_backend: str | None
537541

538542
class ImageEmbedConfig:
539543
"""

python/src/config.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use crate::EmbeddingModel;
22
use embed_anything::config::SplittingStrategy;
33
use pyo3::prelude::*;
4+
use processors::pdf::pdf_processor::PdfBackend;
45

56
#[pyclass]
67
#[derive(Default)]
@@ -12,7 +13,7 @@ pub struct TextEmbedConfig {
1213
#[pymethods]
1314
impl TextEmbedConfig {
1415
#[new]
15-
#[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None))]
16+
#[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None, pdf_backend=None))]
1617
pub fn new(
1718
chunk_size: Option<usize>,
1819
batch_size: Option<usize>,
@@ -23,7 +24,19 @@ impl TextEmbedConfig {
2324
semantic_encoder: Option<&EmbeddingModel>,
2425
use_ocr: Option<bool>,
2526
tesseract_path: Option<&str>,
27+
pdf_backend: Option<&str>,
2628
) -> Self {
29+
let pdf_backend = match pdf_backend {
30+
Some(backend) => {
31+
match backend {
32+
"mupdf" => PdfBackend::MuPdf,
33+
"lopdf" => PdfBackend::LoPdf,
34+
_ => panic!("Unknown PDF backend provided!"),
35+
}
36+
}
37+
None => PdfBackend::LoPdf,
38+
};
39+
2740
let strategy = match splitting_strategy {
2841
Some(strategy) => {
2942
match strategy {
@@ -49,7 +62,8 @@ impl TextEmbedConfig {
4962
.with_buffer_size(buffer_size.unwrap_or(100))
5063
.with_splitting_strategy(strategy)
5164
.with_late_chunking(late_chunking.unwrap_or(false))
52-
.with_ocr(use_ocr.unwrap_or(false), tesseract_path),
65+
.with_ocr(use_ocr.unwrap_or(false), tesseract_path)
66+
.with_pdf_backend(pdf_backend),
5367
}
5468
}
5569

0 commit comments

Comments
 (0)