Skip to content

Commit 48788a9

Browse files
Merge pull request #145 from StarlightSearch/model2vec
EmbedAnything v0.6 🚀
2 parents dd0fb9b + 577f7b1 commit 48788a9

File tree

18 files changed

+691
-57
lines changed

18 files changed

+691
-57
lines changed

Cargo.lock

Lines changed: 383 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ description = "Embed anything at lightning speed"
1515
repository = "https://github.com/StarlightSearch/EmbedAnything"
1616
authors = ["Akshay Ballal <arballal95@gmail.com>"]
1717
exclude = ["test_files/*", "python", "*.py", "pyproject.toml", "examples/images/*", "mkdocs.yml", "docs/*", "tests/*", ".github", "Dockerfile", "docs"]
18-
version = "0.5.6"
18+
version = "0.6.0"
1919

2020
[workspace.dependencies]
2121
pdf-extract = "0.7.7"

docs/blog/posts/ReleaseNotes5-5.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Super Excited to share the latest development in our library, which essentially
1111

1212
<!-- more -->
1313

14-
## Support for late chunking.
14+
## Late Chunking
1515

1616
The new 0.5.6 version adds Late Chunking to EmbedAnything, a technique introduced by Jina AI and Weaviate.
1717
Here's how we've implemented Late Chunking in EA:
@@ -46,7 +46,7 @@ data: list[EmbedData] = model.embed_file("test_files/attention.pdf", config=conf
4646
```
4747

4848

49-
## 𝘊𝘰𝘩𝘦𝘳𝘦 𝘌𝘮𝘣𝘦𝘥 4:
49+
## Cohere Embed 4:
5050

5151
🧊 Single embedding per document, even for multimodal inputs
5252
📚 Handles up to 128K tokens – perfect for long-form business documents
@@ -77,11 +77,7 @@ model = embed_anything.EmbeddingModel.from_pretrained_hf(
7777

7878
## Processor Crate:
7979

80-
This crate contains various "processors" that accept files/folders/bytes and produced a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation!
81-
82-
83-
84-
80+
This crate contains various "processors" that accept files and produce a chunked, metadata-rich document description. This is especially helpful for retrieval-augmented generation! It also supports PDF text extraction via two different backends at the moment: MuPDF and LoPDF. You can choose between the two by passing `pdf_backend` as either `mupdf` or `lopdf` in `TextEmbedConfig`.
8581

8682
We have also received some additional cool feature requests on GitHub, which we would like to implement. If you want to help out please check out EmbedAnything on GitHub. We would love to have a contribution. 🚀
8783

examples/text.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
def embed_directory_example():
1414
# Configure the embedding process
1515
config = TextEmbedConfig(
16-
chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence"
16+
chunk_size=1000, batch_size=32, buffer_size=64, splitting_strategy="sentence", pdf_backend="lopdf"
1717
)
1818

1919
# Start timing
@@ -73,7 +73,7 @@ def embed_file_example():
7373
# Example 4: Embed files in a batch
7474
def embed_files_batch_example():
7575

76-
config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64)
76+
config = TextEmbedConfig(chunk_size = 1000, batch_size = 32, buffer_size = 64, pdf_backend="mupdf")
7777

7878
data = model.embed_files_batch(["test_files/bank.txt", "test_files/test.pdf"])
7979

processors/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ htmd = "0.1.6"
2323

2424
# PDF processing
2525
pdf-extract = {workspace = true}
26+
mupdf = "0.5.0"
2627
docx-parser = "0.1.1"
2728
pdf2image = "0.1.3"
2829
image = "0.25.6"

processors/src/pdf/pdf_processor.rs

Lines changed: 69 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,42 @@
1-
use std::path::Path;
1+
use crate::markdown_processor::MarkdownProcessor;
2+
use crate::pdf::tesseract::input::{Args, Image};
3+
use crate::processor::{Document, DocumentProcessor, FileProcessor};
24
use anyhow::Error;
35
use image::DynamicImage;
46
use pdf2image::{Pages, RenderOptionsBuilder, PDF};
7+
use std::path::Path;
58
use text_splitter::ChunkConfigError;
6-
use crate::markdown_processor::MarkdownProcessor;
7-
use crate::pdf::tesseract::input::{Args, Image};
8-
use crate::processor::{Document, DocumentProcessor, FileProcessor};
9+
10+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11+
pub enum PdfBackend {
12+
LoPdf,
13+
MuPdf,
14+
}
915

1016
/// A struct for processing PDF files.
1117
pub struct PdfProcessor {
1218
markdown_processor: MarkdownProcessor,
1319
ocr_config: OcrConfig,
20+
backend: PdfBackend,
1421
}
1522

1623
pub struct OcrConfig {
1724
pub use_ocr: bool,
18-
pub tesseract_path: Option<String>
25+
pub tesseract_path: Option<String>,
1926
}
2027

2128
impl PdfProcessor {
22-
pub fn new(chunk_size: usize, overlap: usize, ocr_config: OcrConfig) -> Result<PdfProcessor, ChunkConfigError> {
29+
pub fn new(
30+
chunk_size: usize,
31+
overlap: usize,
32+
ocr_config: OcrConfig,
33+
backend: PdfBackend,
34+
) -> Result<PdfProcessor, ChunkConfigError> {
2335
let markdown_processor = MarkdownProcessor::new(chunk_size, overlap)?;
2436
Ok(PdfProcessor {
2537
markdown_processor,
2638
ocr_config,
39+
backend,
2740
})
2841
}
2942
}
@@ -34,16 +47,49 @@ impl FileProcessor for PdfProcessor {
3447
let tesseract_path = self.ocr_config.tesseract_path.as_deref();
3548
extract_text_with_ocr(&path, tesseract_path)?
3649
} else {
37-
pdf_extract::extract_text(path).map_err(|e| anyhow::anyhow!(e))?
50+
match self.backend {
51+
PdfBackend::MuPdf => {
52+
let mut page_texts = Vec::new();
53+
{
54+
let document = mupdf::document::Document::open(path.as_ref())?;
55+
let pages = document.pages()?;
56+
57+
for (page_number, page_result) in pages.enumerate() {
58+
let page = page_result?;
59+
let text_page =
60+
page.to_text_page(mupdf::text_page::TextPageOptions::empty())?;
61+
62+
let mut page_text = String::new();
63+
for block in text_page.blocks() {
64+
for line in block.lines() {
65+
let chars: String =
66+
line.chars().map(|c| c.char().unwrap_or(' ')).collect();
67+
page_text.push_str(&chars);
68+
page_text.push('\n');
69+
}
70+
page_text.push('\n');
71+
}
72+
73+
page_texts.push((page_number, page_text));
74+
}
75+
}
76+
page_texts
77+
.into_iter()
78+
.map(|(_, text)| text)
79+
.collect::<Vec<String>>()
80+
.join("\n")
81+
}
82+
PdfBackend::LoPdf => {
83+
pdf_extract::extract_text(path.as_ref()).map_err(|e| anyhow::anyhow!(e))?
84+
}
85+
}
3886
};
39-
87+
4088
self.markdown_processor.process_document(&content)
4189
}
4290
}
4391

44-
fn get_images_from_pdf<T: AsRef<Path>>(
45-
file_path: &T,
46-
) -> Result<Vec<DynamicImage>, Error> {
92+
fn get_images_from_pdf<T: AsRef<Path>>(file_path: &T) -> Result<Vec<DynamicImage>, Error> {
4793
let pdf = PDF::from_file(file_path)?;
4894
let page_count = pdf.page_count();
4995
let pages = pdf.render(
@@ -68,15 +114,15 @@ fn extract_text_with_ocr<T: AsRef<Path>>(
68114
.iter()
69115
.map(|image| extract_text_from_image(image, &Args::default().with_path(tesseract_path)))
70116
.collect();
71-
117+
72118
// Join the texts and clean up empty lines
73119
let text = texts?.join("\n");
74120
let cleaned_text = text
75121
.lines()
76122
.filter(|line| !line.trim().is_empty())
77123
.collect::<Vec<&str>>()
78124
.join("\n");
79-
125+
80126
Ok(cleaned_text)
81127
}
82128

@@ -90,7 +136,16 @@ mod tests {
90136
fn test_extract_text() {
91137
let temp_dir = TempDir::new("example").unwrap();
92138
let pdf_file = temp_dir.path().join("test.pdf");
93-
let processor = PdfProcessor::new(128, 0, OcrConfig { use_ocr: false, tesseract_path: None }).unwrap();
139+
let processor = PdfProcessor::new(
140+
128,
141+
0,
142+
OcrConfig {
143+
use_ocr: false,
144+
tesseract_path: None,
145+
},
146+
PdfBackend::MuPdf,
147+
)
148+
.unwrap();
94149

95150
File::create(pdf_file).unwrap();
96151

python/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ crate-type = ["cdylib"]
99

1010
[dependencies]
1111
embed_anything = {path = "../rust", features = ["ort"]}
12+
processors = {path = "../processors"}
1213
pyo3 = { version = "0.23.2"}
1314
tokio = { version = "1.39.0", features = ["rt-multi-thread"]}
1415
strum = {workspace = true}

python/python/embed_anything/_embed_anything.pyi

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,7 @@ class TextEmbedConfig:
502502
semantic_encoder: The semantic encoder for the Text Embedding model. Default is None.
503503
use_ocr: A flag indicating whether to use OCR for the Text Embedding model. Default is False.
504504
tesseract_path: The path to the Tesseract OCR executable. Default is None and uses the system path.
505+
pdf_backend: The backend to use for PDF text extraction. Options are `mupdf` and `lopdf`. Default is `lopdf`.
505506
"""
506507

507508
def __init__(
@@ -515,6 +516,7 @@ class TextEmbedConfig:
515516
semantic_encoder: EmbeddingModel | None = None,
516517
use_ocr: bool | None = False,
517518
tesseract_path: str | None = None,
519+
pdf_backend: str | None = "lopdf",
518520
):
519521
self.chunk_size = chunk_size
520522
self.overlap_ratio = overlap_ratio
@@ -525,6 +527,7 @@ class TextEmbedConfig:
525527
self.semantic_encoder = semantic_encoder
526528
self.use_ocr = use_ocr
527529
self.tesseract_path = tesseract_path
530+
self.pdf_backend = pdf_backend
528531
chunk_size: int | None
529532
overlap_ratio: float | None
530533
batch_size: int | None
@@ -534,6 +537,7 @@ class TextEmbedConfig:
534537
semantic_encoder: EmbeddingModel | None
535538
use_ocr: bool | None
536539
tesseract_path: str | None
540+
pdf_backend: str | None
537541

538542
class ImageEmbedConfig:
539543
"""
@@ -853,6 +857,7 @@ class WhichModel(Enum):
853857
Cohere = ("Cohere",)
854858
CohereVision = ("CohereVision",)
855859
Bert = ("Bert",)
860+
Model2Vec = ("Model2Vec",)
856861
Jina = ("Jina",)
857862
Clip = ("Clip",)
858863
Colpali = ("Colpali",)

python/src/config.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use crate::EmbeddingModel;
22
use embed_anything::config::SplittingStrategy;
33
use pyo3::prelude::*;
4+
use processors::pdf::pdf_processor::PdfBackend;
45

56
#[pyclass]
67
#[derive(Default)]
@@ -12,7 +13,7 @@ pub struct TextEmbedConfig {
1213
#[pymethods]
1314
impl TextEmbedConfig {
1415
#[new]
15-
#[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None))]
16+
#[pyo3(signature = (chunk_size=None, batch_size=None, late_chunking=None, buffer_size=None, overlap_ratio=None, splitting_strategy=None, semantic_encoder=None, use_ocr=None, tesseract_path=None, pdf_backend=None))]
1617
pub fn new(
1718
chunk_size: Option<usize>,
1819
batch_size: Option<usize>,
@@ -23,7 +24,19 @@ impl TextEmbedConfig {
2324
semantic_encoder: Option<&EmbeddingModel>,
2425
use_ocr: Option<bool>,
2526
tesseract_path: Option<&str>,
27+
pdf_backend: Option<&str>,
2628
) -> Self {
29+
let pdf_backend = match pdf_backend {
30+
Some(backend) => {
31+
match backend {
32+
"mupdf" => PdfBackend::MuPdf,
33+
"lopdf" => PdfBackend::LoPdf,
34+
_ => panic!("Unknown PDF backend provided!"),
35+
}
36+
}
37+
None => PdfBackend::LoPdf,
38+
};
39+
2740
let strategy = match splitting_strategy {
2841
Some(strategy) => {
2942
match strategy {
@@ -49,7 +62,8 @@ impl TextEmbedConfig {
4962
.with_buffer_size(buffer_size.unwrap_or(100))
5063
.with_splitting_strategy(strategy)
5164
.with_late_chunking(late_chunking.unwrap_or(false))
52-
.with_ocr(use_ocr.unwrap_or(false), tesseract_path),
65+
.with_ocr(use_ocr.unwrap_or(false), tesseract_path)
66+
.with_pdf_backend(pdf_backend),
5367
}
5468
}
5569

python/src/lib.rs

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,14 @@
11
pub mod config;
22
pub mod models;
33
use embed_anything::embeddings::embed::{TextEmbedder, VisionEmbedder};
4-
use embed_anything::{self, config::TextEmbedConfig, emb_audio, embeddings::embed::{Embedder, EmbeddingResult}, file_processor::audio::audio_processor, FileLoadingError};
4+
use embed_anything::{
5+
self,
6+
config::TextEmbedConfig,
7+
emb_audio,
8+
embeddings::embed::{Embedder, EmbeddingResult},
9+
file_processor::audio::audio_processor,
10+
FileLoadingError,
11+
};
512
use models::colbert::ColbertModel;
613
use models::colpali::ColpaliModel;
714
use models::reranker::{DocumentRank, Dtype, Reranker, RerankerResult};
@@ -84,6 +91,7 @@ pub enum WhichModel {
8491
Cohere,
8592
CohereVision,
8693
Bert,
94+
Model2Vec,
8795
SparseBert,
8896
ColBert,
8997
Clip,
@@ -221,12 +229,25 @@ impl EmbeddingModel {
221229
embed_anything::embeddings::local::jina::JinaEmbedder::new(
222230
model_id, revision, token,
223231
)
224-
.unwrap(),
232+
.map_err(|e| PyValueError::new_err(e.to_string()))?,
225233
)));
226234
Ok(EmbeddingModel {
227235
inner: Arc::new(model),
228236
})
229237
}
238+
WhichModel::Model2Vec => {
239+
let model_id = model_id.unwrap_or("minishlab/potion-base-8M");
240+
let model = Embedder::Text(TextEmbedder::Model2Vec(Box::new(
241+
embed_anything::embeddings::local::model2vec::Model2VecEmbedder::new(
242+
model_id, token, None,
243+
)
244+
.map_err(|e| PyValueError::new_err(e.to_string()))?,
245+
)));
246+
247+
Ok(EmbeddingModel {
248+
inner: Arc::new(model),
249+
})
250+
}
230251
WhichModel::Colpali => {
231252
let model_id = model_id.unwrap_or("vidore/colpali-v1.2-merged");
232253
let model = Embedder::Vision(VisionEmbedder::ColPali(Box::new(

rust/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ indicatif = "0.17.8"
6868
statistical = "1.0.0"
6969
half = "2.4.1"
7070
candle-flash-attn = { workspace = true, optional = true }
71+
model2vec-rs = "0.1.0"
7172

7273
# Logging
7374
log = "0.4"
@@ -98,4 +99,4 @@ rustls-tls = [
9899
"reqwest/rustls-tls",
99100
"hf-hub/rustls-tls",
100101
"tokenizers/rustls-tls"
101-
]
102+
]

0 commit comments

Comments
 (0)