Skip to content

Commit ad8c7b7

Browse files
authored
Allow building the lib for more flavors (#200)
The library has limited support for embedded environments, such as iOS or Mac Catalyst, due to the huge dependency graph that hf-hub brings in. This change makes it possible to strip the hf-hub dependency. The current use of hf-hub is preserved as a default feature. Tested to work with: * `cargo build --release --target aarch64-apple-ios --no-default-features` * `cargo build --release --target aarch64-apple-ios-macabi --no-default-features`
1 parent a08c29f commit ad8c7b7

File tree

4 files changed

+34
-3
lines changed

4 files changed

+34
-3
lines changed

Cargo.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,17 @@ regex-automata = "0.4.9"
2424
version = "=0.4.1"
2525
features = ["ureq", "rustls-tls"]
2626
default-features = false
27+
optional = true
2728

2829
[dependencies.tokenizers]
2930
version = "=0.21.1"
30-
features = ["http", "rustls-tls"]
31+
features = ["onig"]
32+
default-features = false
3133

3234
[features]
35+
default = ["hugginface-hub"]
3336
python-bindings = ["pyo3", "serde-pyobject"]
37+
hugginface-hub = ["hf-hub", "tokenizers/http", "tokenizers/rustls-tls"]
3438

3539
[lib]
3640
name = "outlines_core"

src/prelude.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! Library's interface essentials.
22
3+
#[cfg(feature = "hugginface-hub")]
34
pub use tokenizers::FromPretrainedParameters;
45

56
pub use super::index::Index;

src/python_bindings/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use pyo3::prelude::*;
99
use pyo3::types::{PyAny, PyDict};
1010
use pyo3::wrap_pyfunction;
1111
use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet};
12+
#[cfg(feature = "hugginface-hub")]
1213
use tokenizers::FromPretrainedParameters;
1314

1415
use crate::index::Index;
@@ -352,6 +353,7 @@ impl PyVocabulary {
352353
/// Creates the vocabulary of a pre-trained model.
353354
#[staticmethod]
354355
#[pyo3(signature = (model, revision=None, token=None))]
356+
#[cfg(feature = "hugginface-hub")]
355357
fn from_pretrained(
356358
model: String,
357359
revision: Option<String>,

src/vocabulary/mod.rs

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,31 @@
11
//! Creates `Vocabulary` manually or from pretrained large language model.
22
33
use bincode::{Decode, Encode};
4+
#[cfg(feature = "hugginface-hub")]
45
use locator::{HFLocator, Locator};
6+
#[cfg(feature = "hugginface-hub")]
57
use processor::TokenProcessor;
68
use rustc_hash::FxHashMap as HashMap;
9+
#[cfg(feature = "hugginface-hub")]
710
use tokenizers::normalizers::Sequence;
11+
#[cfg(feature = "hugginface-hub")]
812
use tokenizers::{NormalizerWrapper, Tokenizer};
913

1014
use crate::prelude::*;
1115
use crate::{Error, Result};
1216

17+
#[cfg(feature = "hugginface-hub")]
1318
mod locator;
19+
#[cfg(feature = "hugginface-hub")]
1420
mod processor;
1521

1622
/// `Vocabulary` of large language model.
1723
///
1824
/// ## Examples
1925
///
26+
#[cfg_attr(
27+
feature = "hugginface-hub",
28+
doc = r##"
2029
/// ### Create a vocabulary from a pretrained model.
2130
/// ```rust
2231
/// use outlines_core::prelude::*;
@@ -51,6 +60,8 @@ mod processor;
5160
/// vocabulary.remove("token");
5261
/// assert_eq!(vocabulary.token_ids("token"), None);
5362
/// ```
63+
"##
64+
)]
5465
#[derive(Clone, Debug, Default, PartialEq, Encode, Decode)]
5566
pub struct Vocabulary {
5667
eos_token_id: TokenId,
@@ -67,6 +78,7 @@ impl Vocabulary {
6778
}
6879

6980
/// Creates the vocabulary of pre-trained model from Hugging Face Hub.
81+
#[cfg(feature = "hugginface-hub")]
7082
pub fn from_pretrained(
7183
model: &str,
7284
parameters: Option<FromPretrainedParameters>,
@@ -76,6 +88,7 @@ impl Vocabulary {
7688

7789
#[doc(hidden)]
7890
#[inline(always)]
91+
#[cfg(feature = "hugginface-hub")]
7992
fn from_pretrained_with_locator<L: Locator>(
8093
model: &str,
8194
parameters: Option<FromPretrainedParameters>,
@@ -158,6 +171,7 @@ impl Vocabulary {
158171
}
159172

160173
/// Filters out `Prepend` kind of tokenizer's normalizers.
174+
#[cfg(feature = "hugginface-hub")]
161175
fn filter_prepend_normalizers(tokenizer: &mut Tokenizer) {
162176
// Main concern is prepend normalizers, for example https://github.com/google/sentencepiece
163177
// In `sentencepiece` tokenizer, `▁` is used to denote spaces in the source text,
@@ -248,8 +262,6 @@ impl TryFrom<(TokenId, HashMap<String, Vec<TokenId>>)> for Vocabulary {
248262

249263
#[cfg(test)]
250264
mod tests {
251-
use rustc_hash::FxHashSet as HashSet;
252-
253265
use super::*;
254266

255267
#[test]
@@ -305,6 +317,7 @@ mod tests {
305317
assert!(vocabulary.tokens.is_empty());
306318
}
307319

320+
#[cfg(feature = "hugginface-hub")]
308321
#[test]
309322
fn supported_pretrained_models() {
310323
// Support is expected for these:
@@ -332,6 +345,7 @@ mod tests {
332345
}
333346
}
334347

348+
#[cfg(feature = "hugginface-hub")]
335349
#[test]
336350
fn pretrained_from_gpt2() {
337351
let model = "openai-community/gpt2";
@@ -363,8 +377,11 @@ mod tests {
363377
}
364378
}
365379

380+
#[cfg(feature = "hugginface-hub")]
366381
#[test]
367382
fn pretrained_from_llama() {
383+
use rustc_hash::FxHashSet as HashSet;
384+
368385
let model = "hf-internal-testing/llama-tokenizer";
369386
let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
370387
let vocabulary = Vocabulary::from_pretrained(model, None).expect("Vocabulary failed");
@@ -405,6 +422,7 @@ mod tests {
405422
}
406423
}
407424

425+
#[cfg(feature = "hugginface-hub")]
408426
#[test]
409427
fn token_processor_error() {
410428
let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -419,6 +437,7 @@ mod tests {
419437
}
420438
}
421439

440+
#[cfg(feature = "hugginface-hub")]
422441
#[test]
423442
fn tokenizer_error() {
424443
let model = "hf-internal-testing/some-non-existent-model";
@@ -430,7 +449,9 @@ mod tests {
430449
}
431450
}
432451

452+
#[cfg(feature = "hugginface-hub")]
433453
struct NoneLocator;
454+
#[cfg(feature = "hugginface-hub")]
434455
impl Locator for NoneLocator {
435456
fn locate_eos_token_id(
436457
_model: &str,
@@ -441,6 +462,7 @@ mod tests {
441462
}
442463
}
443464

465+
#[cfg(feature = "hugginface-hub")]
444466
#[test]
445467
fn unable_to_locate_eos_token_id_error() {
446468
let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -456,6 +478,7 @@ mod tests {
456478
}
457479

458480
#[test]
481+
#[cfg(feature = "hugginface-hub")]
459482
fn prepend_normalizers_filtered_out() {
460483
use tokenizers::normalizers::{Prepend, Sequence};
461484

@@ -488,6 +511,7 @@ mod tests {
488511
}
489512

490513
#[test]
514+
#[cfg(feature = "hugginface-hub")]
491515
fn other_normalizers_being_kept() {
492516
use tokenizers::normalizers::BertNormalizer;
493517

0 commit comments

Comments
 (0)