@@ -1,22 +1,31 @@
 //! Creates `Vocabulary` manually or from a pretrained large language model.

 use bincode::{Decode, Encode};
+#[cfg(feature = "hugginface-hub")]
 use locator::{HFLocator, Locator};
+#[cfg(feature = "hugginface-hub")]
 use processor::TokenProcessor;
 use rustc_hash::FxHashMap as HashMap;
+#[cfg(feature = "hugginface-hub")]
 use tokenizers::normalizers::Sequence;
+#[cfg(feature = "hugginface-hub")]
 use tokenizers::{NormalizerWrapper, Tokenizer};

 use crate::prelude::*;
 use crate::{Error, Result};

+#[cfg(feature = "hugginface-hub")]
 mod locator;
+#[cfg(feature = "hugginface-hub")]
 mod processor;

 /// `Vocabulary` of a large language model.
 ///
 /// ## Examples
 ///
+#[cfg_attr(
+    feature = "hugginface-hub",
+    doc = r##"
 /// ### Create a vocabulary from a pretrained model.
 /// ```rust
 /// use outlines_core::prelude::*;
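This first hunk puts every Hub-dependent import and module behind `#[cfg(feature = "hugginface-hub")]` (the feature name's spelling is kept as-is from the crate). The attribute gates exactly the one item that follows it, so a build without the feature never compiles `locator`, `processor`, or the `tokenizers` imports. A minimal standalone sketch of the mechanism, assuming a Cargo.toml that declares the feature under `[features]`:

// Sketch only: a standalone crate that declares `hugginface-hub = []`
// under [features] in Cargo.toml (an assumption, mirroring this diff).

#[cfg(feature = "hugginface-hub")]
fn hub_only_helper() -> &'static str {
    "compiled with --features hugginface-hub"
}

fn main() {
    // Call sites need the same gate, or the build breaks when the
    // feature is off.
    #[cfg(feature = "hugginface-hub")]
    println!("{}", hub_only_helper());

    #[cfg(not(feature = "hugginface-hub"))]
    println!("built without the hugginface-hub feature");
}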
@@ -51,6 +60,8 @@ mod processor;
 /// vocabulary.remove("token");
 /// assert_eq!(vocabulary.token_ids("token"), None);
 /// ```
+"##
+)]
 #[derive(Clone, Debug, Default, PartialEq, Encode, Decode)]
 pub struct Vocabulary {
     eos_token_id: TokenId,
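The `cfg_attr(...)` opened in the previous hunk closes here: the pretrained-model doctest is injected into the rustdoc only when the feature is enabled, so a default `cargo test` never tries to compile an example that calls gated code. A generic sketch of the same pattern, on a hypothetical function:

/// Always-visible documentation for the item.
#[cfg_attr(
    feature = "hugginface-hub",
    doc = r##"
Feature-gated example, compiled and run as a doctest only when the
feature is enabled:

```rust
assert_eq!(2 + 2, 4);
```
"##
)]
pub fn documented() {}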
@@ -67,6 +78,7 @@ impl Vocabulary {
     }

     /// Creates the vocabulary of a pre-trained model from the Hugging Face Hub.
+    #[cfg(feature = "hugginface-hub")]
     pub fn from_pretrained(
         model: &str,
         parameters: Option<FromPretrainedParameters>,
@@ -76,6 +88,7 @@ impl Vocabulary {

     #[doc(hidden)]
     #[inline(always)]
+    #[cfg(feature = "hugginface-hub")]
     fn from_pretrained_with_locator<L: Locator>(
         model: &str,
         parameters: Option<FromPretrainedParameters>,
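Both constructors are gated: the public `from_pretrained` and the hidden `from_pretrained_with_locator`, which exists so tests can inject a custom `Locator` (see `NoneLocator` further down). A hedged usage sketch for the public one, assuming the feature is enabled, the prelude re-exports `Vocabulary`, the crate's `Error` implements `std::error::Error`, and the Hub is reachable:

use outlines_core::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Downloads tokenizer data from the Hugging Face Hub; the model
    // name is the one used by the gpt2 test below.
    let vocabulary = Vocabulary::from_pretrained("openai-community/gpt2", None)?;
    // `token_ids` returns the ids mapped to a token piece, if any.
    if let Some(ids) = vocabulary.token_ids("token") {
        println!("'token' maps to {} id(s)", ids.len());
    }
    Ok(())
}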
@@ -158,6 +171,7 @@ impl Vocabulary {
 }

 /// Filters out `Prepend` kind of tokenizer's normalizers.
+#[cfg(feature = "hugginface-hub")]
 fn filter_prepend_normalizers(tokenizer: &mut Tokenizer) {
     // Main concern is prepend normalizers, for example https://github.com/google/sentencepiece
     // In `sentencepiece` tokenizer, `▁` is used to denote spaces in the source text,
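`filter_prepend_normalizers` is gated too, since it only runs on tokenizers pulled from the Hub; SentencePiece-style tokenizers prepend `▁` to mark word boundaries, which the comment above flags as the main concern. As an illustration of the filtering idea only (not this function's actual body), a sketch over a flat list of normalizers, assuming a recent `tokenizers` version that has the `Prepend` normalizer:

use tokenizers::normalizers::{Prepend, Sequence};
use tokenizers::NormalizerWrapper;

// Keep every normalizer except `Prepend`, preserving order.
fn drop_prepend(normalizers: Vec<NormalizerWrapper>) -> Vec<NormalizerWrapper> {
    normalizers
        .into_iter()
        .filter(|n| !matches!(n, NormalizerWrapper::Prepend(_)))
        .collect()
}

fn main() {
    let kept = drop_prepend(vec![
        Prepend::new("▁".to_string()).into(),
        Sequence::new(vec![]).into(),
    ]);
    // Only the empty Sequence survives the filter.
    assert_eq!(kept.len(), 1);
}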
@@ -248,8 +262,6 @@ impl TryFrom<(TokenId, HashMap<String, Vec<TokenId>>)> for Vocabulary {

 #[cfg(test)]
 mod tests {
-    use rustc_hash::FxHashSet as HashSet;
-
     use super::*;

     #[test]
@@ -305,6 +317,7 @@ mod tests {
         assert!(vocabulary.tokens.is_empty());
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn supported_pretrained_models() {
         // Support is expected for these:
@@ -332,6 +345,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn pretrained_from_gpt2() {
         let model = "openai-community/gpt2";
@@ -363,8 +377,11 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn pretrained_from_llama() {
+        use rustc_hash::FxHashSet as HashSet;
+
         let model = "hf-internal-testing/llama-tokenizer";
         let tokenizer = Tokenizer::from_pretrained(model, None).expect("Tokenizer failed");
         let vocabulary = Vocabulary::from_pretrained(model, None).expect("Vocabulary failed");
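Note the import move completed by this hunk: `FxHashSet` was dropped from the shared test imports in the earlier hunk and now lives inside `pretrained_from_llama`, the only test that uses it, so builds without the feature carry no unused import.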
@@ -405,6 +422,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn token_processor_error() {
         let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -419,6 +437,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn tokenizer_error() {
         let model = "hf-internal-testing/some-non-existent-model";
@@ -430,7 +449,9 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     struct NoneLocator;
+    #[cfg(feature = "hugginface-hub")]
     impl Locator for NoneLocator {
         fn locate_eos_token_id(
             _model: &str,
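`NoneLocator` and its `Locator` impl get their own gates. It is the test double fed to `from_pretrained_with_locator`: a locator that never finds an EOS token id, which forces the error branch exercised by `unable_to_locate_eos_token_id_error` below. A simplified, hypothetical rendering of the idea (the real trait takes more parameters than shown here):

// Hypothetical, trimmed-down trait for illustration; like the diff's
// `locate_eos_token_id`, the function takes no `self` parameter.
trait Locator {
    fn locate_eos_token_id(model: &str) -> Option<u32>;
}

struct NoneLocator;

impl Locator for NoneLocator {
    // Deliberately finds nothing, so the caller's "unable to locate
    // eos token id" error path can be tested without a real model.
    fn locate_eos_token_id(_model: &str) -> Option<u32> {
        None
    }
}

fn main() {
    assert!(NoneLocator::locate_eos_token_id("any-model").is_none());
}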
@@ -441,6 +462,7 @@ mod tests {
         }
     }

+    #[cfg(feature = "hugginface-hub")]
     #[test]
     fn unable_to_locate_eos_token_id_error() {
         let model = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM";
@@ -456,6 +478,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "hugginface-hub")]
     fn prepend_normalizers_filtered_out() {
         use tokenizers::normalizers::{Prepend, Sequence};

@@ -488,6 +511,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "hugginface-hub")]
     fn other_normalizers_being_kept() {
         use tokenizers::normalizers::BertNormalizer;

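With every Hub-dependent item, doctest, and test behind the gate, a default `cargo build` or `cargo test` stays offline; `cargo test --features hugginface-hub` opts back into the full pretrained path.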