From 421edf1d296171cbe4e0106571748e938b1d3977 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Tue, 20 May 2025 21:51:47 +0000 Subject: [PATCH 1/5] Add documentation for expectations and standards related to model formats/configs --- docs/architecture/model_formats.md | 532 +++++++++++++++++++++++++++++ 1 file changed, 532 insertions(+) create mode 100644 docs/architecture/model_formats.md diff --git a/docs/architecture/model_formats.md b/docs/architecture/model_formats.md new file mode 100644 index 0000000..0229dd3 --- /dev/null +++ b/docs/architecture/model_formats.md @@ -0,0 +1,532 @@ +# Model Formats + +## What's Covered + +In this document, you will learn: + +1. The structure and organization of Speculators model formats +2. How to save, load, and share speculator models across the ecosystem +3. The design principles that guide Speculators' model format implementation + +Before reading this document, you should be familiar with the following concepts: + +- Basic understanding of machine learning model formats and serialization +- Familiarity with Hugging Face's model hub and Transformers library +- General knowledge of speculative decoding and its benefits + +## Overview + +Speculators employs a model format designed with ecosystem compatibility and user experience as top priorities. This format serves as a standard and bridge between various speculative decoding implementations and the broader machine learning ecosystem, particularly focusing on compatibility with Hugging Face's model hub, Speculators' own implementations built on top of Transformers, and other platforms like vLLM. With the Hugging Face format as a foundation, Speculators enables extensibility and usability from practioners consuming existing algorithms to researchers developing new ones. 
+ +The core components that make up the Speculators model format are the following, which are explained in detail in dedicated sections below: + +- Hugging Face compatible directory structure and file formats +- Speculators-specific extensions through the `speculators_config` key in the `config.json` file +- Standardized interfaces for saving, loading, and pushing models to the Hugging Face Hub + +Complete examples are provided at the end of this document for popular speculative decoding algorithms. + +## Components + +### Base Model Format + +The base model format for Speculators builds directly on the standard Hugging Face model format and represents the specific draft model, ensuring compatability with existing tools, libraries, and workflows. A typical Speculators model directory contains: + +``` +model_directory/ +├── config.json # Model configuration file +├── model.safetensors # Model weights in safetensors format +└── README.md # Optional documentation +``` + +Where `config.json` and `model.safetensors` define the draft model and its weights, respectively. This has the additional benefit where independent draft models are stored in the same format with the later speculators-specific extensions. + +#### `config.json` + +The `config.json` file represents specifics about the architecture, hyperparams, and configurations for the draft model. It extends Transformers PretrainedConfig class and serves as the central configuration for a Speculators model, containing all necessary metadata and parameters. Core required keys in the config file include: + +- `architectures`: A list containing the model architecture class for the Speculators draft model (e.g., "MLPDraftModel"), or the Hugging Face model class for independent draft models. +- `model_type`: A string identifying the Speculators draft model type (e.g., "mlp_draft_model"), or the Hugging Face model type for independent draft models. 
+- `torch_dtype`: The data type used for the model weights (e.g., "float32", "bfloat16", "float16") +- `transformers_version`: The version of Transformers used to create the model +- `speculators_version`: The version of Speculators used to create the model +- `inputs`: A list defining the expected inputs to the draft model in relation to the attachment points to the verifier (e.g., "input_ids", "layer.0") + +Additionally, the file contains implementation-specific keys that define the draft model's architecture and hyperparameters, such as hidden layer dimensions, activation functions, and other configuration parameters depending on the specific Drafter architecture. + +Example of a minimal `config.json`: + +```json +{ + "architectures": ["MLPDraftModel"], + "model_type": "mlp_draft_model", + "torch_dtype": "bfloat16", + "transformers_version": "4.35.0", + "speculators_version": "0.1.0", + "inputs": ["input_ids"], + "...": "..." +} +``` + +Future versions of the format may support additional keys for advanced features, such as quantization and compression methods, to further enhance the model's performance and usability. 
+ +#### `safetensors` Files + +Speculators adopts the `safetensors` format as the standard for storing the draft model's weights, providing multiple benefits: + +- **Security**: Unlike pickle-based formats, safetensors is designed to be secure against arbitrary code execution +- **Efficiency**: Optimized for fast loading times and memory mapping capabilities +- **Compatibility**: Widespread adoption across the Hugging Face ecosystem and related tools +- **Mmap Support**: Allows accessing tensor data without loading the entire model into memory + +A typical Speculators model includes one or more safetensors files containing the serialized weights of the draft model: + +``` +model_directory/ +├── model.safetensors # Single file format for smaller models +# OR +├── model-00001-of-00003.safetensors # Sharded format for larger models +├── model-00002-of-00003.safetensors +└── model-00003-of-00003.safetensors +``` + +Future versions of the format may support additional file formats, such as compressed tensors formats, built on top of the safetensors format, to further enhance model size and loading performance. + +### Speculators Format Extensions + +The Speculators specific extensions to the model format are centralized within the `config.json` file, specifically under the `speculators_config` key. This design choice allows Speculators to maintain compatibility with the Hugging Face model format while providing additional functionality tailored to speculative decoding. 
+ +The `speculators_config` dictionary contains subkeys that are broken apart by the core concepts for speculative decoding inference, with the following keys: + +- `algorithm`: The name of the speculative decoding algorithm used by the model +- `proposal_methods`: A list of dictionaries defining the supported token proposal strategies for the speculator +- `default_proposal_method`: The default token proposal method to use when generating tokens +- `verifier`: A dictionary defining the target verifier model for which the speculator was created + +#### Algorithm + +The `algorithm` field is a required string that specifies the speculative decoding algorithm used by the model. It serves as a primary identifier for the algorithm that the speculator model was created with and its intended to use. It additionally allows the Speculators library, and other tools, to automatically load the correct implementation and associated utilities when a model is loaded. + +This field must match one of the supported algorithms in the Speculators library, such as: + +- `"eagle"`, `"eagle_2"`, `"eagle_3"` - Eagle speculator variants based on Transformer architecture for the draft model +- `"hass"` - Similar to Eagle based on the Transformer architecture for the draft model +- `"mlp_speculator"` - Based on a multi-layer perceptron (MLP) architecture for the draft model +- `"specdec"` - An independent speculator model + +Example usage in a config: + +```json +"speculators_config": { + "algorithm": "eagle", + "...": "..." +} +``` + +#### Token Proposal Methods + +The `proposal_methods` field is a required list of dictionaries that defines the supported token proposal strategies for a speculator. This field works alongside the `default_proposal_method` key, which specifies which method to use by default. + +Token proposal methods determine how a speculator generates candidate tokens and how these tokens are verified with the verifier model. 
By supporting multiple potential methods and enabling the algorithm to define what is intended to work, they provide adaptability to different use cases and performance requirements. The method specific parameters are intended as the best defaults to ship the model with, but implementations may optionally allow users to override these parameters at runtime. + +Each dictionary in the `proposal_methods` list must contain: + +- A `proposal_type` key identifying the method (e.g., "greedy", "sample", "tree") +- Method-specific configuration parameters that control the behavior of the proposal strategy + +Some common proposal methods include: + +- `"greedy"`: Deterministic decoding that selects the highest probability token at each step +- `"sample"`: Samples from the top-p probability mass of the predicted tokens at each step +- `"tree"`: Generates a tree of possible tokens for more likely matches at each step + +Example usage in a config: + +```json +"proposal_methods": [ + { + "proposal_type": "greedy", + "...": "..." + }, + { + "proposal_type": "sample", + "...": "..." + } +], +"default_proposal_method": "greedy" +``` + +#### Verifier + +The `verifier` field is a required dictionary that defines the target verifier model for which the draft model was created for. The dictionary serves two primary purposes: + +1. Enabling automatic loading of the associated verifier model if one is not provided at runtime +2. 
Providing validation parameters to ensure compatibility when using different verifiers with a trained draft model + +There are a number of required and optional keys within the `verifier` dict enabling the above functionality: + +- Keys for automatic loading + - `name_or_path`: The Hugging Face model ID or local path to automatically load the verifier +- Keys for model architecture compatability validation + - `architectures`: List of model architecture classes for compatibility validation + - `hidden_size`: Hidden dimension size used for compatibility checks + - `intermediate_size`: Intermediate dimension size for compatibility validation + - Additional model configuration parameters from the original verifier model +- Keys for tokenizer compatability validation + - `vocab_size`: Size of the vocabulary used by the tokenizer + - `max_position_embeddings`: Maximum position embeddings supported + - `bos_token_id`: Beginning of sequence token ID + - `eos_token_id`: End of sequence token ID + - Additional tokenizer configuration parameters from the original verifier model + +Example of a verifier configuration: + +```json +"verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ] +} +``` + +### Interfaces + +The Speculators library provides a set of standardized interfaces for saving, loading, and pushing models to the Hugging Face Hub. These interfaces build on top of and maintain compatibility with the core interfaces in both PyTorch and the Transformers library, ensuring a familiar experience for users of these ecosystems. 
+ +#### Speculator Class + +The core `Speculator` class that implements speculative decoding algorithms extends PyTorch's `nn.Module` and Transformers' `PushToHubMixin`, providing a consistent interface for model operations: + +- `from_pretrained()`: Loads a pretrained speculator model from a local directory or the Hugging Face Hub +- `save_pretrained()`: Saves a speculator model to a local directory with optional uploading to the Hugging Face Hub +- `push_to_hub()`: Directly pushes a model to the Hugging Face Hub for sharing with the community + +For detailed usage and examples, refer to the [entrypoints documentation](./entrypoints.md). + +#### SpeculatorConfig Class + +The `SpeculatorConfig` class extends the Transformers `PretrainedConfig` class, providing compatibile APIs with the following methods: + +- `from_pretrained()`: Loads a speculator configuration from a local directory or the Hugging Face Hub +- `save_pretrained()`: Saves a speculator configuration to a local directory +- `push_to_hub()`: Directly pushes a configuration to the Hugging Face Hub for sharing with the community + +For detailed usage and examples, refer to the [entrypoints documentation](./entrypoints.md). + +## Examples + +This section provides example `config.json` files for popular speculative decoding algorithms. 
+ +### Eagle + +```json +{ + "architectures": ["TransformerDraftModel"], + "model_type": "transformer_draft_model", + "torch_dtype": "bfloat16", + "transformers_version": "X.X.X", + "speculators_version": "X.X.X", + "inputs": ["input_ids", "TODO"], + "extra_args": "TODO", + "speculators_config": { + "algorithm": "eagle", + "proposal_methods": [ + { + "proposal_type": "greedy", + "max_new_tokens": 5, + }, + { + "proposal_type": "sample", + "max_new_tokens": 5, + "temperature": 0.8, + "top_p": 0.5 + }, + { + "proposal_type": "tree", + "extra_args": "TODO" + } + ], + "default_proposal_method": "tree", + "verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ] + } + } +} +``` + +### Eagle-2 + +```json +{ + "architectures": ["TransformerDraftModel"], + "model_type": "transformer_draft_model", + "torch_dtype": "bfloat16", + "transformers_version": "X.X.X", + "speculators_version": "X.X.X", + "inputs": ["input_ids", "TODO"], + "extra_args": "TODO", + "speculators_config": { + "algorithm": "eagle_2", + "proposal_methods": [ + { + "proposal_type": "greedy", + "max_new_tokens": 5, + }, + { + "proposal_type": "sample", + "max_new_tokens": 5, + "temperature": 0.8, + "top_p": 0.5 + }, + { + "proposal_type": "tree", + "extra_args": "TODO" + } + ], + "default_proposal_method": "tree", + "verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ] + } + } +} +``` + +### Eagle-3 + +```json +{ + "architectures": ["TransformerDraftModel"], + "model_type": "transformer_draft_model", + "torch_dtype": 
"bfloat16", + "transformers_version": "X.X.X", + "speculators_version": "X.X.X", + "inputs": ["input_ids", "TODO"], + "extra_args": "TODO", + "speculators_config": { + "algorithm": "eagle_3", + "proposal_methods": [ + { + "proposal_type": "greedy", + "max_new_tokens": 5, + }, + { + "proposal_type": "sample", + "max_new_tokens": 5, + "temperature": 0.8, + "top_p": 0.5 + }, + { + "proposal_type": "tree", + "extra_args": "TODO" + } + ], + "default_proposal_method": "tree", + "verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ] + } + } +} +``` + +### HASS + +```json +{ + "architectures": ["TransformerDraftModel"], + "model_type": "transformer_draft_model", + "torch_dtype": "bfloat16", + "transformers_version": "X.X.X", + "speculators_version": "X.X.X", + "inputs": ["input_ids", "TODO"], + "extra_args": "TODO", + "speculators_config": { + "algorithm": "hass", + "proposal_methods": [ + { + "proposal_type": "greedy", + "max_new_tokens": 5, + }, + { + "proposal_type": "sample", + "max_new_tokens": 5, + "temperature": 0.8, + "top_p": 0.5 + }, + { + "proposal_type": "tree", + "extra_args": "TODO" + } + ], + "default_proposal_method": "tree", + "verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ] + } + } +} +``` + +### MLP Speculator + +```json +{ + "architectures": ["MLPDraftModel"], + "model_type": "mlp_draft_model", + "torch_dtype": "bfloat16", + "transformers_version": "X.X.X", + "speculators_version": "X.X.X", + "inputs": ["input_ids", "TODO"], + "extra_args": "TODO", + 
"speculators_config": { + "algorithm": "mlp_speculator", + "proposal_methods": [ + { + "proposal_type": "greedy", + "max_new_tokens": 5, + }, + { + "proposal_type": "sample", + "max_new_tokens": 5, + "temperature": 0.8, + "top_p": 0.5 + } + ], + "default_proposal_method": "sample", + "verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ] + } + } +} +``` + +### SpecDec + +```json +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.45.0.dev0", + "use_cache": true, + "vocab_size": 128256, + "speculators_version": "X.X.X", + "speculators_config": { + "algorithm": "specdec", + "proposal_methods": [ + { + "proposal_type": "greedy", + "max_new_tokens": 5, + } + ], + "default_proposal_method": "greedy", + "verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 
128009 + ] + } + }, +} +``` From 5859cd1f08bfff3f11752ce8d1c81c96d53b0f5f Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Tue, 20 May 2025 22:00:39 +0000 Subject: [PATCH 2/5] updates for grammar and wording --- docs/architecture/model_formats.md | 52 +++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/docs/architecture/model_formats.md b/docs/architecture/model_formats.md index 0229dd3..d137072 100644 --- a/docs/architecture/model_formats.md +++ b/docs/architecture/model_formats.md @@ -4,7 +4,7 @@ In this document, you will learn: -1. The structure and organization of Speculators model formats +1. The structure and organization of the Speculators model formats 2. How to save, load, and share speculator models across the ecosystem 3. The design principles that guide Speculators' model format implementation @@ -16,21 +16,21 @@ Before reading this document, you should be familiar with the following concepts ## Overview -Speculators employs a model format designed with ecosystem compatibility and user experience as top priorities. This format serves as a standard and bridge between various speculative decoding implementations and the broader machine learning ecosystem, particularly focusing on compatibility with Hugging Face's model hub, Speculators' own implementations built on top of Transformers, and other platforms like vLLM. With the Hugging Face format as a foundation, Speculators enables extensibility and usability from practioners consuming existing algorithms to researchers developing new ones. +Speculators employs a model format designed with ecosystem compatibility and user experience as top priorities. This format serves as a standard and bridge between various speculative decoding implementations and the broader machine learning ecosystem, particularly focusing on compatibility with Hugging Face's model hub, Speculators' implementations built on top of Transformers, and other platforms like vLLM. 
With the Hugging Face format as a foundation, Speculators enables extensibility and usability for practitioners consuming existing algorithms and researchers developing new ones. -The core components that make up the Speculators model format are the following, which are explained in detail in dedicated sections below: +The core components that make up the Speculators model format are the following: - Hugging Face compatible directory structure and file formats - Speculators-specific extensions through the `speculators_config` key in the `config.json` file - Standardized interfaces for saving, loading, and pushing models to the Hugging Face Hub -Complete examples are provided at the end of this document for popular speculative decoding algorithms. +At the end of this document, examples of configuration files for popular algorithms are provided. ## Components ### Base Model Format -The base model format for Speculators builds directly on the standard Hugging Face model format and represents the specific draft model, ensuring compatability with existing tools, libraries, and workflows. A typical Speculators model directory contains: +The base model format for Speculators builds directly on the standard Hugging Face model format and represents the specific draft model, ensuring compatibility with existing tools, libraries, and workflows. A typical Speculators model directory contains: ``` model_directory/ @@ -39,18 +39,18 @@ model_directory/ └── README.md # Optional documentation ``` -Where `config.json` and `model.safetensors` define the draft model and its weights, respectively. This has the additional benefit where independent draft models are stored in the same format with the later speculators-specific extensions. +Where `config.json` and `model.safetensors` define the draft model and its weights, respectively. 
Standardizing the `config.json` file as the base for the draft model has the additional benefit that independent draft models are stored in the same format and need only the speculators-specific additions. #### `config.json` -The `config.json` file represents specifics about the architecture, hyperparams, and configurations for the draft model. It extends Transformers PretrainedConfig class and serves as the central configuration for a Speculators model, containing all necessary metadata and parameters. Core required keys in the config file include: +The `config.json` file represents specifics about the draft model's architecture, hyperparameters, and configurations. It extends the Transformers PretrainedConfig class and serves as the central configuration for a Speculators model, containing all necessary metadata and parameters. Core required keys in the config file include: - `architectures`: A list containing the model architecture class for the Speculators draft model (e.g., "MLPDraftModel"), or the Hugging Face model class for independent draft models. - `model_type`: A string identifying the Speculators draft model type (e.g., "mlp_draft_model"), or the Hugging Face model type for independent draft models.
- `torch_dtype`: The data type used for the model weights (e.g., "float32", "bfloat16", "float16") - `transformers_version`: The version of Transformers used to create the model - `speculators_version`: The version of Speculators used to create the model -- `inputs`: A list defining the expected inputs to the draft model in relation to the attachment points to the verifier (e.g., "input_ids", "layer.0") +- `inputs`: A list defining the expected inputs to the draft model about the attachment points to the verifier (e.g., "input_ids", "layer.0") Additionally, the file contains implementation-specific keys that define the draft model's architecture and hyperparameters, such as hidden layer dimensions, activation functions, and other configuration parameters depending on the specific Drafter architecture. @@ -68,13 +68,13 @@ Example of a minimal `config.json`: } ``` -Future versions of the format may support additional keys for advanced features, such as quantization and compression methods, to further enhance the model's performance and usability. +Future format versions may support additional keys for advanced features, such as quantization and compression methods, to further enhance the model's performance and usability. 
#### `safetensors` Files Speculators adopts the `safetensors` format as the standard for storing the draft model's weights, providing multiple benefits: -- **Security**: Unlike pickle-based formats, safetensors is designed to be secure against arbitrary code execution +- **Security**: Unlike pickle-based formats, safetensors is secure against arbitrary code execution - **Efficiency**: Optimized for fast loading times and memory mapping capabilities - **Compatibility**: Widespread adoption across the Hugging Face ecosystem and related tools - **Mmap Support**: Allows accessing tensor data without loading the entire model into memory @@ -90,7 +90,7 @@ model_directory/ └── model-00003-of-00003.safetensors ``` -Future versions of the format may support additional file formats, such as compressed tensors formats, built on top of the safetensors format, to further enhance model size and loading performance. +Future versions of the format may support additional file formats, such as compressed tensors formats, built on top of the safetensors format, to enhance model size and loading performance further. ### Speculators Format Extensions @@ -101,11 +101,11 @@ The `speculators_config` dictionary contains subkeys that are broken apart by th - `algorithm`: The name of the speculative decoding algorithm used by the model - `proposal_methods`: A list of dictionaries defining the supported token proposal strategies for the speculator - `default_proposal_method`: The default token proposal method to use when generating tokens -- `verifier`: A dictionary defining the target verifier model for which the speculator was created +- `verifier`: A dictionary defining the target verifier model the speculator was created for #### Algorithm -The `algorithm` field is a required string that specifies the speculative decoding algorithm used by the model. It serves as a primary identifier for the algorithm that the speculator model was created with and its intended to use.
It additionally allows the Speculators library, and other tools, to automatically load the correct implementation and associated utilities when a model is loaded. +The `algorithm` field is a required string that specifies the speculative decoding algorithm used by the model. It serves as a primary identifier for the algorithm with which the speculator model was created and intended to be used. It additionally allows the Speculators library and other tools to automatically load the correct implementation and associated utilities when a model is loaded. This field must match one of the supported algorithms in the Speculators library, such as: @@ -125,9 +125,9 @@ Example usage in a config: #### Token Proposal Methods -The `proposal_methods` field is a required list of dictionaries that defines the supported token proposal strategies for a speculator. This field works alongside the `default_proposal_method` key, which specifies which method to use by default. +The `proposal_methods` field is a required list of dictionaries defining a speculator's supported token proposal strategies. This field works alongside the `default_proposal_method` key, which specifies which method to use by default. -Token proposal methods determine how a speculator generates candidate tokens and how these tokens are verified with the verifier model. By supporting multiple potential methods and enabling the algorithm to define what is intended to work, they provide adaptability to different use cases and performance requirements. The method specific parameters are intended as the best defaults to ship the model with, but implementations may optionally allow users to override these parameters at runtime. +Token proposal methods determine how a speculator generates candidate tokens and how these tokens are verified with the verifier model. 
By supporting multiple potential methods and enabling the algorithm to define what should work, they provide adaptability to different use cases and performance requirements. The method-specific parameters are the best defaults to ship the model, but implementations may optionally allow users to override these parameters at runtime. Each dictionary in the `proposal_methods` list must contain: @@ -144,35 +144,35 @@ Example usage in a config: ```json "proposal_methods": [ - { + { "proposal_type": "greedy", "...": "..." - }, - { + }, + { "proposal_type": "sample", "...": "..." - } + } ], "default_proposal_method": "greedy" ``` #### Verifier -The `verifier` field is a required dictionary that defines the target verifier model for which the draft model was created for. The dictionary serves two primary purposes: +The `verifier` field is a required dictionary defining the target verifier model for which the draft model was created. The dictionary serves two primary purposes: 1. Enabling automatic loading of the associated verifier model if one is not provided at runtime 2. 
Providing validation parameters to ensure compatibility when using different verifiers with a trained draft model -There are a number of required and optional keys within the `verifier` dict enabling the above functionality: +There are several required and optional keys within the `verifier` dict, enabling the above functionality: - Keys for automatic loading - `name_or_path`: The Hugging Face model ID or local path to automatically load the verifier -- Keys for model architecture compatability validation +- Keys for model architecture compatibility validation - `architectures`: List of model architecture classes for compatibility validation - `hidden_size`: Hidden dimension size used for compatibility checks - `intermediate_size`: Intermediate dimension size for compatibility validation - Additional model configuration parameters from the original verifier model -- Keys for tokenizer compatability validation +- Keys for tokenizer compatibility validation - `vocab_size`: Size of the vocabulary used by the tokenizer - `max_position_embeddings`: Maximum position embeddings supported - `bos_token_id`: Beginning of sequence token ID @@ -194,7 +194,7 @@ Example of a verifier configuration: 128001, 128008, 128009 - ] + ] } ``` @@ -214,7 +214,7 @@ For detailed usage and examples, refer to the [entrypoints documentation](./entr #### SpeculatorConfig Class -The `SpeculatorConfig` class extends the Transformers `PretrainedConfig` class, providing compatibile APIs with the following methods: +The `SpeculatorConfig` class extends the Transformers `PretrainedConfig` class, providing compatible APIs with the following methods: - `from_pretrained()`: Loads a speculator configuration from a local directory or the Hugging Face Hub - `save_pretrained()`: Saves a speculator configuration to a local directory @@ -224,7 +224,7 @@ For detailed usage and examples, refer to the [entrypoints documentation](./entr ## Examples -This section provides example `config.json` files for popular 
speculative decoding algorithms. +This section provides examples of `config.json` files for popular speculative decoding algorithms. ### Eagle From 3bb1fe111c8e9786b6650dbe97805135fb6ddd41 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Tue, 20 May 2025 22:06:48 +0000 Subject: [PATCH 3/5] minor fixes for json samples spacing --- docs/architecture/model_formats.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/architecture/model_formats.md b/docs/architecture/model_formats.md index d137072..888e30a 100644 --- a/docs/architecture/model_formats.md +++ b/docs/architecture/model_formats.md @@ -144,14 +144,14 @@ Example usage in a config: ```json "proposal_methods": [ - { + { "proposal_type": "greedy", "...": "..." - }, - { + }, + { "proposal_type": "sample", "...": "..." - } + } ], "default_proposal_method": "greedy" ``` @@ -194,7 +194,7 @@ Example of a verifier configuration: 128001, 128008, 128009 - ] + ] } ``` From 9d876758ac6f4ab6e9496cacb698843b8e2e1bb4 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Tue, 20 May 2025 22:22:59 +0000 Subject: [PATCH 4/5] Add related resources at the end of the doc --- docs/architecture/model_formats.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/architecture/model_formats.md b/docs/architecture/model_formats.md index 888e30a..300a8b6 100644 --- a/docs/architecture/model_formats.md +++ b/docs/architecture/model_formats.md @@ -530,3 +530,15 @@ This section provides examples of `config.json` files for popular speculative de }, } ``` + +## Related Resources + +Related Docs: + +- [Entrypoints Overview](./entrypoints.md) - This doc provides detailed information about saving and loading speculators. +- [Architecture Overview](./architecture.md) - This doc provides detailed information about the Speculators architecture that powers the entry points. 
+ +Related External Resources: + +- [Speculative Decoding Overview](https://arxiv.org/abs/2401.07851) - A general-purpose survey paper on speculative decoding, its benefits, and its applications. +- [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) - Documentation for the Transformers library, which Speculators integrates with. From 287c10352368b5a5f7b5666e58a5a47522e92862 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Wed, 21 May 2025 18:53:35 +0000 Subject: [PATCH 5/5] Update configs and examples with expected arguments and values --- docs/architecture/model_formats.md | 268 ++++++++++++++++++++++------ 1 file changed, 207 insertions(+), 61 deletions(-) diff --git a/docs/architecture/model_formats.md b/docs/architecture/model_formats.md index 300a8b6..7fa5fe0 100644 --- a/docs/architecture/model_formats.md +++ b/docs/architecture/model_formats.md @@ -50,7 +50,7 @@ The `config.json` file represents specifics about the draft model's architecture - `torch_dtype`: The data type used for the model weights (e.g., "float32", "bfloat16", "float16") - `transformers_version`: The version of Transformers used to create the model - `speculators_version`: The version of Speculators used to create the model -- `inputs`: A list defining the expected inputs to the draft model about the attachment points to the verifier (e.g., "input_ids", "layer.0") +- `inputs`: A list defining the expected inputs to the draft model in relation to the attachment points on the verifier (e.g., "input_ids", "input_embeddings", "layer.0") Additionally, the file contains implementation-specific keys that define the draft model's architecture and hyperparameters, such as hidden layer dimensions, activation functions, and other configuration parameters depending on the specific Drafter architecture.
@@ -63,7 +63,7 @@ Example of a minimal `config.json`: "torch_dtype": "bfloat16", "transformers_version": "4.35.0", "speculators_version": "0.1.0", - "inputs": ["input_ids"], + "inputs": ["input_embeddings"], "...": "..." } ``` @@ -117,9 +117,11 @@ This field must match one of the supported algorithms in the Speculators library Example usage in a config: ```json -"speculators_config": { - "algorithm": "eagle", - "...": "..." +{ + "speculators_config": { + "algorithm": "eagle", + "...": "..." + } } ``` @@ -143,17 +145,28 @@ Some common proposal methods include: Example usage in a config: ```json -"proposal_methods": [ - { - "proposal_type": "greedy", - "...": "..." - }, - { - "proposal_type": "sample", - "...": "..." - } -], -"default_proposal_method": "greedy" +{ + "proposal_methods": [ + { + "proposal_type": "greedy", + "draft_tokens": 5 + }, + { + "proposal_type": "sample", + "draft_tokens": 5, + "temperature": 0.8, + "top_p": 0.5 + }, + { + "proposal_type": "tree", + "tree_type": "static", + "initial_branching_factor": 4, + "branching_factor": 2, + "draft_tokens": 5 + } + ], + "default_proposal_method": "greedy" +} ``` #### Verifier @@ -182,19 +195,21 @@ There are several required and optional keys within the `verifier` dict, enablin Example of a verifier configuration: ```json -"verifier": { - "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", - "architectures": ["LlamaForCausalLM"], - "hidden_size": 4096, - "intermediate_size": 14336, - "vocab_size": 128256, - "max_position_embeddings": 131072, - "bos_token_id": 128000, - "eos_token_id": [ - 128001, - 128008, - 128009 - ] +{ + "verifier": { + "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": ["LlamaForCausalLM"], + "hidden_size": 4096, + "intermediate_size": 14336, + "vocab_size": 128256, + "max_position_embeddings": 131072, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ] + } } ``` @@ -235,24 +250,57 @@ This section provides examples of `config.json` files 
for popular speculative de "torch_dtype": "bfloat16", "transformers_version": "X.X.X", "speculators_version": "X.X.X", - "inputs": ["input_ids", "TODO"], - "extra_args": "TODO", + "inputs": ["input_embeddings", "hidden_states[-2]"], + "inputs_hidden_states_after_layer_norm": false, + "transformer_architecture": "LlamaDecoderLayer", + "transformer_input_type": "projection", + "transformer_include_output_layer_norm": false, + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "num_attention_heads": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "use_cache": true, + "vocab_size": 128256, "speculators_config": { "algorithm": "eagle", "proposal_methods": [ { "proposal_type": "greedy", - "max_new_tokens": 5, + "draft_tokens": 5 }, { "proposal_type": "sample", - "max_new_tokens": 5, + "draft_tokens": 5, "temperature": 0.8, "top_p": 0.5 }, { "proposal_type": "tree", - "extra_args": "TODO" + "tree_type": "static", + "initial_branching_factor": 4, + "branching_factor": 2, + "depth": 5 } ], "default_proposal_method": "tree", @@ -283,24 +331,57 @@ This section provides examples of `config.json` files for popular speculative de "torch_dtype": "bfloat16", "transformers_version": "X.X.X", "speculators_version": "X.X.X", - "inputs": ["input_ids", "TODO"], - "extra_args": "TODO", + "inputs": ["input_embeddings", "hidden_states[-2]"], + "inputs_hidden_states_after_layer_norm": false, + "transformer_architecture": "LlamaDecoderLayer", + "transformer_input_type": "projection", + "transformer_include_output_layer_norm": false, + "attention_bias": false, + 
"attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "num_attention_heads": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "use_cache": true, + "vocab_size": 128256, "speculators_config": { "algorithm": "eagle_2", "proposal_methods": [ { "proposal_type": "greedy", - "max_new_tokens": 5, + "draft_tokens": 5 }, { "proposal_type": "sample", - "max_new_tokens": 5, + "draft_tokens": 5, "temperature": 0.8, "top_p": 0.5 }, { "proposal_type": "tree", - "extra_args": "TODO" + "tree_type": "context_aware", + "max_tokens": 64, + "max_branching_factor": 5, + "max_depth": 5 } ], "default_proposal_method": "tree", @@ -326,29 +407,62 @@ This section provides examples of `config.json` files for popular speculative de ```json { - "architectures": ["TransformerDraftModel"], - "model_type": "transformer_draft_model", + "architectures": ["FusedTransformerDraftModel"], + "model_type": "fused_transformer_draft_model", "torch_dtype": "bfloat16", "transformers_version": "X.X.X", "speculators_version": "X.X.X", - "inputs": ["input_ids", "TODO"], - "extra_args": "TODO", + "inputs": ["input_ids", "hidden[3]", "hidden[12]", "hidden[-2]"], + "inputs_hidden_states_after_layer_norm": false, + "transformer_architecture": "LlamaDecoderLayer", + "transformer_input_type": "projection", + "transformer_include_output_layer_norm": false, + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 8192, + "intermediate_size": 14336, + "max_position_embeddings": 131072, 
+ "mlp_bias": false, + "num_attention_heads": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "use_cache": true, + "vocab_size": 128256, "speculators_config": { "algorithm": "eagle_3", "proposal_methods": [ { "proposal_type": "greedy", - "max_new_tokens": 5, + "draft_tokens": 5 }, { "proposal_type": "sample", - "max_new_tokens": 5, + "draft_tokens": 5, "temperature": 0.8, "top_p": 0.5 }, { "proposal_type": "tree", - "extra_args": "TODO" + "tree_type": "context_aware", + "max_tokens": 64, + "max_branching_factor": 5, + "max_depth": 5 } ], "default_proposal_method": "tree", @@ -379,24 +493,57 @@ This section provides examples of `config.json` files for popular speculative de "torch_dtype": "bfloat16", "transformers_version": "X.X.X", "speculators_version": "X.X.X", - "inputs": ["input_ids", "TODO"], - "extra_args": "TODO", + "inputs": ["input_embeddings", "hidden_states[-2]"], + "inputs_hidden_states_after_layer_norm": false, + "transformer_architecture": "LlamaDecoderLayer", + "transformer_input_type": "projection_with_bias", + "transformer_include_output_layer_norm": false, + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "num_attention_heads": 32, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "use_cache": true, + "vocab_size": 128256, "speculators_config": { "algorithm": "hass", "proposal_methods": [ { "proposal_type": "greedy", - 
"max_new_tokens": 5, + "draft_tokens": 5 }, { "proposal_type": "sample", - "max_new_tokens": 5, + "draft_tokens": 5, "temperature": 0.8, "top_p": 0.5 }, { "proposal_type": "tree", - "extra_args": "TODO" + "tree_type": "context_aware", + "max_tokens": 64, + "max_branching_factor": 5, + "max_depth": 5 } ], "default_proposal_method": "tree", @@ -427,23 +574,22 @@ This section provides examples of `config.json` files for popular speculative de "torch_dtype": "bfloat16", "transformers_version": "X.X.X", "speculators_version": "X.X.X", - "inputs": ["input_ids", "TODO"], - "extra_args": "TODO", + "inputs": ["input_embeddings", "hidden_states[-1]"], + "inputs_hidden_states_after_layer_norm": false, + "hidden_size": 4096, + "intermediate_size": 4096, + "vocab_size": 128256, + "draft_tokens": 5, + "tie_weights": false, "speculators_config": { "algorithm": "mlp_speculator", "proposal_methods": [ { "proposal_type": "greedy", - "max_new_tokens": 5, - }, - { - "proposal_type": "sample", - "max_new_tokens": 5, - "temperature": 0.8, - "top_p": 0.5 + "draft_tokens": 5 } ], - "default_proposal_method": "sample", + "default_proposal_method": "greedy", "verifier": { "name_or_path": "meta-llama/Llama-3.1-8B-Instruct", "architectures": ["LlamaForCausalLM"],