Support DeepSeekV3-style block FP8 quantization #372
@@ -14,7 +14,7 @@
 import warnings
 from enum import Enum
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import torch
 from compressed_tensors.utils import Aliasable
@@ -153,8 +153,8 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     :param symmetric: whether or not quantization scale is symmetric about zero-point
     :param strategy: string id determining the scope of scale/zero-point to apply
     :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block strategy, must be
-        of the format "2x4", "8x16", etc.
+    :param block_structure: 2d block structure to use for the block strategy; must be
+        a list of two ints [rows, cols] like [128, 128].
     :param dynamic: set True to perform dynamic quantization - values will not be
         calibrated during calibration phase, instead during inference new quantization
         ranges will be observed with every sample. Defaults to False for static
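For orientation: a [128, 128] block structure means one quantization scale per 128x128 tile of the weight matrix, which is the DeepSeek-V3-style weight scheme this PR targets. The sketch below is illustration only, not code from this diff; it assumes the weight dimensions divide evenly by the block sizes and uses 448.0 (the FP8 E4M3 maximum) as the target range.

import torch

def per_block_scales(weight: torch.Tensor, block_structure=(128, 128)) -> torch.Tensor:
    # One absmax-derived scale per [block_rows, block_cols] tile of a 2D weight.
    block_rows, block_cols = block_structure
    rows, cols = weight.shape
    tiles = weight.reshape(rows // block_rows, block_rows, cols // block_cols, block_cols)
    absmax = tiles.abs().amax(dim=(1, 3))  # shape: [rows // block_rows, cols // block_cols]
    return absmax / 448.0                  # 448.0 = max representable value in FP8 E4M3

scales = per_block_scales(torch.randn(1024, 512))
print(scales.shape)  # torch.Size([8, 4]): one scale per 128x128 block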
@@ -169,7 +169,7 @@ class QuantizationArgs(BaseModel, use_enum_values=True):
     symmetric: bool = True
     group_size: Optional[int] = None
     strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
+    block_structure: Optional[List[int]] = None
     dynamic: Union[DynamicType, bool] = False
     actorder: Union[ActivationOrdering, bool, None] = None
     observer: Optional[str] = Field(
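With the field typed as Optional[List[int]], a block FP8 weight scheme can be declared with the list form directly. A rough sketch, not taken from this diff; it assumes the usual compressed_tensors.quantization import path and that "block" and "float" are the relevant strategy/type strings:

from compressed_tensors.quantization import QuantizationArgs

weight_args = QuantizationArgs(
    num_bits=8,
    type="float",                # FP8
    symmetric=True,
    strategy="block",            # per-block scales
    block_structure=[128, 128],  # new list form, replacing the "128x128" string
)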
@@ -207,6 +207,28 @@ def validate_group(cls, value) -> Union[int, None]:

         return value

+    @field_validator("block_structure", mode="before")
+    def validate_block_structure(cls, value) -> Optional[List[int]]:
+        if value is None:
+            return value
+        # For backward compatibility, allow string format "2x4", "8x16", etc.
+        if isinstance(value, str):
+            try:
+                return [int(x) for x in value.split("x")]
+            except Exception:
+                raise ValueError(
+                    f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+                )
+        if isinstance(value, (list, tuple)):
+            if len(value) != 2 or not all(isinstance(v, int) for v in value):
+                raise ValueError(
+                    f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+                )
+            return list(value)
+        raise ValueError(
+            f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
+        )
+
     @field_validator("strategy", mode="before")
     def validate_strategy(cls, value) -> Union[QuantizationStrategy, None]:
         if isinstance(value, str):
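The before-mode validator keeps existing configs loadable: legacy strings such as "128x128" are parsed into the new list form, while malformed values fail at construction time. Illustration only, under the same import-path assumption as above:

from compressed_tensors.quantization import QuantizationArgs

new_style = QuantizationArgs(strategy="block", block_structure=[128, 128])
legacy = QuantizationArgs(strategy="block", block_structure="128x128")
assert new_style.block_structure == legacy.block_structure == [128, 128]

# Malformed values are rejected by validate_block_structure:
#   block_structure=[128]   -> ValueError (needs exactly two ints)
#   block_structure="128x"  -> ValueError (the empty piece cannot be parsed as an int)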
@@ -277,14 +299,15 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":

         # infer observer w.r.t. dynamic
         if dynamic:
-            if strategy not in (
+            supported_strategies = (
                 QuantizationStrategy.TOKEN,
                 QuantizationStrategy.TENSOR,
                 QuantizationStrategy.TENSOR_GROUP,
-            ):
+                QuantizationStrategy.GROUP,
+            )
+            if strategy not in supported_strategies:
                 raise ValueError(
-                    f"One of {(QuantizationStrategy.TOKEN, QuantizationStrategy.TENSOR, QuantizationStrategy.TENSOR_GROUP)} "
-                    "must be used for dynamic quantization",
+                    f"One of {supported_strategies} must be used for dynamic quantization"
                 )

         if (

Review comment (on the QuantizationStrategy.GROUP addition): This is mostly an aesthetic choice, but it might have aesthetic consequences if vllm wants to support fused input-weight quantization. Ex

Review comment: Might want to add some validation on quant_scheme related to this as well
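Allowing QuantizationStrategy.GROUP under dynamic=True is what the activation side of a DeepSeek-V3-style recipe needs: inputs quantized at runtime in groups of 128 values, paired with the [128, 128] weight blocks above. A rough sketch of such an activation scheme (field values are illustrative, not taken from this diff):

from compressed_tensors.quantization import QuantizationArgs

input_args = QuantizationArgs(
    num_bits=8,
    type="float",      # FP8
    symmetric=True,
    strategy="group",  # now accepted together with dynamic=True
    group_size=128,    # one scale per 128 input features
    dynamic=True,      # scales observed per sample at inference, no calibration
)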