Commit 3578e79

[Model] Keep vision encoder weights unquantized to maintain accuracy (#3028)

This PR excludes vision encoder layers from quantization: CLIPVisionModel declares a no_quantization flag, and the group quantizer returns any module carrying that flag unchanged, preserving accuracy for models with vision components.
Parent: 967fb76

File tree

2 files changed (+5, -0 lines)

python/mlc_llm/model/vision/clip_vision.py

Lines changed: 2 additions & 0 deletions

@@ -218,6 +218,8 @@ def forward(self, pixel_values: Tensor) -> Tensor:
 
 
 class CLIPVisionModel(Module):
+    no_quantization: bool = True
+
     def __init__(self, config: CLIPVisionConfig):
         super().__init__()
         self.vision_model = CLIPVisionTransformer(config)
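
Declaring no_quantization as a class attribute (rather than assigning it in __init__) means every CLIPVisionModel instance, including any subclass, carries the marker without further changes; the quantizer hunk below reads it with getattr, so modules that never define the attribute keep their existing behavior.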

python/mlc_llm/quantization/group_quantization.py

Lines changed: 3 additions & 0 deletions

@@ -111,6 +111,9 @@ def visit_module(self, name: str, node: nn.Module) -> Any:
         ret_node: Any
             The new node to replace current node.
         """
+        if getattr(node, "no_quantization", False):
+            return node
+
         if (
             isinstance(node, nn.Linear)
             and (not is_final_fc(name) or self.config.quantize_final_fc)
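
Taken together, the two hunks implement a simple opt-out: visit_module returns any module flagged no_quantization before the usual nn.Linear rewrite can run, so the vision encoder's weights stay in full precision. Below is a minimal, self-contained sketch of the same pattern; Module, Linear, QuantizedLinear, VisionEncoder, and this standalone visit_module are illustrative stand-ins, not MLC LLM's actual classes.

# Toy stand-ins for the opt-out pattern (hypothetical names, not MLC LLM's API).

class Module:
    """Toy base class standing in for a neural-network module."""


class Linear(Module):
    def __init__(self, in_features: int, out_features: int) -> None:
        self.in_features = in_features
        self.out_features = out_features


class QuantizedLinear(Module):
    """Stands in for the group-quantized replacement of a Linear layer."""

    def __init__(self, original: Linear) -> None:
        self.in_features = original.in_features
        self.out_features = original.out_features


class VisionEncoder(Module):
    # Same marker the commit adds to CLIPVisionModel: the quantizer
    # sees this attribute and leaves the module untouched.
    no_quantization: bool = True

    def __init__(self) -> None:
        self.proj = Linear(768, 768)


def visit_module(node: Module) -> Module:
    # The opt-out check runs first, mirroring the group_quantization.py
    # hunk: flagged modules are returned unchanged.
    if getattr(node, "no_quantization", False):
        return node
    # Otherwise a Linear layer is rewritten into its quantized form.
    if isinstance(node, Linear):
        return QuantizedLinear(node)
    return node


assert isinstance(visit_module(Linear(16, 16)), QuantizedLinear)  # quantized
assert isinstance(visit_module(VisionEncoder()), VisionEncoder)   # skipped

Because the flag is read with getattr(node, "no_quantization", False), the default path is unchanged: only modules that explicitly opt out, such as the vision encoder here, bypass quantization.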
