load gemma3 dwq VLM models as LLM models (#343)

davidkoski · web-flow · commit 7482e98d828f · 2025-06-30T13:37:26.000-07:00
* load gemma3 dwq VLM models as LLM models
diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift
@@ -43,6 +43,7 @@ public class LLMTypeRegistry: ModelTypeRegistry, @unchecked Sendable {
             "openelm": create(OpenElmConfiguration.self, OpenELMModel.init),
             "internlm2": create(InternLM2Configuration.self, InternLM2Model.init),
             "gemma3_text": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
+            "gemma3": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
             "granite": create(GraniteConfiguration.self, GraniteModel.init),
             "mimo": create(MiMoConfiguration.self, MiMoModel.init),
             "glm4": create(GLM4Configuration.self, GLM4Model.init),
diff --git a/Libraries/MLXLLM/Models/Gemma3Text.swift b/Libraries/MLXLLM/Models/Gemma3Text.swift
@@ -49,8 +49,21 @@ public struct Gemma3TextConfiguration: Codable {
         case slidingWindowPattern = "sliding_window_pattern"
     }
 
+    enum VLMCodingKeys: String, CodingKey {
+        case textConfig = "text_config"
+    }
+
     public init(from decoder: Decoder) throws {
-        let container = try decoder.container(keyedBy: CodingKeys.self)
+        let nestedContainer = try decoder.container(keyedBy: VLMCodingKeys.self)
+
+        // in the case of VLM models converted using mlx_lm.convert
+        // the configuration will still match the VLMs and be under text_config
+        let container =
+            if nestedContainer.contains(.textConfig) {
+                try nestedContainer.nestedContainer(keyedBy: CodingKeys.self, forKey: .textConfig)
+            } else {
+                try decoder.container(keyedBy: CodingKeys.self)
+            }
 
         modelType = try container.decode(String.self, forKey: .modelType)
         hiddenSize = try container.decode(Int.self, forKey: .hiddenSize)
@@ -339,6 +352,14 @@ public class Gemma3TextModel: Module, LLMModel {
         -> [String: MLXArray]
     {
         var processedWeights = weights
+
+        // VLM models converted using mlx_vlm.convert will still have
+        // the weights under a language_model key
+        let unflattened = ModuleParameters.unflattened(weights)
+        if let lm = unflattened["language_model"] {
+            processedWeights = Dictionary(uniqueKeysWithValues: lm.flattened())
+        }
+
         if processedWeights["lm_head.weight"] == nil {
             if let embedWeight = processedWeights["model.embed_tokens.weight"] {
                 processedWeights["lm_head.weight"] = embedWeight