Commit 1db9d3a

fix #291 -- support heterogenous quant config (#293)
- support per layer quant config
1 parent 39085b8 · commit 1db9d3a

File tree

10 files changed: +462 −65 lines

Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
// Copyright © 2025 Apple Inc.

import Foundation

/// Base ``LanguageModel`` configuration -- provides `modelType`
/// and `quantization` (used in loading the model).
///
/// This is used by ``ModelFactory/load(hub:configuration:progressHandler:)``
/// to determine the type of model to load.
public struct BaseConfiguration: Codable, Sendable {
    public let modelType: String

    public struct Quantization: Codable, Sendable, Equatable {
        public init(groupSize: Int, bits: Int) {
            self.groupSize = groupSize
            self.bits = bits
        }

        public let groupSize: Int
        public let bits: Int

        public var asTuple: (Int, Int) { (groupSize, bits) }

        enum CodingKeys: String, CodingKey {
            case groupSize = "group_size"
            case bits = "bits"
        }
    }

    /// Handling instructions for ``PerLayerQuantization``.
    public enum QuantizationOption: Sendable {
        case skip
        case quantize(Quantization)
    }

    /// Per-layer ``Quantization`` values with an optional default.
    public struct PerLayerQuantization: Sendable {
        public var quantization: Quantization? = nil
        public var perLayerQuantization: [String: QuantizationOption]

        public init(
            quantization: BaseConfiguration.Quantization? = nil,
            perLayerQuantization: [String: BaseConfiguration.QuantizationOption]
        ) {
            self.quantization = quantization
            self.perLayerQuantization = perLayerQuantization
        }

        /// The quantization to apply for the given layer name, or nil for no quantization.
        public func quantization(layer: String) -> Quantization? {
            if let perLayer = perLayerQuantization[layer] {
                switch perLayer {
                case .skip:
                    return nil
                case .quantize(let quantization):
                    return quantization
                }
            } else {
                return quantization
            }
        }
    }

    /// Special codable to support a mixed key: Int / key: Quantization / key: Bool
    /// structure for heterogeneous quantization, e.g.
    ///
    /// ```
    /// "quantization": {
    ///     "group_size": 64,
    ///     "bits": 4,
    ///     "model.embed_tokens": {
    ///         "group_size": 32,
    ///         "bits": 4
    ///     },
    ///     "model.layers.0.self_attn.q_norm": false
    /// }
    /// ```
    ///
    /// This mixed-type structure requires manual decoding.
    struct QuantizationContainer: Codable, Sendable {
        var quantization: Quantization
        var perLayerQuantization: PerLayerQuantization

        // based on Dictionary's coding key
        internal struct _DictionaryCodingKey: CodingKey {
            internal let stringValue: String
            internal let intValue: Int?

            internal init(stringValue: String) {
                self.stringValue = stringValue
                self.intValue = Int(stringValue)
            }

            internal init(intValue: Int) {
                self.stringValue = "\(intValue)"
                self.intValue = intValue
            }
        }

        init(from decoder: any Decoder) throws {
            // handle the embedded Quantization
            self.quantization = try Quantization(from: decoder)

            // and the interleaved per-layer values
            var perLayerQuantization = [String: QuantizationOption]()
            let container = try decoder.container(keyedBy: _DictionaryCodingKey.self)
            for key in container.allKeys {
                switch key.stringValue {
                case Quantization.CodingKeys.groupSize.rawValue: continue
                case Quantization.CodingKeys.bits.rawValue: continue

                default:
                    if let f = try? container.decode(Bool.self, forKey: key) {
                        if !f {
                            perLayerQuantization[key.stringValue] = .skip
                        }
                    } else {
                        perLayerQuantization[key.stringValue] = .quantize(
                            try container.decode(Quantization.self, forKey: key))
                    }
                }
            }
            self.perLayerQuantization = PerLayerQuantization(
                quantization: quantization, perLayerQuantization: perLayerQuantization)
        }

        func encode(to encoder: any Encoder) throws {
            try quantization.encode(to: encoder)

            var container = encoder.container(keyedBy: _DictionaryCodingKey.self)
            for (key, value) in perLayerQuantization.perLayerQuantization {
                switch value {
                case .skip:
                    try container.encode(false, forKey: .init(stringValue: key))
                case .quantize(let q):
                    try container.encode(q, forKey: .init(stringValue: key))
                }
            }
        }
    }

    var quantizationContainer: QuantizationContainer?

    @available(*, deprecated, message: "Please use perLayerQuantization instead")
    public var quantization: Quantization? {
        quantizationContainer?.quantization
    }

    public var perLayerQuantization: PerLayerQuantization? {
        quantizationContainer?.perLayerQuantization
    }

    enum CodingKeys: String, CodingKey {
        case modelType = "model_type"
        case quantizationContainer = "quantization"
    }
}
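To make the decoding behavior concrete, here is a minimal sketch (not part of the commit) that runs the mixed-type decoder above. The model type and layer names are placeholder values; only `BaseConfiguration` and Foundation are assumed:

import Foundation

// Placeholder config exercising all three key shapes: Int (default
// group_size/bits), Quantization (per-layer override), Bool (skip).
let json = """
    {
        "model_type": "my_model",
        "quantization": {
            "group_size": 64,
            "bits": 4,
            "model.embed_tokens": { "group_size": 32, "bits": 4 },
            "model.layers.0.self_attn.q_norm": false
        }
    }
    """.data(using: .utf8)!

let config = try! JSONDecoder().decode(BaseConfiguration.self, from: json)

if let perLayer = config.perLayerQuantization {
    // explicit override -> Optional((32, 4))
    print(perLayer.quantization(layer: "model.embed_tokens")?.asTuple as Any)
    // `false` in the config -> .skip -> nil
    print(perLayer.quantization(layer: "model.layers.0.self_attn.q_norm") as Any)
    // any other layer falls back to the top-level default -> Optional((64, 4))
    print(perLayer.quantization(layer: "model.layers.1.mlp.down_proj")?.asTuple as Any)
}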

Libraries/Embedders/Configuration.swift

Lines changed: 0 additions & 26 deletions
@@ -110,29 +110,3 @@ public struct ModelType: RawRepresentable, Codable, Sendable {
         try modelTypeRegistry.createModel(configuration: configuration, rawValue: rawValue)
     }
 }
-
-public struct BaseConfiguration: Codable, Sendable {
-    public let modelType: ModelType
-
-    public struct Quantization: Codable, Sendable {
-        public init(groupSize: Int, bits: Int) {
-            self.groupSize = groupSize
-            self.bits = bits
-        }
-
-        let groupSize: Int
-        let bits: Int
-
-        enum CodingKeys: String, CodingKey {
-            case groupSize = "group_size"
-            case bits = "bits"
-        }
-    }
-
-    public var quantization: Quantization?
-
-    enum CodingKeys: String, CodingKey {
-        case modelType = "model_type"
-        case quantization
-    }
-}

Libraries/Embedders/Load.swift

Lines changed: 35 additions & 1 deletion
@@ -62,7 +62,8 @@ func loadSynchronous(modelDirectory: URL) throws -> EmbeddingModel {
     let baseConfig = try JSONDecoder().decode(
         BaseConfiguration.self, from: Data(contentsOf: configurationURL))
 
-    let model = try baseConfig.modelType.createModel(configuration: configurationURL)
+    let modelType = ModelType(rawValue: baseConfig.modelType)
+    let model = try modelType.createModel(configuration: configurationURL)
 
     // load the weights
     var weights = [String: MLXArray]()
@@ -81,6 +82,16 @@ func loadSynchronous(modelDirectory: URL) throws -> EmbeddingModel {
     weights = model.sanitize(weights: weights)
 
     // quantize if needed
+    if let perLayerQuantization = baseConfig.perLayerQuantization {
+        quantize(model: model) { path, module in
+            if weights["\(path).scales"] != nil {
+                return perLayerQuantization.quantization(layer: path)?.asTuple
+            } else {
+                return nil
+            }
+        }
+    }
+
     if let quantization = baseConfig.quantization {
         quantize(model: model, groupSize: quantization.groupSize, bits: quantization.bits) {
             path, module in
@@ -108,3 +119,26 @@ public func loadModelContainer(
     return try await ModelContainer(
         hub: hub, modelDirectory: modelDirectory, configuration: configuration)
 }
+
+// TODO remove once mlx-swift update is adopted
+func quantize(
+    model: Module,
+    filter: (String, Module) -> (groupSize: Int, bits: Int)?,
+    apply: (Module, Int, Int) -> Module? = quantizeSingle(layer:groupSize:bits:)
+) {
+    let updates =
+        model
+        .leafModules()
+        .flattened()
+        .compactMap { (path, m) -> (String, Module)? in
+            if let (groupSize, bits) = filter(path, m) {
+                if let quantized = apply(m, groupSize, bits) {
+                    return (path, quantized)
+                }
+            }
+
+            return nil
+        }
+
+    model.update(modules: ModuleChildren.unflattened(updates))
+}
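The generic quantize helper walks the model's leaf modules and lets the filter closure decide, per path, whether and how to quantize -- returning an optional (groupSize, bits) is what allows mixed group sizes, bit widths, and skipped layers in a single pass. For comparison, the old uniform behavior fits the same API; a sketch, not from the commit, assuming model and weights as in loadSynchronous:

// Uniform 4-bit, group-size-64 quantization expressed via the new
// filter-based helper: quantize exactly the layers whose checkpoint
// contains ".scales" arrays (i.e. those saved in quantized form).
quantize(model: model) { path, module in
    weights["\(path).scales"] != nil ? (groupSize: 64, bits: 4) : nil
}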

Libraries/MLXLLM/LLMModelFactory.swift

Lines changed: 2 additions & 1 deletion
@@ -325,7 +325,8 @@ public class LLMModelFactory: ModelFactory {
 
         // apply the weights to the bare model
         try loadWeights(
-            modelDirectory: modelDirectory, model: model, quantization: baseConfig.quantization)
+            modelDirectory: modelDirectory, model: model,
+            perLayerQuantization: baseConfig.perLayerQuantization)
 
         let tokenizer = try await loadTokenizer(configuration: configuration, hub: hub)
 
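Switching the parameter loses nothing relative to the old call: PerLayerQuantization bundles the top-level default together with the per-layer map. A sketch, not from the commit, of what a callee like loadWeights can recover from the single value:

// Both pieces are available from baseConfig.perLayerQuantization.
if let perLayer = baseConfig.perLayerQuantization {
    let defaultQuant = perLayer.quantization       // top-level group_size / bits
    let overrides = perLayer.perLayerQuantization  // layer name -> skip / quantize
    print(defaultQuant as Any, overrides.count)
}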
