✨ feat: Add support for Xiaomi MiMo model (#306)

johnmai-dev · web-flow · commit 3013e713017f · 2025-05-02T14:01:31.000-07:00
* ✨ feat: Add support for MiMo model
diff --git a/Libraries/MLXLLM/Documentation.docc/Documentation.md b/Libraries/MLXLLM/Documentation.docc/Documentation.md
@@ -30,6 +30,5 @@ Example implementations of various Large Language Models (LLMs).
 - ``Qwen2Model``
 - ``Qwen3Model``
 - ``Starcoder2Model``
+- ``MiMoModel``
 - ``GLM4Model``
-
-
diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift
@@ -43,6 +43,7 @@ public class LLMTypeRegistry: ModelTypeRegistry, @unchecked Sendable {
             "openelm": create(OpenElmConfiguration.self, OpenELMModel.init),
             "internlm2": create(InternLM2Configuration.self, InternLM2Model.init),
             "granite": create(GraniteConfiguration.self, GraniteModel.init),
+            "mimo": create(MiMoConfiguration.self, MiMoModel.init),
             "glm4": create(GLM4Configuration.self, GLM4Model.init),
         ]
     }
@@ -200,6 +201,11 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
         defaultPrompt: ""
     )
 
+    static public let mimo_7b_sft_4bit = ModelConfiguration(
+        id: "mlx-community/MiMo-7B-SFT-4bit",
+        defaultPrompt: "Why is the sky blue?"
+    )
+
     static public let glm4_9b_4bit = ModelConfiguration(
         id: "mlx-community/GLM-4-9B-0414-4bit",
         defaultPrompt: "Why is the sky blue?"
@@ -231,6 +237,7 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
             qwen3_4b_4bit,
             qwen3_8b_4bit,
             smolLM_135M_4bit,
+            mimo_7b_sft_4bit,
             glm4_9b_4bit,
         ]
     }
diff --git a/Libraries/MLXLLM/Models/MiMo.swift b/Libraries/MLXLLM/Models/MiMo.swift
@@ -0,0 +1,273 @@
+//
+//  MiMo.swift
+//  LLM
+//
+//  Created by John Mai on 2025/5/3.
+//
+
+import Foundation
+import MLX
+import MLXFast
+import MLXLMCommon
+import MLXNN
+
+/// Attention layer for MiMo, built from four linear projections and a `RoPE` instance.
+///
+/// The query, key and value projections are created with a bias; the output
+/// projection is created without one.
+private class Attention: Module {
+    let args: MiMoConfiguration
+
+    /// Scale handed to the attention kernel: `headDim^-0.5`, where
+    /// `headDim = hiddenSize / attentionHeads`.
+    let scale: Float
+
+    @ModuleInfo(key: "q_proj") var wq: Linear
+    @ModuleInfo(key: "k_proj") var wk: Linear
+    @ModuleInfo(key: "v_proj") var wv: Linear
+    @ModuleInfo(key: "o_proj") var wo: Linear
+
+    let rope: RoPE
+
+    /// Builds the projections and the `RoPE` instance from `args`.
+    ///
+    /// When `args.ropeScaling` has `type == "linear"` and a `factor` entry, the
+    /// `RoPE` scale is `1 / factor`; otherwise it is `1`. A `factor` that cannot be
+    /// read as a float stops the process with `fatalError`.
+    public init(_ args: MiMoConfiguration) {
+        self.args = args
+
+        let modelWidth = args.hiddenSize
+        let queryHeadCount = args.attentionHeads
+        let keyValueHeadCount = args.kvHeads
+
+        let perHeadWidth = args.hiddenSize / queryHeadCount
+        self.scale = pow(Float(perHeadWidth), -0.5)
+
+        _wq.wrappedValue = Linear(modelWidth, queryHeadCount * perHeadWidth, bias: true)
+        _wk.wrappedValue = Linear(modelWidth, keyValueHeadCount * perHeadWidth, bias: true)
+        _wv.wrappedValue = Linear(modelWidth, keyValueHeadCount * perHeadWidth, bias: true)
+        _wo.wrappedValue = Linear(queryHeadCount * perHeadWidth, modelWidth, bias: false)
+
+        let positionScale: Float
+        if let scaling = args.ropeScaling,
+            scaling["type"] == .string("linear"),
+            let rawFactor = scaling["factor"]
+        {
+            guard let factor = rawFactor.asFloat() else {
+                fatalError("ropeScaling.factor must be a float")
+            }
+            positionScale = 1 / factor
+        } else {
+            positionScale = 1
+        }
+
+        self.rope = RoPE(
+            dimensions: perHeadWidth,
+            traditional: args.ropeTraditional,
+            base: args.ropeTheta,
+            scale: positionScale)
+    }
+
+    /// Runs attention over `x`.
+    ///
+    /// - Parameters:
+    ///   - x: input array; its first two dimensions are read as `(B, L)` and reused
+    ///     to reshape the projections and the output.
+    ///   - mask: optional mask forwarded unchanged to `MLXFast.scaledDotProductAttention`.
+    ///   - cache: optional key/value cache. When present, `rope` is applied at
+    ///     `cache.offset` and the keys/values used for attention are the ones
+    ///     returned by `cache.update(keys:values:)`.
+    /// - Returns: the output projection applied to the attention result reshaped to `(B, L, -1)`.
+    public func callAsFunction(
+        _ x: MLXArray, mask: MLXArray? = nil, cache: KVCache?
+    ) -> MLXArray {
+        let batch = x.dim(0)
+        let length = x.dim(1)
+
+        // (B, L, heads * headDim) -> (B, heads, L, headDim)
+        func splitHeads(_ projected: MLXArray, _ headCount: Int) -> MLXArray {
+            projected.reshaped(batch, length, headCount, -1).transposed(0, 2, 1, 3)
+        }
+
+        var q = splitHeads(wq(x), args.attentionHeads)
+        var k = splitHeads(wk(x), args.kvHeads)
+        var v = splitHeads(wv(x), args.kvHeads)
+
+        if let cache {
+            // Read the offset before `update`, so queries and keys share the same position.
+            let position = cache.offset
+            q = rope(q, offset: position)
+            k = rope(k, offset: position)
+            (k, v) = cache.update(keys: k, values: v)
+        } else {
+            q = rope(q)
+            k = rope(k)
+        }
+
+        let attended = MLXFast.scaledDotProductAttention(
+            queries: q, keys: k, values: v, scale: scale, mask: mask)
+
+        // (B, heads, L, headDim) -> (B, L, heads * headDim)
+        let merged = attended.transposed(0, 2, 1, 3).reshaped(batch, length, -1)
+
+        return wo(merged)
+    }
+}
+
+/// Gated feed-forward block: `down(silu(gate(x)) * up(x))`.
+///
+/// All three projections are created without a bias.
+private class MLP: Module, UnaryLayer {
+    @ModuleInfo(key: "gate_proj") var gate: Linear
+    @ModuleInfo(key: "down_proj") var down: Linear
+    @ModuleInfo(key: "up_proj") var up: Linear
+
+    /// - Parameters:
+    ///   - dimensions: input size of `gate`/`up` and output size of `down`.
+    ///   - hiddenDimensions: output size of `gate`/`up` and input size of `down`.
+    public init(dimensions: Int, hiddenDimensions: Int) {
+        _gate.wrappedValue = Linear(dimensions, hiddenDimensions, bias: false)
+        _down.wrappedValue = Linear(hiddenDimensions, dimensions, bias: false)
+        _up.wrappedValue = Linear(dimensions, hiddenDimensions, bias: false)
+    }
+
+    /// Applies the gated projection to `x` and projects the product back with `down`.
+    public func callAsFunction(_ x: MLXArray) -> MLXArray {
+        let activatedGate = silu(gate(x))
+        let expanded = up(x)
+        return down(activatedGate * expanded)
+    }
+}
+
+/// One decoder layer: normalised attention and a normalised MLP, each added back
+/// onto its own input.
+private class TransformerBlock: Module {
+    @ModuleInfo(key: "self_attn") var attention: Attention
+    let mlp: MLP
+
+    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: RMSNorm
+    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: RMSNorm
+
+    /// Builds the attention, MLP and both `RMSNorm` layers from `args`.
+    public init(_ args: MiMoConfiguration) {
+        let width = args.hiddenSize
+        let epsilon = args.rmsNormEps
+
+        _attention.wrappedValue = Attention(args)
+        self.mlp = MLP(dimensions: width, hiddenDimensions: args.intermediateSize)
+        _inputLayerNorm.wrappedValue = RMSNorm(dimensions: width, eps: epsilon)
+        _postAttentionLayerNorm.wrappedValue = RMSNorm(dimensions: width, eps: epsilon)
+    }
+
+    /// Computes `h = x + attention(inputLayerNorm(x))` and returns
+    /// `h + mlp(postAttentionLayerNorm(h))`.
+    ///
+    /// `mask` and `cache` are forwarded unchanged to the attention layer.
+    public func callAsFunction(
+        _ x: MLXArray, mask: MLXArray? = nil, cache: KVCache?
+    ) -> MLXArray {
+        let attended = attention(inputLayerNorm(x), mask: mask, cache: cache)
+        let h = x + attended
+        let fedForward = mlp(postAttentionLayerNorm(h))
+        return h + fedForward
+    }
+}
+
+/// Token embedding, the stack of `TransformerBlock`s and the final `RMSNorm`.
+private class MiMoModelInner: Module {
+    @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding
+
+    fileprivate let layers: [TransformerBlock]
+    let norm: RMSNorm
+
+    /// Copied from the configuration; nothing in this class reads it.
+    let numNextnPredictLayers: Int
+
+    /// Builds `args.hiddenLayers` transformer blocks around an embedding of
+    /// `args.vocabularySize` rows. Traps if `args.vocabularySize` is not positive.
+    public init(_ args: MiMoConfiguration) {
+        precondition(args.vocabularySize > 0)
+
+        _embedTokens.wrappedValue = Embedding(
+            embeddingCount: args.vocabularySize, dimensions: args.hiddenSize)
+
+        var blocks: [TransformerBlock] = []
+        for _ in 0 ..< args.hiddenLayers {
+            blocks.append(TransformerBlock(args))
+        }
+        self.layers = blocks
+
+        self.norm = RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
+        self.numNextnPredictLayers = args.numNextnPredictLayers
+    }
+
+    /// Embeds `inputs`, runs every layer in order and applies the final norm.
+    ///
+    /// - Parameters:
+    ///   - inputs: array passed to `embedTokens`.
+    ///   - cache: optional per-layer caches; layer `i` receives `cache?[i]`.
+    /// - Returns: `norm` applied to the output of the last layer.
+    public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
+        var hidden = embedTokens(inputs)
+
+        // The mask is created once from the embedded input and shared by all layers.
+        let mask = createAttentionMask(h: hidden, cache: cache)
+
+        for index in layers.indices {
+            hidden = layers[index](hidden, mask: mask, cache: cache?[index])
+        }
+
+        return norm(hidden)
+    }
+}
+
+/// MiMo language model: `MiMoModelInner` followed by an output projection.
+///
+/// The output projection is a separate `lm_head` linear layer unless
+/// `tieWordEmbeddings` is set, in which case the token embedding is reused
+/// through `asLinear`.
+public class MiMoModel: Module, LLMModel, KVCacheDimensionProvider {
+    public let vocabularySize: Int
+
+    /// One entry per hidden layer, each equal to the configured `kvHeads`.
+    public let kvHeads: [Int]
+
+    private let model: MiMoModelInner
+    let configuration: MiMoConfiguration
+
+    @ModuleInfo(key: "lm_head") var lmHead: Linear?
+
+    /// Builds the model from `args`; `lm_head` is only created when the word
+    /// embeddings are not tied.
+    public init(_ args: MiMoConfiguration) {
+        self.configuration = args
+        self.vocabularySize = args.vocabularySize
+        self.kvHeads = Array(repeating: args.kvHeads, count: args.hiddenLayers)
+        self.model = MiMoModelInner(args)
+
+        if !args.tieWordEmbeddings {
+            _lmHead.wrappedValue = Linear(args.hiddenSize, args.vocabularySize, bias: false)
+        }
+    }
+
+    /// Runs the inner model on `inputs` and applies the output projection.
+    ///
+    /// - Parameters:
+    ///   - inputs: array forwarded to the inner model.
+    ///   - cache: optional per-layer caches forwarded to the inner model.
+    /// - Returns: `lmHead` applied to the inner model's output when `lmHead` exists,
+    ///   otherwise `embedTokens.asLinear` applied to it.
+    public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
+        let hidden = model(inputs, cache: cache)
+
+        guard let lmHead else {
+            return model.embedTokens.asLinear(hidden)
+        }
+        return lmHead(hidden)
+    }
+
+    /// Returns `weights` without the entries this model does not load.
+    ///
+    /// Dropped entries: `lm_head.weight` when the word embeddings are tied, any key
+    /// containing `self_attn.rotary_emb.inv_freq`, and any key starting with
+    /// `model.mtp_layers.`.
+    public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
+        let dropsOutputHead = configuration.tieWordEmbeddings
+
+        // Remove unused precomputed rotary freqs and mtp_layers
+        return weights.filter { key, _ in
+            if dropsOutputHead && key == "lm_head.weight" {
+                return false
+            }
+            if key.contains("self_attn.rotary_emb.inv_freq") {
+                return false
+            }
+            return !key.hasPrefix("model.mtp_layers.")
+        }
+    }
+}
+
+/// Decodable description of a MiMo checkpoint's configuration.
+///
+/// Each property maps to the snake_case key listed in `CodingKeys`. Keys that may
+/// be absent fall back to the defaults applied in `init(from:)`.
+public struct MiMoConfiguration: Codable, Sendable {
+    var hiddenSize: Int
+    var hiddenLayers: Int
+    var intermediateSize: Int
+    var attentionHeads: Int
+    var rmsNormEps: Float
+    var vocabularySize: Int
+    var kvHeads: Int
+    var maxPositionEmbeddings: Int
+    var ropeTheta: Float
+    var ropeTraditional: Bool
+    var ropeScaling: [String: StringOrNumber]?
+    var tieWordEmbeddings: Bool
+    var numNextnPredictLayers: Int
+
+    enum CodingKeys: String, CodingKey {
+        case hiddenSize = "hidden_size"
+        case hiddenLayers = "num_hidden_layers"
+        case intermediateSize = "intermediate_size"
+        case attentionHeads = "num_attention_heads"
+        case rmsNormEps = "rms_norm_eps"
+        case vocabularySize = "vocab_size"
+        case kvHeads = "num_key_value_heads"
+        case maxPositionEmbeddings = "max_position_embeddings"
+        case ropeTheta = "rope_theta"
+        case ropeTraditional = "rope_traditional"
+        case ropeScaling = "rope_scaling"
+        case tieWordEmbeddings = "tie_word_embeddings"
+        case numNextnPredictLayers = "num_nextn_predict_layers"
+    }
+
+    /// Decodes the configuration.
+    ///
+    /// Required keys throw when missing. Optional keys default to:
+    /// `max_position_embeddings` 32768, `rope_theta` 10000, `rope_traditional` false,
+    /// `rope_scaling` nil, `tie_word_embeddings` false, `num_nextn_predict_layers` 2.
+    public init(from decoder: Decoder) throws {
+        let values = try decoder.container(keyedBy: CodingKeys.self)
+
+        // Required keys.
+        hiddenSize = try values.decode(Int.self, forKey: .hiddenSize)
+        hiddenLayers = try values.decode(Int.self, forKey: .hiddenLayers)
+        intermediateSize = try values.decode(Int.self, forKey: .intermediateSize)
+        attentionHeads = try values.decode(Int.self, forKey: .attentionHeads)
+        rmsNormEps = try values.decode(Float.self, forKey: .rmsNormEps)
+        vocabularySize = try values.decode(Int.self, forKey: .vocabularySize)
+        kvHeads = try values.decode(Int.self, forKey: .kvHeads)
+
+        // Optional keys with fallbacks.
+        let decodedMaxPositions = try values.decodeIfPresent(
+            Int.self, forKey: .maxPositionEmbeddings)
+        maxPositionEmbeddings = decodedMaxPositions ?? 32768
+
+        let decodedTheta = try values.decodeIfPresent(Float.self, forKey: .ropeTheta)
+        ropeTheta = decodedTheta ?? 10000.0
+
+        let decodedTraditional = try values.decodeIfPresent(Bool.self, forKey: .ropeTraditional)
+        ropeTraditional = decodedTraditional ?? false
+
+        ropeScaling = try values.decodeIfPresent(
+            [String: StringOrNumber].self, forKey: .ropeScaling)
+
+        let decodedTied = try values.decodeIfPresent(Bool.self, forKey: .tieWordEmbeddings)
+        tieWordEmbeddings = decodedTied ?? false
+
+        let decodedNextn = try values.decodeIfPresent(Int.self, forKey: .numNextnPredictLayers)
+        numNextnPredictLayers = decodedNextn ?? 2
+    }
+}
+
+// MARK: - LoRA
+
+extension MiMoModel: LoRAModel {
+    /// Pairs every layer's attention module with the projection keys
+    /// `q_proj` and `v_proj`.
+    public func loraLinearLayers() -> LoRALinearLayers {
+        let targetKeys = ["q_proj", "v_proj"]
+        return model.layers.map { block in (block.attention, targetKeys) }
+    }
+}
diff --git a/Libraries/MLXLLM/README.md b/Libraries/MLXLLM/README.md
@@ -56,6 +56,7 @@ Currently supported model types are:
 - Qwen2
 - Qwen3
 - Starcoder2
+- MiMo
 - GLM4
 
 See [llm-tool](../../Tools/llm-tool)

Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ public class LLMTypeRegistry: ModelTypeRegistry, @unchecked Sendable {`
`43`	`43`	`"openelm": create(OpenElmConfiguration.self, OpenELMModel.init),`
`44`	`44`	`"internlm2": create(InternLM2Configuration.self, InternLM2Model.init),`
`45`	`45`	`"granite": create(GraniteConfiguration.self, GraniteModel.init),`
	`46`	`+ "mimo": create(MiMoConfiguration.self, MiMoModel.init),`
`46`	`47`	`"glm4": create(GLM4Configuration.self, GLM4Model.init),`
`47`	`48`	`]`
`48`	`49`	`}`
`@@ -200,6 +201,11 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {`
`200`	`201`	`defaultPrompt: ""`
`201`	`202`	`)`
`202`	`203`
	`204`	`+ static public let mimo_7b_sft_4bit = ModelConfiguration(`
	`205`	`+ id: "mlx-community/MiMo-7B-SFT-4bit",`
	`206`	`+ defaultPrompt: "Why is the sky blue?"`
	`207`	`+ )`
	`208`	`+`
`203`	`209`	`static public let glm4_9b_4bit = ModelConfiguration(`
`204`	`210`	`id: "mlx-community/GLM-4-9B-0414-4bit",`
`205`	`211`	`defaultPrompt: "Why is the sky blue?"`
`@@ -231,6 +237,7 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {`
`231`	`237`	`qwen3_4b_4bit,`
`232`	`238`	`qwen3_8b_4bit,`
`233`	`239`	`smolLM_135M_4bit,`
	`240`	`+ mimo_7b_sft_4bit,`
`234`	`241`	`glm4_9b_4bit,`
`235`	`242`	`]`
`236`	`243`	`}`