
Commit 2cceb87

Add Gemma 3 (#238)
* Working on KV cache
* Working on Gemma 3
* Add model configurations for all Gemma 3 sizes
1 parent: fb3184c


9 files changed (+1,718 / −55 lines)


Libraries/MLXLLM/LLMModelFactory.swift

Lines changed: 9 additions & 0 deletions
@@ -42,6 +42,7 @@ public class LLMTypeRegistry: ModelTypeRegistry, @unchecked Sendable {
         "cohere": create(CohereConfiguration.self, CohereModel.init),
         "openelm": create(OpenElmConfiguration.self, OpenELMModel.init),
         "internlm2": create(InternLM2Configuration.self, InternLM2Model.init),
+        "gemma3_text": create(Gemma3TextConfiguration.self, Gemma3TextModel.init),
         "granite": create(GraniteConfiguration.self, GraniteModel.init),
         "mimo": create(MiMoConfiguration.self, MiMoModel.init),
         "glm4": create(GLM4Configuration.self, GLM4Model.init),
@@ -197,6 +198,12 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
         defaultPrompt: "What is the difference between a fruit and a vegetable?"
     )
 
+    static public let gemma3_1B_qat_4bit = ModelConfiguration(
+        id: "mlx-community/gemma-3-1b-it-qat-4bit",
+        defaultPrompt: "What is the difference between a fruit and a vegetable?",
+        extraEOSTokens: ["<end_of_turn>"]
+    )
+
     static public let granite3_3_2b_4bit = ModelConfiguration(
         id: "mlx-community/granite-3.3-2b-instruct-4bit",
         defaultPrompt: ""
@@ -244,6 +251,7 @@ public class LLMRegistry: AbstractModelRegistry, @unchecked Sendable {
         qwen3_8b_4bit,
         qwen3MoE_30b_a3b_4bit,
         smolLM_135M_4bit,
+        gemma3_1B_qat_4bit,
         mimo_7b_sft_4bit,
         glm4_9b_4bit,
         acereason_7b_4bit,
@@ -275,6 +283,7 @@ private struct LLMUserInputProcessor: UserInputProcessor {
         do {
             let promptTokens = try tokenizer.applyChatTemplate(
                 messages: messages, tools: input.tools, additionalContext: input.additionalContext)
+
             return LMInput(tokens: MLXArray(promptTokens))
         } catch TokenizerError.missingChatTemplate {
             print(
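The new "gemma3_text" entry lets the factory decode checkpoints whose config declares that model type into Gemma3TextModel, and gemma3_1B_qat_4bit points the registry at the quantized 1B instruct checkpoint on the Hub. A minimal usage sketch follows; it assumes the library's LLMModelFactory.shared singleton and its async loadContainer(configuration:) API, which are not shown in this diff, so treat those names as assumptions rather than part of this commit:

import MLXLLM
import MLXLMCommon

// Loading sketch: only LLMRegistry.gemma3_1B_qat_4bit is introduced by this
// commit; LLMModelFactory.shared and loadContainer(configuration:) are assumed
// from the surrounding library.
func loadGemma3() async throws -> ModelContainer {
    // The registry entry carries the Hub id, a default prompt, and the extra
    // "<end_of_turn>" EOS token that Gemma uses to terminate a turn.
    try await LLMModelFactory.shared.loadContainer(
        configuration: LLMRegistry.gemma3_1B_qat_4bit)
}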

Libraries/MLXLLM/Models/Gemma.swift

Lines changed: 6 additions & 6 deletions
@@ -108,15 +108,15 @@ private class TransformerBlock: Module {
     @ModuleInfo(key: "self_attn") var attention: Attention
     let mlp: MLP
 
-    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: RMSNorm
-    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: RMSNorm
+    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: Gemma.RMSNorm
+    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: Gemma.RMSNorm
 
     public init(_ args: GemmaConfiguration) {
         self._attention.wrappedValue = Attention(args)
         self.mlp = MLP(dimensions: args.hiddenSize, hiddenDimensions: args.intermediateSize)
-        self._inputLayerNorm.wrappedValue = RMSNorm(
+        self._inputLayerNorm.wrappedValue = Gemma.RMSNorm(
             dimensions: args.hiddenSize, eps: args.rmsNormEps)
-        self._postAttentionLayerNorm.wrappedValue = RMSNorm(
+        self._postAttentionLayerNorm.wrappedValue = Gemma.RMSNorm(
             dimensions: args.hiddenSize, eps: args.rmsNormEps)
     }
 
@@ -137,7 +137,7 @@ private class GemmaModelInner: Module {
 
     @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding
     fileprivate let layers: [TransformerBlock]
-    fileprivate let norm: RMSNorm
+    fileprivate let norm: Gemma.RMSNorm
 
     public init(_ args: GemmaConfiguration) {
         precondition(args.vocabularySize > 0)
@@ -153,7 +153,7 @@ private class GemmaModelInner: Module {
             .map { _ in
                 TransformerBlock(args)
             }
-        self.norm = RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
+        self.norm = Gemma.RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
     }
 
     public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
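Both Gemma.swift and Gemma2.swift now reference a shared Gemma.RMSNorm rather than each keeping a private copy. Its definition is not part of this excerpt; the sketch below reconstructs what it presumably looks like from the private RMSNorm that Gemma2.swift deletes further down, with the Gemma enum namespace being an assumption about where the shared type lives:

import MLX
import MLXFast
import MLXNN

// Sketch of the shared norm, mirroring the private RMSNorm removed from
// Gemma2.swift below. Placement inside a `Gemma` namespace is assumed.
public enum Gemma {
    /// Gemma-style RMSNorm: the learned weight is stored as a residual and
    /// applied as (1 + weight), matching the reference implementation.
    public class RMSNorm: Module, UnaryLayer {
        let weight: MLXArray
        let eps: Float

        public init(dimensions: Int, eps: Float = 1e-5) {
            self.weight = MLXArray.ones([dimensions])
            self.eps = eps
            super.init()
        }

        public func callAsFunction(_ x: MLXArray) -> MLXArray {
            MLXFast.rmsNorm(x, weight: 1.0 + self.weight, eps: self.eps)
        }
    }
}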

Libraries/MLXLLM/Models/Gemma2.swift

Lines changed: 10 additions & 26 deletions
@@ -8,22 +8,6 @@ import Tokenizers
 
 // Port of https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/models/gemma2.py
 
-// Specialized norm for gemma
-private class RMSNorm: Module, UnaryLayer {
-    let weight: MLXArray
-    let eps: Float
-
-    public init(dimensions: Int, eps: Float = 1e-5) {
-        self.weight = MLXArray.ones([dimensions])
-        self.eps = eps
-        super.init()
-    }
-
-    public func callAsFunction(_ x: MLXArray) -> MLXArray {
-        return MLXFast.rmsNorm(x, weight: 1.0 + self.weight, eps: self.eps)
-    }
-}
-
 private class Attention: Module {
     let args: Gemma2Configuration
     let scale: Float
@@ -125,21 +109,21 @@ private class TransformerBlock: Module {
     @ModuleInfo(key: "self_attn") var attention: Attention
     let mlp: MLP
 
-    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: RMSNorm
-    @ModuleInfo(key: "pre_feedforward_layernorm") var preFeedforwardLayerNorm: RMSNorm
-    @ModuleInfo(key: "post_feedforward_layernorm") var postFeedforwardLayerNorm: RMSNorm
-    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: RMSNorm
+    @ModuleInfo(key: "input_layernorm") var inputLayerNorm: Gemma.RMSNorm
+    @ModuleInfo(key: "pre_feedforward_layernorm") var preFeedforwardLayerNorm: Gemma.RMSNorm
+    @ModuleInfo(key: "post_feedforward_layernorm") var postFeedforwardLayerNorm: Gemma.RMSNorm
+    @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayerNorm: Gemma.RMSNorm
 
     public init(_ args: Gemma2Configuration) {
         self._attention.wrappedValue = Attention(args)
         self.mlp = MLP(dimensions: args.hiddenSize, hiddenDimensions: args.intermediateSize)
-        self._inputLayerNorm.wrappedValue = RMSNorm(
+        self._inputLayerNorm.wrappedValue = Gemma.RMSNorm(
            dimensions: args.hiddenSize, eps: args.rmsNormEps)
-        self._preFeedforwardLayerNorm.wrappedValue = RMSNorm(
+        self._preFeedforwardLayerNorm.wrappedValue = Gemma.RMSNorm(
            dimensions: args.hiddenSize, eps: args.rmsNormEps)
-        self._postFeedforwardLayerNorm.wrappedValue = RMSNorm(
+        self._postFeedforwardLayerNorm.wrappedValue = Gemma.RMSNorm(
            dimensions: args.hiddenSize, eps: args.rmsNormEps)
-        self._postAttentionLayerNorm.wrappedValue = RMSNorm(
+        self._postAttentionLayerNorm.wrappedValue = Gemma.RMSNorm(
            dimensions: args.hiddenSize, eps: args.rmsNormEps)
     }
 
@@ -159,7 +143,7 @@ private class ModelInner: Module {
     @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding
 
     fileprivate let layers: [TransformerBlock]
-    fileprivate let norm: RMSNorm
+    fileprivate let norm: Gemma.RMSNorm
 
     let hiddenScale: Float
 
@@ -175,7 +159,7 @@ private class ModelInner: Module {
             .map { _ in
                 TransformerBlock(args)
             }
-        self.norm = RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
+        self.norm = Gemma.RMSNorm(dimensions: args.hiddenSize, eps: args.rmsNormEps)
     }
 
     public func callAsFunction(_ inputs: MLXArray, cache: [KVCache]? = nil) -> MLXArray {
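The commit message mentions model configurations for all Gemma 3 sizes; other sizes follow the same ModelConfiguration pattern used for gemma3_1B_qat_4bit in LLMModelFactory.swift above. For example, a sketch for a larger variant (the Hub id below is illustrative and not taken from this diff):

let gemma3_4B_4bit = ModelConfiguration(
    // Hypothetical Hub id, shown only to illustrate the pattern.
    id: "mlx-community/gemma-3-4b-it-qat-4bit",
    defaultPrompt: "What is the difference between a fruit and a vegetable?",
    // Gemma chat templates close each turn with <end_of_turn>, so it is
    // registered as an extra EOS token to stop generation cleanly.
    extraEOSTokens: ["<end_of_turn>"]
)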
