@@ -68,7 +68,7 @@ public struct AudioConfig: Codable, Sendable, MultimodalConfig {
        sscpConvEps: Float = 1e-3,
        rmsNormEps: Float = 1e-6,
        gradientClipping: Float = 10000000000.0,
-        vocabOffset: Int = 262272
+        vocabOffset: Int = 262272  // 262_144 + 128 (text vocab size + vision vocab size)
    ) {
        self.inputFeatSize = inputFeatSize
        self.hiddenSize = hiddenSize
@@ -1580,6 +1580,18 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider {

        var inputsEmbeds = languageModel.model.embedTokens(inputIds)

+        // Ensure no gaps between text, vision, and audio embeddings, in that order
+        // This matches the Python assertion
+        assert(
+            embedAudio.vocabOffset == config.vocabSize - config.audioConfig.vocabSize,
+            "Audio vocab offset mismatch"
+        )
+        assert(
+            embedVision.vocabOffset == config.vocabSize - config.audioConfig.vocabSize
+                - config.visionConfig.vocabSize,
+            "Vision vocab offset mismatch"
+        )
+
        // Handle vision tokens
        if pixelValues != nil {
            let visionMask = logicalAnd(
@@ -1701,12 +1713,12 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider {
        if let inputIds = inputIds {
            specialModalityMask = expandedDimensions(inputIds .== tokenId, axis: -1)
        } else {
-            let tokenEmbedding =
-                if modality == "audio" {
-                    embedAudio(MLXArray([tokenId]))
-                } else {
-                    languageModel.model.embedTokens(MLXArray([tokenId]))
-                }
+            // When inputIds is nil, create mask by comparing embeddings
+            let embedFn: (MLXArray) -> MLXArray =
+                modality == "audio"
+                ? { self.embedAudio($0, inputsEmbeds: nil) }
+                : { self.languageModel.model.embedTokens($0) }
+            let tokenEmbedding = embedFn(MLXArray([tokenId]))
            specialModalityMask = inputsEmbeds .== tokenEmbedding
        }

@@ -1718,8 +1730,8 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider {
        guard modalityTokensInText == featureTokens else {
            fatalError(
                """
-                Number of \(modality)s does not match number of special \(modality) tokens in the input text.
-                Got \(modalityTokensInText) \(modality) tokens in the text and \(featureTokens) tokens from \(modality) embeddings.
+                Number of \(modality)s does not match number of special \(modality) tokens in the input text.
+                Got \(modalityTokensInText) \(modality) tokens in the text and \(featureTokens) tokens from \(modality) embeddings.
                """)
        }

@@ -3828,3 +3840,210 @@ extension Gemma3n: LoRAModel {
        }
    }
}
+
+// MARK: - VLM Factory Configuration and Processor
+
+public struct Gemma3nConfiguration: Codable, Sendable {
+    public let textConfig: TextConfig
+    public let visionConfig: VisionConfig
+    public let audioConfig: AudioConfig
+    public let modelType: String
+    public let vocabSize: Int
+    public let ignoreIndex: Int
+    public let imageTokenIndex: Int
+    public let audioTokenId: Int
+    public let imageTokenId: Int
+    public let hiddenSize: Int
+    public let padTokenId: Int
+    public let visionSoftTokensPerImage: Int
+    public let audioSoftTokensPerImage: Int
+    public let eosTokenId: [Int]?
+    public let quantization: QuantizationConfig?
+
+    public var vocabularySize: Int { vocabSize }
+
+    enum CodingKeys: String, CodingKey {
+        case textConfig = "text_config"
+        case visionConfig = "vision_config"
+        case audioConfig = "audio_config"
+        case modelType = "model_type"
+        case vocabSize = "vocab_size"
+        case ignoreIndex = "ignore_index"
+        case imageTokenIndex = "image_token_index"
+        case audioTokenId = "audio_token_id"
+        case imageTokenId = "image_token_id"
+        case hiddenSize = "hidden_size"
+        case padTokenId = "pad_token_id"
+        case visionSoftTokensPerImage = "vision_soft_tokens_per_image"
+        case audioSoftTokensPerImage = "audio_soft_tokens_per_image"
+        case eosTokenId = "eos_token_id"
+        case quantization
+    }
+
+    public init(from modelConfig: ModelConfig, quantization: QuantizationConfig? = nil) {
+        self.textConfig = modelConfig.textConfig
+        self.visionConfig = modelConfig.visionConfig
+        self.audioConfig = modelConfig.audioConfig
+        self.modelType = modelConfig.modelType
+        self.vocabSize = modelConfig.vocabSize
+        self.ignoreIndex = modelConfig.ignoreIndex
+        self.imageTokenIndex = modelConfig.imageTokenIndex
+        self.audioTokenId = modelConfig.audioTokenId
+        self.imageTokenId = modelConfig.imageTokenId
+        self.hiddenSize = modelConfig.hiddenSize
+        self.padTokenId = modelConfig.padTokenId
+        self.visionSoftTokensPerImage = modelConfig.visionSoftTokensPerImage
+        self.audioSoftTokensPerImage = modelConfig.audioSoftTokensPerImage
+        self.eosTokenId = modelConfig.eosTokenId
+        self.quantization = quantization
+    }
+}
+
+public class Gemma3nProcessor: UserInputProcessor {
+    private let config: Gemma3nProcessorConfiguration
+    private let tokenizer: any Tokenizer
+
+    public init(_ config: Gemma3nProcessorConfiguration, tokenizer: any Tokenizer) {
+        self.config = config
+        self.tokenizer = tokenizer
+    }
+
+    public func preprocess(images: [CIImage], processing: UserInput.Processing?) throws -> (
+        MLXArray, THW
+    ) {
+        var userProcessing = processing ?? UserInput.Processing()
+        let targetSize = CGSize(width: config.imageSize, height: config.imageSize)
+        userProcessing.resize = targetSize
+
+        let processedImages = try images.map { image in
+            let processedImage = MediaProcessing.apply(image, processing: userProcessing)
+            let srgbImage = MediaProcessing.inSRGBToneCurveSpace(processedImage)
+            let resizedImage = try MediaProcessing.resampleBicubic(srgbImage, to: targetSize)
+            let normalizedImage = MediaProcessing.normalize(
+                resizedImage, mean: config.imageMeanTuple, std: config.imageStdTuple)
+            return MediaProcessing.asMLXArray(normalizedImage)
+        }
+
+        let pixelValues = concatenated(processedImages)
+        return (pixelValues, THW(images.count, config.imageSize, config.imageSize))
+    }
+
+    public func prepare(input: UserInput) async throws -> LMInput {
+        // Create structured messages for Gemma3n using LIST_WITH_IMAGE_TYPE_TEXT format
+        var messages: [[String: Any]] = []
+
+        if !input.images.isEmpty {
+            // Add image and text content in the format expected by Gemma3n
+            let content: [[String: Any]] = [
+                ["type": "image"],
+                ["type": "text", "text": input.prompt.description],
+            ]
+            messages.append(["role": "user", "content": content])
+        } else {
+            // Text-only message
+            messages.append(["role": "user", "content": input.prompt.description])
+        }
+
+        var promptTokens = try tokenizer.applyChatTemplate(messages: messages)
+
+        // Process images if any
+        var processedImage: LMInput.ProcessedImage?
+
+        if !input.images.isEmpty {
+            let imagePixelsAndFrames = try input.images.map {
+                try preprocess(images: [$0.asCIImage()], processing: input.processing)
+            }
+            let imagePixelsConcatenated = concatenated(imagePixelsAndFrames.map { $0.0 })
+            processedImage = LMInput.ProcessedImage(
+                pixels: imagePixelsConcatenated,
+                frames: imagePixelsAndFrames.map { $0.1 }
+            )
+
+            // Note: Unlike Gemma3, Gemma3n doesn't expand image tokens in the processor
+            // The model handles token mapping directly in get_input_embeddings
+        }
+
+        let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
+        let mask = ones(like: promptArray).asType(.int8)
+        return LMInput(
+            text: .init(tokens: promptArray, mask: mask),
+            image: processedImage
+        )
+    }
+}
+
+public struct Gemma3nProcessorConfiguration: Codable, Sendable {
+    public let processorClass: String
+    public let imageProcessorType: String?
+    public let doNormalize: Bool
+    public let doRescale: Bool
+    public let doResize: Bool
+    public let imageMean: [CGFloat]
+    public let imageStd: [CGFloat]
+    public let visionSoftTokensPerImage: Int
+    public let resample: Int
+    public let rescaleFactor: Float
+    public let size: ImageSize
+
+    // Optional fields that may be present in some configs
+    public let doConvertRgb: Bool?
+    public let doPanAndScan: Bool?
+
+    // Token identifiers - use default values that match the Python implementation
+    public var imageTokenId: Int { 262145 }  // From Python: image_token_id = 262145
+    public var audioTokenId: Int { 262273 }  // From Python: audio_token_id = 262273
+
+    public struct ImageSize: Codable, Sendable {
+        public let height: Int
+        public let width: Int
+    }
+
+    // Computed properties for convenience
+    public var imageSize: Int { size.height }
+
+    public var imageMeanTuple: (CGFloat, CGFloat, CGFloat) {
+        (imageMean[0], imageMean[1], imageMean[2])
+    }
+
+    public var imageStdTuple: (CGFloat, CGFloat, CGFloat) {
+        (imageStd[0], imageStd[1], imageStd[2])
+    }
+
+    enum CodingKeys: String, CodingKey {
+        case processorClass = "processor_class"
+        case imageProcessorType = "image_processor_type"
+        case doNormalize = "do_normalize"
+        case doRescale = "do_rescale"
+        case doResize = "do_resize"
+        case doConvertRgb = "do_convert_rgb"
+        case doPanAndScan = "do_pan_and_scan"
+        case imageMean = "image_mean"
+        case imageStd = "image_std"
+        case visionSoftTokensPerImage = "vision_soft_tokens_per_image"
+        case resample
+        case rescaleFactor = "rescale_factor"
+        case size
+    }
+}
+
+extension Gemma3n {
+    public convenience init(_ config: Gemma3nConfiguration) {
+        let modelConfig = ModelConfig(
+            textConfig: config.textConfig,
+            visionConfig: config.visionConfig,
+            audioConfig: config.audioConfig,
+            modelType: config.modelType,
+            vocabSize: config.vocabSize,
+            ignoreIndex: config.ignoreIndex,
+            imageTokenIndex: config.imageTokenIndex,
+            audioTokenId: config.audioTokenId,
+            imageTokenId: config.imageTokenId,
+            hiddenSize: config.hiddenSize,
+            padTokenId: config.padTokenId,
+            visionSoftTokensPerImage: config.visionSoftTokensPerImage,
+            audioSoftTokensPerImage: config.audioSoftTokensPerImage,
+            eosTokenId: config.eosTokenId
+        )
+        self.init(modelConfig)
+    }
+}
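
A note on the arithmetic behind the new `vocabOffset` default and the assertions added in the embedding hunk above: the embedding table is laid out as text ids, then vision ids, then audio ids, with no gaps. The standalone Swift sketch below illustrates that layout. The per-modality sizes (262_144 text tokens, 128-entry vision and audio vocabularies, 262_400 total) are assumptions for illustration, taken from the `262_144 + 128` comment and the hard-coded token ids in this diff rather than from any config struct shown here.

// Standalone sketch of the embedding-table layout the assertions check.
// The sizes below are illustrative assumptions; the real values come from
// text_config / vision_config / audio_config.
let textVocabSize = 262_144
let visionVocabSize = 128
let audioVocabSize = 128
let totalVocabSize = textVocabSize + visionVocabSize + audioVocabSize  // 262_400

// Vision rows start right after the text rows, audio rows right after vision.
let visionVocabOffset = totalVocabSize - audioVocabSize - visionVocabSize  // 262_144
let audioVocabOffset = totalVocabSize - audioVocabSize  // 262_272

// Same conditions as the asserts in the diff: no gaps between the three ranges.
assert(visionVocabOffset == textVocabSize)
assert(audioVocabOffset == textVocabSize + visionVocabSize)
print(visionVocabOffset, audioVocabOffset)  // 262144 262272

With this layout, the processor's hard-coded placeholder ids (262145 for images, 262273 for audio) each land one slot inside the vision and audio ranges respectively.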
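And a rough sketch of how the new `Gemma3nProcessorConfiguration` is meant to be populated: it is plain `Codable` with explicit snake_case `CodingKeys`, so a `preprocessor_config.json` decodes directly with `JSONDecoder`. The sketch assumes the type defined above is in scope; the concrete values (768×768 size, resample mode, mean/std, rescale factor, soft token count) are illustrative placeholders, not values taken from this PR.

import Foundation

// Illustrative preprocessor_config.json fragment; key names follow the
// CodingKeys on Gemma3nProcessorConfiguration above, values are placeholders.
let json = """
    {
      "processor_class": "Gemma3nProcessor",
      "image_processor_type": "Gemma3nImageProcessor",
      "do_normalize": false,
      "do_rescale": true,
      "do_resize": true,
      "image_mean": [0.5, 0.5, 0.5],
      "image_std": [0.5, 0.5, 0.5],
      "vision_soft_tokens_per_image": 256,
      "resample": 2,
      "rescale_factor": 0.00392156862745098,
      "size": { "height": 768, "width": 768 }
    }
    """

do {
    let config = try JSONDecoder().decode(
        Gemma3nProcessorConfiguration.self, from: Data(json.utf8))
    print(config.imageSize)  // 768, derived from size.height
    print(config.imageMeanTuple)  // (0.5, 0.5, 0.5)
    print(config.imageTokenId)  // 262145, hard-coded to match the Python processor
} catch {
    print("Failed to decode processor configuration: \(error)")
}

Because `imageTokenId` and `audioTokenId` are computed properties rather than decoded fields, they stay fixed at the Python defaults regardless of what the JSON contains.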