Skip to content

Commit d3b8740

Browse files
committed
Iterating
1 parent cb10fc0 commit d3b8740

File tree

2 files changed: 49 additions and 20 deletions

2 files changed

+49
-20
lines changed

Libraries/MLXVLM/Models/Qwen25VL.swift

Lines changed: 48 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -131,15 +131,14 @@ private enum Language {
131131
}
132132

133133
fileprivate class MLP: Module, UnaryLayer {
134-
135134
@ModuleInfo(key: "gate_proj") var gate: Linear
136-
@ModuleInfo(key: "down_proj") var down: Linear
137135
@ModuleInfo(key: "up_proj") var up: Linear
136+
@ModuleInfo(key: "down_proj") var down: Linear
138137

139138
public init(dimensions: Int, hiddenDimensions: Int) {
140-
self._gate.wrappedValue = Linear(dimensions, hiddenDimensions, bias: false)
141-
self._down.wrappedValue = Linear(hiddenDimensions, dimensions, bias: false)
142-
self._up.wrappedValue = Linear(dimensions, hiddenDimensions, bias: false)
139+
self._gate.wrappedValue = Linear(dimensions, hiddenDimensions)
140+
self._up.wrappedValue = Linear(dimensions, hiddenDimensions)
141+
self._down.wrappedValue = Linear(hiddenDimensions, dimensions)
143142
}
144143

145144
public func callAsFunction(_ x: MLXArray) -> MLXArray {
@@ -1069,14 +1068,18 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
10691068
public let vocabularySize: Int
10701069
public let kvHeads: Int
10711070
private let _maxPositionEmbeddings: Int?
1072-
public var maxpPositionEmbeddings: Int { _maxPositionEmbeddings ?? 128000 }
1071+
public var maxPositionEmbeddings: Int { _maxPositionEmbeddings ?? 128000 }
10731072
private let _ropeTheta: Float?
10741073
public var ropeTheta: Float { _ropeTheta ?? 1_000_000 }
10751074
private let _ropeTraditional: Bool?
10761075
public var ropeTraditional: Bool { _ropeTraditional ?? false }
10771076
public let ropeScaling: [String: StringOrNumber]?
10781077
private let _tieWordEmbeddings: Bool?
10791078
public var tieWordEmbeddings: Bool { _tieWordEmbeddings ?? true }
1079+
private let _slidingWindow: Int?
1080+
public var slidingWindow: Int { _slidingWindow ?? 32768 }
1081+
private let _useSlidingWindow: Bool?
1082+
public var useSlidingWindow: Bool { _useSlidingWindow ?? false }
10801083

10811084
enum CodingKeys: String, CodingKey {
10821085
case modelType = "model_type"
@@ -1092,6 +1095,8 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
10921095
case _ropeTraditional = "rope_traditional"
10931096
case ropeScaling = "rope_scaling"
10941097
case _tieWordEmbeddings = "tie_word_embeddings"
1098+
case _slidingWindow = "sliding_window"
1099+
case _useSlidingWindow = "use_sliding_window"
10951100
}
10961101
}
10971102

@@ -1102,16 +1107,20 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
11021107
public let outHiddenSize: Int
11031108
public let numHeads: Int
11041109
public let patchSize: Int
1105-
public let mlpRatio: Float
1106-
public let _inChannels: Int?
1107-
public var inChannels: Int { _inChannels ?? 3 }
1108-
public let _layerNormEps: Float?
1110+
private let _inChans: Int?
1111+
public var inChannels: Int { _inChans ?? 3 }
1112+
private let _layerNormEps: Float?
11091113
public var layerNormEps: Float { _layerNormEps ?? 1e-6 }
11101114
public let spatialPatchSize: Int
11111115
public let spatialMergeSize: Int
11121116
public let temporalPatchSize: Int
11131117
public let windowSize: Int
11141118
public let fullattBlockIndexes: [Int]
1119+
public let tokensPerSecond: Int
1120+
private let _skipVision: Bool?
1121+
public var skipVision: Bool { _skipVision ?? false }
1122+
private let _hiddenAct: String?
1123+
public var hiddenAct: String { _hiddenAct ?? "silu" }
11151124

11161125
enum CodingKeys: String, CodingKey {
11171126
case depth
@@ -1120,14 +1129,16 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
11201129
case outHiddenSize = "out_hidden_size"
11211130
case numHeads = "num_heads"
11221131
case patchSize = "patch_size"
1123-
case mlpRatio = "mlp_ratio"
1124-
case _inChannels = "in_channels"
1125-
case _layerNormEps = "layer_norm_eps"
1132+
case _inChans = "in_chans"
1133+
case _layerNormEps = "layer_norm_eps" // Added this line
11261134
case spatialPatchSize = "spatial_patch_size"
11271135
case spatialMergeSize = "spatial_merge_size"
11281136
case temporalPatchSize = "temporal_patch_size"
11291137
case windowSize = "window_size"
11301138
case fullattBlockIndexes = "fullatt_block_indexes"
1139+
case tokensPerSecond = "tokens_per_second"
1140+
case _skipVision = "skip_vision"
1141+
case _hiddenAct = "hidden_act"
11311142
}
11321143
}
11331144

@@ -1140,6 +1151,13 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
11401151
public let visionEndTokenId: Int
11411152
public let visionTokenId: Int
11421153
public let hiddenSize: Int
1154+
public let numAttentionHeads: Int
1155+
public let numHiddenLayers: Int
1156+
public let intermediateSize: Int
1157+
public let numKeyValueHeads: Int
1158+
public let slidingWindow: Int
1159+
public let useSlidingWindow: Bool
1160+
public let maxWindowLayers: Int
11431161

11441162
enum CodingKeys: String, CodingKey {
11451163
case modelType = "model_type"
@@ -1150,6 +1168,13 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
11501168
case visionEndTokenId = "vision_end_token_id"
11511169
case visionTokenId = "vision_token_id"
11521170
case hiddenSize = "hidden_size"
1171+
case numAttentionHeads = "num_attention_heads"
1172+
case numHiddenLayers = "num_hidden_layers"
1173+
case intermediateSize = "intermediate_size"
1174+
case numKeyValueHeads = "num_key_value_heads"
1175+
case slidingWindow = "sliding_window"
1176+
case useSlidingWindow = "use_sliding_window"
1177+
case maxWindowLayers = "max_window_layers"
11531178
}
11541179
}
11551180

@@ -1176,7 +1201,6 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
11761201

11771202
/// Configuration for ``Qwen25VLProcessor``
11781203
public struct Qwen25VLProcessorConfiguration: Codable, Sendable {
1179-
11801204
public struct Size: Codable, Sendable {
11811205
public let maxPixels: Int
11821206
public let minPixels: Int
@@ -1189,10 +1213,12 @@ public struct Qwen25VLProcessorConfiguration: Codable, Sendable {
11891213

11901214
public let imageMean: [CGFloat]
11911215
public let imageStd: [CGFloat]
1192-
public let size: Size
1216+
public let minPixels: Int
1217+
public let maxPixels: Int
11931218
public let mergeSize: Int
11941219
public let patchSize: Int
11951220
public let temporalPatchSize: Int
1221+
public let imageProcessorType: String
11961222

11971223
public var imageMeanTuple: (CGFloat, CGFloat, CGFloat) {
11981224
(imageMean[0], imageMean[1], imageMean[2])
@@ -1201,12 +1227,18 @@ public struct Qwen25VLProcessorConfiguration: Codable, Sendable {
12011227
(imageStd[0], imageStd[1], imageStd[2])
12021228
}
12031229

1230+
public var size: Size {
1231+
Size(maxPixels: maxPixels, minPixels: minPixels)
1232+
}
1233+
12041234
enum CodingKeys: String, CodingKey {
12051235
case imageMean = "image_mean"
12061236
case imageStd = "image_std"
1207-
case size
1237+
case minPixels = "min_pixels"
1238+
case maxPixels = "max_pixels"
12081239
case mergeSize = "merge_size"
12091240
case patchSize = "patch_size"
12101241
case temporalPatchSize = "temporal_patch_size"
1242+
case imageProcessorType = "image_processor_type"
12111243
}
12121244
}

Libraries/MLXVLM/VLMModelFactory.swift

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -101,10 +101,7 @@ public class ProcessorTypeRegistry: @unchecked Sendable {
101101
PaliGemmaProcessorConfiguration.self, PaliGemmaProcessor.init),
102102
"Qwen2VLProcessor": create(
103103
Qwen2VLProcessorConfiguration.self, Qwen2VLProcessor.init),
104-
// Error: Generic parameter 'C' could not be inferred
105-
// Error: Cannot find 'Qwen25VLProcessor' in scope
106-
// Error: Cannot find 'Qwen25VLProcessorConfiguration' in scope
107-
"Qwen25VLProcessor": create(
104+
"Qwen2_5_VLProcessor": create(
108105
Qwen25VLProcessorConfiguration.self, Qwen25VLProcessor.init),
109106
"Idefics3Processor": create(
110107
Idefics3ProcessorConfiguration.self, Idefics3Processor.init),

0 commit comments

Comments
 (0)