@@ -131,15 +131,14 @@ private enum Language {
131
131
}
132
132
133
133
fileprivate class MLP : Module , UnaryLayer {
134
-
135
134
@ModuleInfo ( key: " gate_proj " ) var gate : Linear
136
- @ModuleInfo ( key: " down_proj " ) var down : Linear
137
135
@ModuleInfo ( key: " up_proj " ) var up : Linear
136
+ @ModuleInfo ( key: " down_proj " ) var down : Linear
138
137
139
138
/// Creates the three projection layers of the MLP.
///
/// `gate` and `up` are built as `Linear(dimensions, hiddenDimensions)`;
/// `down` is built as `Linear(hiddenDimensions, dimensions)`.
///
/// - Parameters:
///   - dimensions: first argument of `gate`/`up`, second argument of `down`.
///   - hiddenDimensions: second argument of `gate`/`up`, first argument of `down`.
// NOTE(review): the removed (`-`) lines passed `bias : false` explicitly; the added (`+`)
// lines omit the argument and therefore rely on `Linear`'s default — confirm that the
// default matches the checkpoint weights being loaded.
public init ( dimensions: Int , hiddenDimensions: Int ) {
140
- self . _gate. wrappedValue = Linear ( dimensions, hiddenDimensions, bias : false )
141
- self . _down . wrappedValue = Linear ( hiddenDimensions , dimensions, bias : false )
142
- self . _up . wrappedValue = Linear ( dimensions , hiddenDimensions, bias : false )
139
+ self . _gate. wrappedValue = Linear ( dimensions, hiddenDimensions)
140
+ self . _up . wrappedValue = Linear ( dimensions, hiddenDimensions )
141
+ self . _down . wrappedValue = Linear ( hiddenDimensions, dimensions )
143
142
}
144
143
145
144
public func callAsFunction( _ x: MLXArray ) -> MLXArray {
@@ -1069,14 +1068,18 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
1069
1068
public let vocabularySize : Int
1070
1069
public let kvHeads : Int
1071
1070
private let _maxPositionEmbeddings : Int ?
1072
- public var maxpPositionEmbeddings : Int { _maxPositionEmbeddings ?? 128000 }
1071
+ public var maxPositionEmbeddings : Int { _maxPositionEmbeddings ?? 128000 }
1073
1072
private let _ropeTheta : Float ?
1074
1073
public var ropeTheta : Float { _ropeTheta ?? 1_000_000 }
1075
1074
private let _ropeTraditional : Bool ?
1076
1075
public var ropeTraditional : Bool { _ropeTraditional ?? false }
1077
1076
public let ropeScaling : [ String : StringOrNumber ] ?
1078
1077
private let _tieWordEmbeddings : Bool ?
1079
1078
public var tieWordEmbeddings : Bool { _tieWordEmbeddings ?? true }
1079
+ private let _slidingWindow : Int ?
1080
+ public var slidingWindow : Int { _slidingWindow ?? 32768 }
1081
+ private let _useSlidingWindow : Bool ?
1082
+ public var useSlidingWindow : Bool { _useSlidingWindow ?? false }
1080
1083
1081
1084
enum CodingKeys : String , CodingKey {
1082
1085
case modelType = " model_type "
@@ -1092,6 +1095,8 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
1092
1095
case _ropeTraditional = " rope_traditional "
1093
1096
case ropeScaling = " rope_scaling "
1094
1097
case _tieWordEmbeddings = " tie_word_embeddings "
1098
+ case _slidingWindow = " sliding_window "
1099
+ case _useSlidingWindow = " use_sliding_window "
1095
1100
}
1096
1101
}
1097
1102
@@ -1102,16 +1107,20 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
1102
1107
public let outHiddenSize : Int
1103
1108
public let numHeads : Int
1104
1109
public let patchSize : Int
1105
- public let mlpRatio : Float
1106
- public let _inChannels : Int ?
1107
- public var inChannels : Int { _inChannels ?? 3 }
1108
- public let _layerNormEps : Float ?
1110
+ private let _inChans : Int ?
1111
+ public var inChannels : Int { _inChans ?? 3 }
1112
+ private let _layerNormEps : Float ?
1109
1113
public var layerNormEps : Float { _layerNormEps ?? 1e-6 }
1110
1114
public let spatialPatchSize : Int
1111
1115
public let spatialMergeSize : Int
1112
1116
public let temporalPatchSize : Int
1113
1117
public let windowSize : Int
1114
1118
public let fullattBlockIndexes : [ Int ]
1119
+ public let tokensPerSecond : Int
1120
+ private let _skipVision : Bool ?
1121
+ public var skipVision : Bool { _skipVision ?? false }
1122
+ private let _hiddenAct : String ?
1123
+ public var hiddenAct : String { _hiddenAct ?? " silu " }
1115
1124
1116
1125
enum CodingKeys : String , CodingKey {
1117
1126
case depth
@@ -1120,14 +1129,16 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
1120
1129
case outHiddenSize = " out_hidden_size "
1121
1130
case numHeads = " num_heads "
1122
1131
case patchSize = " patch_size "
1123
- case mlpRatio = " mlp_ratio "
1124
- case _inChannels = " in_channels "
1125
- case _layerNormEps = " layer_norm_eps "
1132
+ case _inChans = " in_chans "
1133
+ case _layerNormEps = " layer_norm_eps " // optional key; `layerNormEps` falls back to 1e-6 when it is absent
1126
1134
case spatialPatchSize = " spatial_patch_size "
1127
1135
case spatialMergeSize = " spatial_merge_size "
1128
1136
case temporalPatchSize = " temporal_patch_size "
1129
1137
case windowSize = " window_size "
1130
1138
case fullattBlockIndexes = " fullatt_block_indexes "
1139
+ case tokensPerSecond = " tokens_per_second "
1140
+ case _skipVision = " skip_vision "
1141
+ case _hiddenAct = " hidden_act "
1131
1142
}
1132
1143
}
1133
1144
@@ -1140,6 +1151,13 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
1140
1151
public let visionEndTokenId : Int
1141
1152
public let visionTokenId : Int
1142
1153
public let hiddenSize : Int
1154
+ public let numAttentionHeads : Int
1155
+ public let numHiddenLayers : Int
1156
+ public let intermediateSize : Int
1157
+ public let numKeyValueHeads : Int
1158
+ public let slidingWindow : Int
1159
+ public let useSlidingWindow : Bool
1160
+ public let maxWindowLayers : Int
1143
1161
1144
1162
enum CodingKeys : String , CodingKey {
1145
1163
case modelType = " model_type "
@@ -1150,6 +1168,13 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
1150
1168
case visionEndTokenId = " vision_end_token_id "
1151
1169
case visionTokenId = " vision_token_id "
1152
1170
case hiddenSize = " hidden_size "
1171
+ case numAttentionHeads = " num_attention_heads "
1172
+ case numHiddenLayers = " num_hidden_layers "
1173
+ case intermediateSize = " intermediate_size "
1174
+ case numKeyValueHeads = " num_key_value_heads "
1175
+ case slidingWindow = " sliding_window "
1176
+ case useSlidingWindow = " use_sliding_window "
1177
+ case maxWindowLayers = " max_window_layers "
1153
1178
}
1154
1179
}
1155
1180
@@ -1176,7 +1201,6 @@ public struct Qwen25VLConfiguration: Codable, Sendable {
1176
1201
1177
1202
/// Configuration for ``Qwen25VLProcessor``
1178
1203
public struct Qwen25VLProcessorConfiguration: Codable, Sendable {
1179
-
1180
1204
public struct Size: Codable , Sendable {
1181
1205
public let maxPixels: Int
1182
1206
public let minPixels : Int
@@ -1189,10 +1213,12 @@ public struct Qwen25VLProcessorConfiguration: Codable, Sendable {
1189
1213
1190
1214
public let imageMean : [ CGFloat ]
1191
1215
public let imageStd : [ CGFloat ]
1192
- public let size : Size
1216
+ public let minPixels : Int
1217
+ public let maxPixels : Int
1193
1218
public let mergeSize : Int
1194
1219
public let patchSize : Int
1195
1220
public let temporalPatchSize : Int
1221
+ public let imageProcessorType : String
1196
1222
1197
1223
public var imageMeanTuple : ( CGFloat , CGFloat , CGFloat ) {
1198
1224
( imageMean [ 0 ] , imageMean [ 1 ] , imageMean [ 2 ] )
@@ -1201,12 +1227,18 @@ public struct Qwen25VLProcessorConfiguration: Codable, Sendable {
1201
1227
( imageStd [ 0 ] , imageStd [ 1 ] , imageStd [ 2 ] )
1202
1228
}
1203
1229
1230
/// A `Size` value assembled on each access from this configuration's
/// `maxPixels` and `minPixels` properties.
// NOTE(review): this is a computed property, so it is not a stored/decoded field;
// the removed `size` stored property is replaced by the two pixel-bound properties.
+ public var size : Size {
1231
+ Size ( maxPixels: maxPixels, minPixels: minPixels)
1232
+ }
1233
+
1204
1234
/// Coding keys pairing each stored property of the processor configuration
/// with the key name used in its serialized form.
// NOTE(review): the `size` case is removed in this change; the pixel bounds are now keyed
// directly as `min_pixels` / `max_pixels`, and `image_processor_type` is newly added.
enum CodingKeys : String , CodingKey {
1205
1235
case imageMean = " image_mean "
1206
1236
case imageStd = " image_std "
1207
- case size
1237
+ case minPixels = " min_pixels "
1238
+ case maxPixels = " max_pixels "
1208
1239
case mergeSize = " merge_size "
1209
1240
case patchSize = " patch_size "
1210
1241
case temporalPatchSize = " temporal_patch_size "
1242
+ case imageProcessorType = " image_processor_type "
1211
1243
}
1212
1244
}
0 commit comments