Commit 6eb9b03

oelachqar and wizeng23 authored
Add Qwen3 VL 4B config (#1992)
Co-authored-by: William Zeng <10782997+wizeng23@users.noreply.github.com>
1 parent 60f517f · commit 6eb9b03

File tree: 6 files changed, +257 −0 lines changed
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Qwen3-VL

Configs for Qwen3-VL models. See https://huggingface.co/collections/Qwen/qwen3-vl-68d2a7c1b8a8afce4ebd2dbe
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
# Full fine-tune config for Qwen3 VL 4B Instruct.
#
# Requirements:
#   - Log into WandB (`wandb login`) or disable `enable_wandb`
#
# Usage:
#   oumi train -c configs/recipes/vision/qwen3_vl/sft/4b_instruct_full_train.yaml
#
# See Also:
#   - Documentation: https://oumi.ai/docs/en/latest/user_guides/train/train.html
#   - Config class: oumi.core.configs.TrainingConfig
#   - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py
#   - Other training configs: configs/**/*train.yaml

model:
  model_name: "Qwen/Qwen3-VL-4B-Instruct"
  torch_dtype_str: "bfloat16"
  model_max_length: 4096
  trust_remote_code: True
  attn_implementation: "sdpa"
  chat_template: "qwen3-vl-instruct"
  freeze_layers:
    - "visual"

data:
  train:
    collator_name: "vision_language_with_padding"
    use_torchdata: True
    datasets:
      - dataset_name: "merve/vqav2-small"
        split: "validation"
        shuffle: True
        seed: 42
        transform_num_workers: "auto"
        dataset_kwargs:
          processor_name: "Qwen/Qwen3-VL-4B-Instruct"
          return_tensors: True
          # limit: 4096 # Uncomment to limit dataset size!
          # return_conversations: True

      # Below are examples of other vision SFT datasets
      # - dataset_name: "HuggingFaceH4/llava-instruct-mix-vsft"
      #   split: "train"
      #   shuffle: True
      #   seed: 42
      #   transform_num_workers: "auto"
      #   dataset_kwargs:
      #     processor_name: "Qwen/Qwen3-VL-4B-Instruct"
      #     return_tensors: True

training:
  output_dir: "output/vlm_finetuned"
  trainer_type: "TRL_SFT" # or "OUMI"
  enable_gradient_checkpointing: True
  per_device_train_batch_size: 1 # Must be 1: the model generates variable-sized image features.
  gradient_accumulation_steps: 32
  max_steps: 20 # Comment out and use `num_train_epochs` instead for full training.
  # num_train_epochs: 1
  use_peft: False

  gradient_checkpointing_kwargs:
    # Reentrant docs: https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
    use_reentrant: False
  max_grad_norm: 0.5 # For vqav2-small this results in more stable training.
  ddp_find_unused_parameters: False
  empty_device_cache_steps: 1
  compile: False

  optimizer: "adamw_torch_fused"
  learning_rate: 2e-5
  warmup_ratio: 0.03
  weight_decay: 0.0
  lr_scheduler_type: "cosine"

  logging_steps: 5
  save_steps: 0
  dataloader_main_process_only: False
  dataloader_num_workers: 2
  dataloader_prefetch_factor: 8
  include_performance_metrics: True
  log_model_summary: False
  enable_wandb: True
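The same recipe can also be launched from Python rather than the CLI. A minimal sketch, assuming oumi exposes `train()` and `TrainingConfig.from_yaml()` (which the `oumi train` command wraps) and that the config lives at the path shown in the Usage comment above; adjust both to your installed version and checkout.

from oumi import train
from oumi.core.configs import TrainingConfig

# Load the full fine-tune recipe. The path below is the one assumed in the
# Usage comment and may differ in your checkout.
config = TrainingConfig.from_yaml(
    "configs/recipes/vision/qwen3_vl/sft/4b_instruct_full_train.yaml"
)

# Optional overrides before launching, e.g. a longer run without WandB logging.
config.training.max_steps = 100
config.training.enable_wandb = False

train(config)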
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
# LoRA fine-tune config for Qwen3 VL 4B Instruct.
#
# Requirements:
#   - Log into WandB (`wandb login`) or disable `enable_wandb`
#
# Usage:
#   oumi train -c configs/recipes/vision/qwen3_vl/sft/4b_instruct_lora_train.yaml
#
# See Also:
#   - Documentation: https://oumi.ai/docs/en/latest/user_guides/train/train.html
#   - Config class: oumi.core.configs.TrainingConfig
#   - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py
#   - Other training configs: configs/**/*train.yaml

model:
  model_name: "Qwen/Qwen3-VL-4B-Instruct"
  torch_dtype_str: "bfloat16"
  model_max_length: 4096
  trust_remote_code: True
  # TODO: Enable flash attention
  attn_implementation: "sdpa"
  chat_template: "qwen3-vl-instruct"
  freeze_layers:
    - "visual"

data:
  train:
    collator_name: "vision_language_with_padding"
    use_torchdata: True
    datasets:
      - dataset_name: "merve/vqav2-small"
        split: "validation"
        shuffle: True
        seed: 42
        transform_num_workers: "auto"
        dataset_kwargs:
          processor_name: "Qwen/Qwen3-VL-4B-Instruct"
          return_tensors: True
          # limit: 4096 # Uncomment to limit dataset size!
          # return_conversations: True

      # Below are examples of other vision SFT datasets
      # - dataset_name: "HuggingFaceH4/llava-instruct-mix-vsft"
      #   split: "train"
      #   shuffle: True
      #   seed: 42
      #   transform_num_workers: "auto"
      #   dataset_kwargs:
      #     processor_name: "Qwen/Qwen3-VL-4B-Instruct"
      #     return_tensors: True

training:
  output_dir: "output/vlm_finetuned"
  trainer_type: "TRL_SFT" # or "OUMI"
  enable_gradient_checkpointing: True
  per_device_train_batch_size: 1 # Must be 1: the model generates variable-sized image features.
  gradient_accumulation_steps: 32
  max_steps: 20 # Comment out and use `num_train_epochs` instead for full training.
  # num_train_epochs: 1
  use_peft: True

  gradient_checkpointing_kwargs:
    # Reentrant docs: https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
    use_reentrant: False
  max_grad_norm: 0.5 # For vqav2-small this results in more stable training.
  ddp_find_unused_parameters: False
  empty_device_cache_steps: 1
  compile: False

  optimizer: "adamw_torch_fused"
  learning_rate: 2e-5
  warmup_ratio: 0.03
  weight_decay: 0.0
  lr_scheduler_type: "cosine"

  logging_steps: 5
  save_steps: 0
  dataloader_main_process_only: False
  dataloader_num_workers: 2
  dataloader_prefetch_factor: 8
  include_performance_metrics: True
  log_model_summary: False
  enable_wandb: True

peft:
  lora_r: 8
  lora_alpha: 16
  lora_dropout: 0.05
  lora_target_modules:
    - "q_proj"
    - "v_proj"
    - "o_proj"
    - "k_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

docs/resources/models/models.md

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ Available templates include:
 - `llava` - For LLaVA multimodal models
 - `phi3-instruct` - For Phi-3 instruction models
 - `qwen2-vl-instruct` - For Qwen2-VL instruction models
+- `qwen3-vl-instruct` - For Qwen3-VL instruction models
 - `zephyr` - For Zephyr models

 All the templates expect a `messages` list, where each message is a dictionary with `role` and `content` keys in {doc}`oumi format </resources/datasets/data_formats>`.

src/oumi/core/configs/internal/supported_models.py

Lines changed: 38 additions & 0 deletions
@@ -292,6 +292,38 @@ def _create_qwen2_5_vl_vlm_config() -> InternalModelConfig:
     return config


+def _create_qwen3_vl_vlm_config() -> InternalModelConfig:
+    config = _create_default_vlm_config(
+        pixel_values_variable_shape=True,
+        supports_multiple_images=True,
+    )
+    config.chat_template = "qwen3-vl-instruct"
+    # FIXME OPE-946 Consider updating to "right":
+    # config.padding_side = InternalPaddingSide.PAD_RIGHT
+    config.model_input_features.update(
+        {
+            feature_name: InternalFeatureSpec(
+                name=feature_name,
+                required=True,
+                variable_shape=False,
+                image_dependent=True,
+            )
+            for feature_name in ("image_grid_thw",)
+        }
+    )
+    config.processor_kwargs.update(
+        # Defaults per Qwen3-VL:
+        # https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
+        {
+            "min_pixels": 4 * 28 * 28,
+            "max_pixels": 16384 * 28 * 28,
+            "patch_size": 16,
+        }
+    )
+
+    return config
+
+
 def _create_phi3_vlm_config() -> InternalModelConfig:
     config = _create_default_vlm_config(
         pixel_values_variable_shape=True,
@@ -531,6 +563,12 @@ def get_all_models_map() -> Mapping[
         tested=True,
         config=_create_qwen2_5_vl_vlm_config(),
     ),
+    _ModelTypeInfo(
+        model_type="qwen3_vl",
+        model_class=default_vlm_class,
+        tested=True,
+        config=_create_qwen3_vl_vlm_config(),
+    ),
     _ModelTypeInfo(
         model_type="vipllava",
         model_class=default_vlm_class,
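The `processor_kwargs` registered here mirror the pixel budget used by qwen-vl-utils. A small sketch of what they mean in practice, assuming the Qwen3-VL processor behaves like Qwen2/2.5-VL (min/max pixels bound the resized image area, and an `image_grid_thw` tensor is returned alongside `pixel_values`):

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct",
    min_pixels=4 * 28 * 28,
    max_pixels=16384 * 28 * 28,
)

image = Image.new("RGB", (1024, 768))  # stand-in for a real image
text = "<|vision_start|><|image_pad|><|vision_end|>Describe the image."
inputs = processor(text=[text], images=[image], return_tensors="pt")

# `image_grid_thw` carries the (temporal, height, width) patch grid per image,
# which is why it is registered above as a required, image-dependent feature.
print(inputs["pixel_values"].shape, inputs["image_grid_thw"])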
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
{% set image_count = namespace(value=0) %}
{% set video_count = namespace(value=0) %}

{%- for message in messages -%}
    {%- if loop.first and message['role'] != 'system' -%}
        {{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
    {%- endif -%}

    {{ '<|im_start|>' + message['role'] + '\n' }}

    {%- if message['content'] is string -%}
        {{- message['content'] -}}
    {%- elif message['content'] is iterable -%}
        {%- for item in message['content'] -%}
            {%- if item['type'].startswith('image') -%}
                {%- set image_count.value = image_count.value + 1 -%}
                {%- if add_vision_id -%}
                    {{ 'Picture ' ~ image_count.value ~ ': ' }}
                {%- endif -%}
                {{ '<|vision_start|><|image_pad|><|vision_end|>' }}
            {%- elif item['type'].startswith('video') -%}
                {%- set video_count.value = video_count.value + 1 -%}
                {%- if add_vision_id -%}
                    {{ 'Video ' ~ video_count.value ~ ': ' }}
                {%- endif -%}
                {{ '<|vision_start|><|video_pad|><|vision_end|>' }}
            {%- elif item['type'] == 'text' -%}
                {{- item['text'] if 'text' in item else item['content'] -}}
            {%- endif -%}
        {%- endfor -%}
    {%- endif -%}
    {{ '<|im_end|>\n' }}
{%- endfor -%}

{%- if add_generation_prompt -%}
    {{- '<|im_start|>assistant\n' -}}
{%- endif -%}
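A quick way to see what this chat template produces is to render it directly with Jinja2 against an oumi-style `messages` list. A sketch; the template path below is an assumption, since the diff does not show the file name:

from jinja2 import Environment

# Hypothetical path; point this at the template file in your checkout.
with open("src/oumi/datasets/chat_templates/qwen3-vl-instruct.jinja") as f:
    template = Environment().from_string(f.read())

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": "https://example.com/cat.png"},
            {"type": "text", "text": "What is in this image?"},
        ],
    }
]

rendered = template.render(
    messages=messages, add_generation_prompt=True, add_vision_id=False
)
print(rendered)
# Expected shape of the output: a default system turn, then the user turn with
# <|vision_start|><|image_pad|><|vision_end|> in place of the image, followed
# by an opening <|im_start|>assistant tag.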
