Commit 6eb9b03

oelachqar and wizeng23 authored
Add Qwen3 VL 4B config (#1992)
Co-authored-by: William Zeng <10782997+wizeng23@users.noreply.github.com>
1 parent 60f517f · commit 6eb9b03

File tree: 6 files changed, +257 −0 lines changed
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Qwen3-VL

Configs for Qwen3-VL models. See https://huggingface.co/collections/Qwen/qwen3-vl-68d2a7c1b8a8afce4ebd2dbe
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
# Full fine-tune config for Qwen3 VL 4B Instruct.
#
# Requirements:
#   - Log into WandB (`wandb login`) or disable `enable_wandb`
#
# Usage:
#   oumi train -c configs/recipes/vision/qwen3_vl/sft/4b_instruct_full_train.yaml
#
# See Also:
#   - Documentation: https://oumi.ai/docs/en/latest/user_guides/train/train.html
#   - Config class: oumi.core.configs.TrainingConfig
#   - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py
#   - Other training configs: configs/**/*train.yaml

model:
  model_name: "Qwen/Qwen3-VL-4B-Instruct"
  torch_dtype_str: "bfloat16"
  model_max_length: 4096
  trust_remote_code: True
  attn_implementation: "sdpa"
  chat_template: "qwen3-vl-instruct"
  freeze_layers:
    - "visual"

data:
  train:
    collator_name: "vision_language_with_padding"
    use_torchdata: True
    datasets:
      - dataset_name: "merve/vqav2-small"
        split: "validation"
        shuffle: True
        seed: 42
        transform_num_workers: "auto"
        dataset_kwargs:
          processor_name: "Qwen/Qwen3-VL-4B-Instruct"
          return_tensors: True
          # limit: 4096 # Uncomment to limit dataset size!
          # return_conversations: True

      # Below are examples of other vision SFT datasets
      # - dataset_name: "HuggingFaceH4/llava-instruct-mix-vsft"
      #   split: "train"
      #   shuffle: True
      #   seed: 42
      #   transform_num_workers: "auto"
      #   dataset_kwargs:
      #     processor_name: "Qwen/Qwen3-VL-4B-Instruct"
      #     return_tensors: True

training:
  output_dir: "output/vlm_finetuned"
  trainer_type: "TRL_SFT" # or "OUMI"
  enable_gradient_checkpointing: True
  per_device_train_batch_size: 1 # Must be 1: the model generates variable-sized image features.
  gradient_accumulation_steps: 32
  max_steps: 20 # Comment out and use `num_train_epochs` instead for full training.
  # num_train_epochs: 1
  use_peft: False

  gradient_checkpointing_kwargs:
    # Reentrant docs: https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
    use_reentrant: False
  max_grad_norm: 0.5 # For vqav2-small this results in more stable training.
  ddp_find_unused_parameters: False
  empty_device_cache_steps: 1
  compile: False

  optimizer: "adamw_torch_fused"
  learning_rate: 2e-5
  warmup_ratio: 0.03
  weight_decay: 0.0
  lr_scheduler_type: "cosine"

  logging_steps: 5
  save_steps: 0
  dataloader_main_process_only: False
  dataloader_num_workers: 2
  dataloader_prefetch_factor: 8
  include_performance_metrics: True
  log_model_summary: False
  enable_wandb: True
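The same recipe can also be launched from Python rather than the CLI. A minimal sketch, assuming oumi exposes `train()` and `TrainingConfig.from_yaml()` (which the `oumi train` command wraps) and that the config lives at the path shown in the Usage comment above; adjust both to your installed version and checkout.

from oumi import train
from oumi.core.configs import TrainingConfig

# Load the full fine-tune recipe. The path below is the one assumed in the
# Usage comment and may differ in your checkout.
config = TrainingConfig.from_yaml(
    "configs/recipes/vision/qwen3_vl/sft/4b_instruct_full_train.yaml"
)

# Optional overrides before launching, e.g. a longer run without WandB logging.
config.training.max_steps = 100
config.training.enable_wandb = False

train(config)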
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
# LoRA fine-tune config for Qwen3 VL 4B Instruct.
#
# Requirements:
#   - Log into WandB (`wandb login`) or disable `enable_wandb`
#
# Usage:
#   oumi train -c configs/recipes/vision/qwen3_vl/sft/4b_instruct_lora_train.yaml
#
# See Also:
#   - Documentation: https://oumi.ai/docs/en/latest/user_guides/train/train.html
#   - Config class: oumi.core.configs.TrainingConfig
#   - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py
#   - Other training configs: configs/**/*train.yaml

model:
  model_name: "Qwen/Qwen3-VL-4B-Instruct"
  torch_dtype_str: "bfloat16"
  model_max_length: 4096
  trust_remote_code: True
  # TODO: Enable flash attention
  attn_implementation: "sdpa"
  chat_template: "qwen3-vl-instruct"
  freeze_layers:
    - "visual"

data:
  train:
    collator_name: "vision_language_with_padding"
    use_torchdata: True
    datasets:
      - dataset_name: "merve/vqav2-small"
        split: "validation"
        shuffle: True
        seed: 42
        transform_num_workers: "auto"
        dataset_kwargs:
          processor_name: "Qwen/Qwen3-VL-4B-Instruct"
          return_tensors: True
          # limit: 4096 # Uncomment to limit dataset size!
          # return_conversations: True

      # Below are examples of other vision SFT datasets
      # - dataset_name: "HuggingFaceH4/llava-instruct-mix-vsft"
      #   split: "train"
      #   shuffle: True
      #   seed: 42
      #   transform_num_workers: "auto"
      #   dataset_kwargs:
      #     processor_name: "Qwen/Qwen3-VL-4B-Instruct"
      #     return_tensors: True

training:
  output_dir: "output/vlm_finetuned"
  trainer_type: "TRL_SFT" # or "OUMI"
  enable_gradient_checkpointing: True
  per_device_train_batch_size: 1 # Must be 1: the model generates variable-sized image features.
  gradient_accumulation_steps: 32
  max_steps: 20 # Comment out and use `num_train_epochs` instead for full training.
  # num_train_epochs: 1
  use_peft: True

  gradient_checkpointing_kwargs:
    # Reentrant docs: https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
    use_reentrant: False
  max_grad_norm: 0.5 # For vqav2-small this results in more stable training.
  ddp_find_unused_parameters: False
  empty_device_cache_steps: 1
  compile: False

  optimizer: "adamw_torch_fused"
  learning_rate: 2e-5
  warmup_ratio: 0.03
  weight_decay: 0.0
  lr_scheduler_type: "cosine"

  logging_steps: 5
  save_steps: 0
  dataloader_main_process_only: False
  dataloader_num_workers: 2
  dataloader_prefetch_factor: 8
  include_performance_metrics: True
  log_model_summary: False
  enable_wandb: True

peft:
  lora_r: 8
  lora_alpha: 16
  lora_dropout: 0.05
  lora_target_modules:
    - "q_proj"
    - "v_proj"
    - "o_proj"
    - "k_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

docs/resources/models/models.md

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ Available templates include:
 - `llava` - For LLaVA multimodal models
 - `phi3-instruct` - For Phi-3 instruction models
 - `qwen2-vl-instruct` - For Qwen2-VL instruction models
+- `qwen3-vl-instruct` - For Qwen3-VL instruction models
 - `zephyr` - For Zephyr models

 All the templates expect a `messages` list, where each message is a dictionary with `role` and `content` keys in {doc}`oumi format </resources/datasets/data_formats>`.

src/oumi/core/configs/internal/supported_models.py

Lines changed: 38 additions & 0 deletions
@@ -292,6 +292,38 @@ def _create_qwen2_5_vl_vlm_config() -> InternalModelConfig:
     return config


+def _create_qwen3_vl_vlm_config() -> InternalModelConfig:
+    config = _create_default_vlm_config(
+        pixel_values_variable_shape=True,
+        supports_multiple_images=True,
+    )
+    config.chat_template = "qwen3-vl-instruct"
+    # FIXME OPE-946 Consider updating to "right":
+    # config.padding_side = InternalPaddingSide.PAD_RIGHT
+    config.model_input_features.update(
+        {
+            feature_name: InternalFeatureSpec(
+                name=feature_name,
+                required=True,
+                variable_shape=False,
+                image_dependent=True,
+            )
+            for feature_name in ("image_grid_thw",)
+        }
+    )
+    config.processor_kwargs.update(
+        # Defaults per Qwen3-VL:
+        # https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
+        {
+            "min_pixels": 4 * 28 * 28,
+            "max_pixels": 16384 * 28 * 28,
+            "patch_size": 16,
+        }
+    )
+
+    return config
+
+
 def _create_phi3_vlm_config() -> InternalModelConfig:
     config = _create_default_vlm_config(
         pixel_values_variable_shape=True,
@@ -531,6 +563,12 @@ def get_all_models_map() -> Mapping[
         tested=True,
         config=_create_qwen2_5_vl_vlm_config(),
     ),
+    _ModelTypeInfo(
+        model_type="qwen3_vl",
+        model_class=default_vlm_class,
+        tested=True,
+        config=_create_qwen3_vl_vlm_config(),
+    ),
     _ModelTypeInfo(
         model_type="vipllava",
         model_class=default_vlm_class,
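The `processor_kwargs` registered here mirror the pixel budget used by qwen-vl-utils. A small sketch of what they mean in practice, assuming the Qwen3-VL processor behaves like Qwen2/2.5-VL (min/max pixels bound the resized image area, and an `image_grid_thw` tensor is returned alongside `pixel_values`):

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct",
    min_pixels=4 * 28 * 28,
    max_pixels=16384 * 28 * 28,
)

image = Image.new("RGB", (1024, 768))  # stand-in for a real image
text = "<|vision_start|><|image_pad|><|vision_end|>Describe the image."
inputs = processor(text=[text], images=[image], return_tensors="pt")

# `image_grid_thw` carries the (temporal, height, width) patch grid per image,
# which is why it is registered above as a required, image-dependent feature.
print(inputs["pixel_values"].shape, inputs["image_grid_thw"])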
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
{% set image_count = namespace(value=0) %}
{% set video_count = namespace(value=0) %}

{%- for message in messages -%}
    {%- if loop.first and message['role'] != 'system' -%}
        {{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
    {%- endif -%}

    {{ '<|im_start|>' + message['role'] + '\n' }}

    {%- if message['content'] is string -%}
        {{- message['content'] -}}
    {%- elif message['content'] is iterable -%}
        {%- for item in message['content'] -%}
            {%- if item['type'].startswith('image') -%}
                {%- set image_count.value = image_count.value + 1 -%}
                {%- if add_vision_id -%}
                    {{ 'Picture ' ~ image_count.value ~ ': ' }}
                {%- endif -%}
                {{ '<|vision_start|><|image_pad|><|vision_end|>' }}
            {%- elif item['type'].startswith('video') -%}
                {%- set video_count.value = video_count.value + 1 -%}
                {%- if add_vision_id -%}
                    {{ 'Video ' ~ video_count.value ~ ': ' }}
                {%- endif -%}
                {{ '<|vision_start|><|video_pad|><|vision_end|>' }}
            {%- elif item['type'] == 'text' -%}
                {{- item['text'] if 'text' in item else item['content'] -}}
            {%- endif -%}
        {%- endfor -%}
    {%- endif -%}
    {{ '<|im_end|>\n' }}
{%- endfor -%}

{%- if add_generation_prompt -%}
    {{- '<|im_start|>assistant\n' -}}
{%- endif -%}
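A quick way to see what this chat template produces is to render it directly with Jinja2 against an oumi-style `messages` list. A sketch; the template path below is an assumption, since the diff does not show the file name:

from jinja2 import Environment

# Hypothetical path; point this at the template file in your checkout.
with open("src/oumi/datasets/chat_templates/qwen3-vl-instruct.jinja") as f:
    template = Environment().from_string(f.read())

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": "https://example.com/cat.png"},
            {"type": "text", "text": "What is in this image?"},
        ],
    }
]

rendered = template.render(
    messages=messages, add_generation_prompt=True, add_vision_id=False
)
print(rendered)
# Expected shape of the output: a default system turn, then the user turn with
# <|vision_start|><|image_pad|><|vision_end|> in place of the image, followed
# by an opening <|im_start|>assistant tag.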
