System Info
Google Colab, 80GB A100 GPU (Linux)
transformers 4.57.0.dev0
bitsandbytes 0.48.1
Reproduction
I hit something very strange while finetuning Qwen3-VL-30B-A3B on Google Colab: loading the 4-bit quantized version raises a VRAM OOM error, while the plain bf16 version trains fine.
Adding quantization_config=bnb_config to the from_pretrained call below is the only change needed to reproduce the OOM.
import torch
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    BitsAndBytesConfig,
)
from qwen_vl_utils import process_vision_info

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# bf16 load: this configuration trains without issue
model = AutoModelForImageTextToText.from_pretrained(
    "Qwen/Qwen3-VL-30B-A3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
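The failing variant is the identical call with the quantization config passed in:

# 4-bit load: same call, but with quantization_config supplied;
# on this setup it raises a CUDA OOM instead of using less VRAM
model = AutoModelForImageTextToText.from_pretrained(
    "Qwen/Qwen3-VL-30B-A3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)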
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.0,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)
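As a quick sanity check (illustrative only, not part of the original run, since SFTTrainer applies peft_config itself), wrapping the model and printing the trainable parameter count should show that only a tiny LoRA fraction of the 30B weights is being trained:

# Illustrative check: confirm only the LoRA adapters on the
# q/k/v/o projections (r=16) are trainable, not the base weights.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()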
from trl import SFTConfig, SFTTrainer

training_args = SFTConfig(
    output_dir="qwen_anti_aesthetics_3b",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    max_length=None,
    optim="adamw_torch_fused",
    learning_rate=2e-5,
    weight_decay=0.001,
    logging_steps=10,
    eval_steps=500,
    logging_strategy="steps",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    bf16=True,
    warmup_ratio=0.02,
    push_to_hub=True,
    report_to="wandb",
    remove_unused_columns=False,
    dataloader_num_workers=12,
    dataloader_prefetch_factor=4,
    dataloader_pin_memory=True,
    completion_only_loss=True,
    lr_scheduler_type="cosine",
)
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)
trainer.train()

Expected behavior
The 4-bit quantized model should use less VRAM than the bf16 model, or at least not more.
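One way to quantify the regression (a sketch, assuming a fresh runtime per configuration so allocator state does not carry over between runs) is to compare the peak allocated memory after each load:

import torch

# Run once with the bf16 load and once with quantization_config=bnb_config,
# restarting the runtime in between, then compare the two peaks.
torch.cuda.reset_peak_memory_stats()
# ... model = AutoModelForImageTextToText.from_pretrained(...) ...
print(f"peak allocated: {torch.cuda.max_memory_allocated() / 2**30:.1f} GiB")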