
Incremental Prefilling error #131

@UchihaIchigo

Description


I have a server with three RTX 4090s (72 GB of GPU memory in total) and I want to run the incremental prefilling example across multiple cards on this server. The official documentation says a 40 GB GPU is enough for deepseek-vl2-small, but I have tried many times and it always reports that a single card does not have enough memory. How can I split the model across the GPUs? Here is the code I am running:

import torch
from transformers import AutoModelForCausalLM

from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images

# specify the path to the model

model_path = "deepseek-ai/deepseek-vl2-small"
vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()
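
What I would like to do instead of the single-GPU `.cuda()` call above is shard the model across the three 4090s. Below is a sketch of what I imagine, based on the generic transformers/accelerate `device_map="auto"` path; this is only a guess, and I have not been able to confirm that the DeepSeek-VL2 remote code supports being split this way:

# alternative loading (sketch, assumes `pip install accelerate` and that the
# remote code tolerates automatic sharding across GPUs)
vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # place layers on cuda:0/1/2 automatically
).eval()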

# multiple images/interleaved image-text

conversation = [
    {
        "role": "<|User|>",
        "content": "This is image_1: <image>\n"
                   "This is image_2: <image>\n"
                   "This is image_3: <image>\n Can you tell me what are in the images?",
        "images": [
            "images/multi_image_1.jpeg",
            "images/multi_image_2.jpeg",
            "images/multi_image_3.jpeg",
        ],
    },
    {"role": "<|Assistant|>", "content": ""}
]

# load images and prepare for inputs

pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation,
    images=pil_images,
    force_batchify=True,
    system_prompt=""
).to(vl_gpt.device)
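# note (assumption): if the model were sharded with device_map='auto', vl_gpt.device
# should point at the first shard, and accelerate's hooks would move activations
# between GPUs during the forward pass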

with torch.no_grad():
    # run image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # incremental_prefilling when using 40G GPU for vl2-small
    inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
        input_ids=prepare_inputs.input_ids,
        images=prepare_inputs.images,
        images_seq_mask=prepare_inputs.images_seq_mask,
        images_spatial_crop=prepare_inputs.images_spatial_crop,
        attention_mask=prepare_inputs.attention_mask,
        chunk_size=512  # prefilling size
    )
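    # note (assumption): a smaller chunk_size (e.g. 256) should lower peak
    # activation memory during prefilling at the cost of more forward passes;
    # the KV cache accumulated in past_key_values is unaffected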

    # run the model to get the response
    outputs = vl_gpt.generate(
        inputs_embeds=inputs_embeds,
        input_ids=prepare_inputs.input_ids,
        images=prepare_inputs.images,
        images_seq_mask=prepare_inputs.images_seq_mask,
        images_spatial_crop=prepare_inputs.images_spatial_crop,
        attention_mask=prepare_inputs.attention_mask,
        past_key_values=past_key_values,

        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,

        do_sample=False,
        use_cache=True,
    )

answer = tokenizer.decode(outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(), skip_special_tokens=False)

print(f"{prepare_inputs['sft_format'][0]}", answer)
