# 1. Install transformers 4.51.1 FIRST (critical!)
!pip install transformers==4.51.1 -q
# 2. Install torch 2.6.0
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118 -q
# 3. Install vLLM nightly (which now includes DeepSeek-OCR support as of 2025/10/23)
!pip install --pre vllm --extra-index-url https://wheels.vllm.ai/nightly -q
# 4. Pillow
!pip install pillow -q
# Optional: flash-attn (skip if the build fails). A failed `!pip` call does not
# raise a Python exception, so try/except cannot catch it; fall back at the shell level instead.
!pip install flash-attn==2.7.3 --no-build-isolation -q || echo "flash-attn skipped"
from vllm import LLM, SamplingParams
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
from PIL import Image
# Load the model (this may take 2-5 minutes on Kaggle due to download + GPU setup)
llm = LLM(
    model="deepseek-ai/DeepSeek-OCR",
    enable_prefix_caching=False,
    mm_processor_cache_gb=0,
    logits_processors=[NGramPerReqLogitsProcessor],
    max_model_len=8192,   # required for long OCR outputs
    trust_remote_code=True,
    dtype="float16",      # Kaggle's T4/P100 lack native bfloat16; use "bfloat16" on Ampere+ GPUs
)
# Load your image
image = Image.open("/kaggle/working/test_image.jpg").convert("RGB")
# Set prompt (choose one based on your need)
prompt = "<image>\n<|grounding|>Convert the document to markdown."
model_input = [{
    "prompt": prompt,
    "multi_modal_data": {"image": image},
}]
sampling_params = SamplingParams(
    temperature=0.0,   # deterministic output
    max_tokens=8192,
    extra_args=dict(
        ngram_size=30,
        window_size=90,
        whitelist_token_ids={128821, 128822},  # <td>, </td> tokens
    ),
    skip_special_tokens=False,
)
# Generate
outputs = llm.generate(model_input, sampling_params)
# Print result
print("\n" + "="*50 + "\nOCR OUTPUT:\n" + "="*50)
print(outputs[0].outputs[0].text)
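# Optional: persist the result so it survives the notebook session. A minimal sketch;
# the output path under /kaggle/working is an arbitrary choice.
output_path = "/kaggle/working/ocr_output.md"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(outputs[0].outputs[0].text)
print(f"Saved OCR output to {output_path}")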