Skip to content

ImportError: cannot import name 'GenerationMixin' from 'transformers.generation' (/usr/local/lib/python3.11/dist-packages/transformers/generation/__init__.py) #237

@omanandswami2005

Description

@omanandswami2005
# 1. Install transformers 4.51.1 FIRST (critical!)
!pip install transformers==4.51.1 -q

# 2. Install torch 2.6.0
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118 -q

# 3. Install vLLM nightly (which now includes DeepSeek-OCR support as of 2025/10/23)
!pip install --pre vllm --extra-index-url https://wheels.vllm.ai/nightly -q

# 4. Pillow
!pip install pillow -q

# Optional: flash-attn (skip if fails)
try:
    !pip install flash-attn==2.7.3 --no-build-isolation -q
except:
    print("flash-attn skipped")

from vllm import LLM, SamplingParams
from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor
from PIL import Image

# Load model (this may take 2-5 mins on Kaggle due to download + GPU setup)
llm = LLM(
    model="deepseek-ai/DeepSeek-OCR",
    enable_prefix_caching=False,
    mm_processor_cache_gb=0,
    logits_processors=[NGramPerReqLogitsProcessor],
    max_model_len=8192,  # required for long OCR outputs
    trust_remote_code=True,
    # Half precision reduces VRAM usage. The Kaggle T4 has compute capability
    # 7.5, and vLLM only accepts bfloat16 on GPUs with compute capability
    # >= 8.0, so float16 is used here. On Ampere or newer GPUs "bfloat16"
    # can be used instead.
    dtype="float16",
)

# Open the input image and normalise it to 3-channel RGB
image = Image.open("/kaggle/working/test_image.jpg")
image = image.convert("RGB")

# Prompt sent alongside the image (swap in a different one if needed)
prompt = "<image>\n<|grounding|>Convert the document to markdown."

# A single request: the text prompt plus the image attached to it
model_input = [dict(prompt=prompt, multi_modal_data=dict(image=image))]

# Sampling configuration; extra_args are passed through for the n-gram
# logits processor registered on the LLM
sampling_params = SamplingParams(
    temperature=0.0,  # deterministic output
    max_tokens=8192,
    skip_special_tokens=False,
    extra_args={
        "ngram_size": 30,
        "window_size": 90,
        "whitelist_token_ids": {128821, 128822},  # <td>, </td> tokens
    },
)

# Run generation for the single request
outputs = llm.generate(model_input, sampling_params)

# Show the text of the first completion of the first request under a banner
separator = "=" * 50
print("\n" + separator + "\nOCR OUTPUT:\n" + separator)
first_completion = outputs[0].outputs[0]
print(first_completion.text)

I followed the steps exactly as described in the documentation, but it is still not working. I was testing on Kaggle, by the way.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions